Whamcloud - gitweb
land 0.5.20.3 b_devel onto HEAD (b_devel will remain)
author pschwan <pschwan>
Sun, 2 Mar 2003 05:23:24 +0000 (05:23 +0000)
committer pschwan <pschwan>
Sun, 2 Mar 2003 05:23:24 +0000 (05:23 +0000)
163 files changed:
lustre/ChangeLog
lustre/Makefile.am
lustre/Rules
lustre/archdep.m4
lustre/autogen.sh
lustre/cobd/cache_obd.c
lustre/cobd/lproc_cache.c
lustre/conf/lustre.dtd
lustre/conf/lustre2ldif.xsl
lustre/conf/top.ldif
lustre/configure.in
lustre/extN/Makefile.am
lustre/extN/ext3-2.4-ino_t.diff [new file with mode: 0644]
lustre/extN/ext3-2.4.18-ino_sb_macro.diff
lustre/extN/ext3-largefile.diff [new file with mode: 0644]
lustre/extN/extN-san.diff [new file with mode: 0644]
lustre/include/liblustre.h [new file with mode: 0644]
lustre/include/linux/lprocfs_status.h
lustre/include/linux/lustre_dlm.h
lustre/include/linux/lustre_export.h
lustre/include/linux/lustre_fsfilt.h
lustre/include/linux/lustre_handles.h [new file with mode: 0644]
lustre/include/linux/lustre_idl.h
lustre/include/linux/lustre_import.h
lustre/include/linux/lustre_lib.h
lustre/include/linux/lustre_lite.h
lustre/include/linux/lustre_mds.h
lustre/include/linux/lustre_net.h
lustre/include/linux/obd.h
lustre/include/linux/obd_class.h
lustre/include/linux/obd_filter.h
lustre/include/linux/obd_lov.h
lustre/include/linux/obd_ost.h
lustre/include/linux/obd_support.h
lustre/kernel_patches/patches/iod-rmap-exports.patch
lustre/kernel_patches/patches/lustre-2.5.patch
lustre/kernel_patches/patches/lustre_version.patch
lustre/kernel_patches/patches/vfs_intent-2.4.18-18.patch
lustre/kernel_patches/patches/vfs_intent_hp.patch
lustre/kernel_patches/pc/lustre-2.5.pc [new file with mode: 0644]
lustre/kernel_patches/pc/vfs_intent-2.4.18-18.pc
lustre/kernel_patches/series/vanilla-2.5 [new file with mode: 0644]
lustre/kernel_patches/which_patch
lustre/ldlm/Makefile.am
lustre/ldlm/l_lock.c
lustre/ldlm/ldlm_extent.c
lustre/ldlm/ldlm_lock.c
lustre/ldlm/ldlm_lockd.c
lustre/ldlm/ldlm_request.c
lustre/ldlm/ldlm_resource.c
lustre/lib/client.c
lustre/lib/mds_updates.c
lustre/lib/obd_pack.c
lustre/lib/simple.c
lustre/lib/target.c
lustre/liblustre/.cvsignore [new file with mode: 0644]
lustre/liblustre/Makefile.am [new file with mode: 0644]
lustre/liblustre/libtest.c [new file with mode: 0644]
lustre/llite/commit_callback.c
lustre/llite/dcache.c
lustre/llite/dir.c
lustre/llite/file.c
lustre/llite/lproc_llite.c
lustre/llite/namei.c
lustre/llite/rw.c
lustre/llite/super.c
lustre/llite/super25.c
lustre/llite/symlink.c
lustre/llite/sysctl.c
lustre/lov/Makefile.am
lustre/lov/lov_obd.c
lustre/lov/lov_pack.c
lustre/lov/lproc_lov.c
lustre/mdc/lproc_mdc.c
lustre/mdc/mdc_reint.c
lustre/mdc/mdc_request.c
lustre/mds/handler.c
lustre/mds/lproc_mds.c
lustre/mds/mds_fs.c
lustre/mds/mds_lov.c
lustre/mds/mds_open.c
lustre/mds/mds_reint.c
lustre/obdclass/Makefile.am
lustre/obdclass/class_obd.c
lustre/obdclass/debug.c
lustre/obdclass/fsfilt_ext3.c
lustre/obdclass/fsfilt_extN.c
lustre/obdclass/fsfilt_reiserfs.c
lustre/obdclass/genops.c
lustre/obdclass/lprocfs_status.c
lustre/obdclass/lustre_handles.c [new file with mode: 0644]
lustre/obdclass/lustre_peer.c [new file with mode: 0644]
lustre/obdclass/statfs_pack.c
lustre/obdclass/sysctl.c
lustre/obdclass/uuid.c
lustre/obdecho/Makefile.am
lustre/obdecho/echo.c
lustre/obdecho/echo_client.c
lustre/obdecho/lproc_echo.c
lustre/obdfilter/Makefile.am
lustre/obdfilter/filter.c
lustre/obdfilter/lproc_obdfilter.c
lustre/osc/Makefile.am
lustre/osc/lproc_osc.c
lustre/osc/osc_request.c
lustre/ost/ost_handler.c
lustre/ptlbd/client.c
lustre/ptlbd/rpc.c
lustre/ptlbd/server.c
lustre/ptlrpc/Makefile.am
lustre/ptlrpc/client.c
lustre/ptlrpc/connection.c
lustre/ptlrpc/events.c
lustre/ptlrpc/niobuf.c
lustre/ptlrpc/pack_generic.c
lustre/ptlrpc/recovd.c
lustre/ptlrpc/recover.c
lustre/ptlrpc/rpc.c
lustre/ptlrpc/service.c
lustre/scripts/lustre.spec.in
lustre/tests/.cvsignore
lustre/tests/Makefile.am
lustre/tests/acceptance-metadata-double.sh [new file with mode: 0644]
lustre/tests/acceptance-metadata-single.sh
lustre/tests/acceptance-small.sh
lustre/tests/ba-echo.sh
lustre/tests/ba-mount.sh
lustre/tests/compile.sh [new file with mode: 0644]
lustre/tests/directio.c
lustre/tests/lkcdmap
lustre/tests/llmount.sh
lustre/tests/local.sh
lustre/tests/mcr-individual-ost-nogw-config.sh
lustre/tests/mcr-mds-failover-config.sh
lustre/tests/mcr-routed-config.sh
lustre/tests/mcr.sh
lustre/tests/mcrlov.sh
lustre/tests/mlink.c [new file with mode: 0755]
lustre/tests/mount2.sh
lustre/tests/open_delay.c
lustre/tests/recovery-cleanup.sh [new file with mode: 0755]
lustre/tests/recovery-small.sh
lustre/tests/runiozone
lustre/tests/runtests
lustre/tests/sanity.sh
lustre/tests/statmany.c
lustre/tests/statone.c [new file with mode: 0644]
lustre/tests/tchmod.c
lustre/tests/testreq.c
lustre/tests/uml.sh
lustre/tests/wantedi.c
lustre/utils/.cvsignore
lustre/utils/Makefile.am
lustre/utils/lconf.in
lustre/utils/lctl.c
lustre/utils/lfind.c
lustre/utils/lmc
lustre/utils/obd.c
lustre/utils/obdbarrier.c
lustre/utils/obdctl.h
lustre/utils/obdio.c
lustre/utils/obdiolib.c
lustre/utils/obdstat.c

index 05209b5..61193c7 100644 (file)
@@ -1,8 +1,25 @@
 TBD
-       * version v0_5_21
-       * bug fixes
-        - workaround for gcc 3.2, which has macro-argument issues (850)
-        - lmc/lconf syntax change for OST UUIDs
+       * version v0_5_21
+       * bug fixes
+       - LDLM_DEBUG macro fix, for gcc 3.2 (850)
+       - failed open()s could cause deadlock; fixed (867, 869)
+       - stop cancelling OST locks when files are closed (481)
+       - overlapping XID spaces caused network corruption (851, 853)
+       - fix unsafe fsfilt counter arithmetic; change to atomic_t
+       - setattr_raw added, to do single-RPC, server-side setattrs
+       - lmc/lconf syntax change for OST UUIDs
+       - fix crashy race condition between ptlrpc_free_req and osc_close
+       - don't use request in mdc_enqueue if we hit a timeout (889)
+       - don't set the inode i_size for regular files from the MDS (896)
+       - handle out of order completion AST (842)
+       - don't LBUG if a lock request times out after receiving AST (913)
+       - avoid d_rehash race in ll_find_alias by rehashing inside dcache_lock
+       - if a bad lock AST arrives, send an error instead of dropping entirely
+       - return 0 from revalidate2 if ll_intent_lock returns -EINTR (912)
+       - fix leak in bulk IO when only partially completed (899, 900, 926)
+       * protocol changes
+       - READPAGE and SETATTRs which don't take server-side locks get
+         their own portal
 
 2003-02-11  Phil Schwan  <phil@clusterfs.com>
        * version v0_5_20
@@ -31,7 +48,7 @@ TBD
         - client verifies file size before zeroing page past EOF (445)
         - OST now writes last allocated objid to disk with allocation (108)
         - LOV on echo now works (409)
-        * protocol changes
+       * protocol changes
         - mds_reint_unlink sends a new buffer, with the EA included.  this
           buffer is only valid if body->valid & OBD_MD_FLEASIZE, which is only
           set if a regular file was being unlinked, and it was the last link
index 3edac96..f16f126 100644 (file)
@@ -6,14 +6,20 @@
 AUTOMAKE_OPTIONS = foreign
 
 if LINUX25
-DIRS24 = mds
+DIRS24 = 
 else
-DIRS24 = extN mds
+DIRS24 = extN ptlbd
 endif
 
-# NOTE: keep extN before mds and obdfilter
-SUBDIRS = $(DIRS24) obdclass utils ptlrpc ldlm lib obdfilter mdc osc ost llite
-SUBDIRS+= obdecho lov cobd ptlbd tests doc scripts conf
+if LIBLUSTRE
+SUBDIRS = lov obdclass ptlrpc obdecho ldlm osc liblustre utils
+else
+# NOTE: keep extN before obdclass, mds, and obdfilter.  Keep obdclass as early
+# as possible, to have the best chance at stopping with "wrong kernel version"
+# instead of some related build failure.
+SUBDIRS = $(DIRS24) obdclass mds utils ptlrpc ldlm lib obdfilter mdc osc ost
+SUBDIRS+= llite obdecho lov cobd tests doc scripts conf
+endif
 
 DIST_SUBDIRS = $(SUBDIRS)
 EXTRA_DIST = BUGS FDL Rules include archdep.m4 kernel_patches
index 069e89a..d4e5ed7 100644 (file)
@@ -10,6 +10,7 @@
 #  name_SOURCES = my.c files.c
 #  include $(top_srcdir)/Rules
 
+
 $(MODULE).o: $($(MODULE)_OBJECTS)
        $(LD) -m "`$(LD) --help | awk '/supported emulations/ {print $$4}'`" -r -o $(MODULE).o $($(MODULE)_OBJECTS)
 
index 58a6576..2bdd785 100644 (file)
@@ -1,4 +1,10 @@
+AC_ARG_WITH(lib, [  --with-lib compile lustre library], host_cpu="lib")
+
 AC_MSG_CHECKING(if you are running user mode linux for $host_cpu ...)
+if test $host_cpu = "lib" ; then 
+        host_cpu="lib"
+       AC_MSG_RESULT(no building Lustre library)
+else
 if test -e $LINUX/include/asm-um ; then
 if test  X`ls -id $LINUX/include/asm/ | awk '{print $1}'` = X`ls -id $LINUX/include/asm-um | awk '{print $1}'` ; then
        host_cpu="um";
@@ -10,19 +16,41 @@ fi
 else 
         AC_MSG_RESULT(no (asm-um missing))
 fi
+fi
 
 AC_MSG_CHECKING(setting make flags system architecture: )
 case ${host_cpu} in
+       lib )
+       AC_MSG_RESULT($host_cpu)
+       KCFLAGS='-g -Wall '
+       KCPPFLAGS='-D__arch_lib__ '
+        MOD_LINK=elf_i386
+;;
        um )
        AC_MSG_RESULT($host_cpu)
        KCFLAGS='-g -Wall -pipe -Wno-trigraphs -Wstrict-prototypes -fno-strict-aliasing -fno-common '
-       KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE  -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/include '
+        case ${linux25} in
+                yes )
+                KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE  -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/include -I$(LINUX)/arch/um/kernel/tt/include -O2 -nostdinc -iwithprefix include -DKBUILD_BASENAME=$(MODULE) -DKBUILD_MODNAME=$(MODULE) '
+        ;;
+                * )
+               KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE  -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/include '
+       ;;
+       esac
+
         MOD_LINK=elf_i386
 ;;
        i*86 )
        AC_MSG_RESULT($host_cpu)
         KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -pipe'
-        KCPPFLAGS='-D__KERNEL__ -DMODULE '
+        case ${linux25} in
+                yes )
+               KCPPFLAGS='-D__KERNEL__ -DMODULE -march=i686 -I$(LINUX)/include/asm-i386/mach-default -nostdinc -iwithprefix include ' 
+        ;;
+                * )
+               KCPPFLAGS='-D__KERNEL__ -DMODULE '
+       ;;
+       esac
         MOD_LINK=elf_i386
 ;;
 
@@ -74,6 +102,7 @@ case ${host_cpu} in
 ;;
 esac
 
+if test $host_cpu != lib ; then 
 AC_MSG_CHECKING(for MODVERSIONS)
 if egrep -e 'MODVERSIONS.*1' $LINUX/include/linux/autoconf.h >/dev/null 2>&1;
 then
@@ -92,6 +121,7 @@ else
        SMPFLAG=
        AC_MSG_RESULT(no)
 fi
+fi
 
 CFLAGS="$KCFLAGS $MFLAGS"
 ARCHCPPFLAGS="$KCPPFLAGS"
index 9accad4..087ff09 100644 (file)
@@ -2,5 +2,5 @@
 
 find . -type d -name .deps | xargs rm -rf
 aclocal &&
-automake --add-missing &&
+${AUTOMAKE:-automake} --add-missing &&
 ${AUTOCONF:-autoconf}
index 72a05cc..67b4e62 100644 (file)
 
 #define DEBUG_SUBSYSTEM S_COBD
 
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+#include <linux/init.h>
+#endif
 #include <linux/obd_support.h>
 #include <linux/lustre_lib.h>
 #include <linux/lustre_net.h>
index 5170829..7e5c267 100644 (file)
@@ -33,16 +33,20 @@ static int rd_target(char *page, char **start, off_t off, int count,
                      int *eof, void *data)
 {
         struct obd_device    *dev = (struct obd_device*)data;
-       struct lustre_handle *conn = &dev->u.cobd.cobd_target;
+       struct lustre_handle *conn;
        struct obd_export    *exp;
        int    rc;
 
+        LASSERT(dev != NULL);
+        conn = &dev->u.cobd.cobd_target;
+
        if ((dev->obd_flags & OBD_SET_UP) == 0)
                rc = snprintf (page, count, "not set up\n");
        else {
                exp = class_conn2export (conn);
                LASSERT(exp != NULL);
-               rc = snprintf(page, count, "%s\n", exp->exp_obd->obd_uuid.uuid);
+               rc = snprintf(page, count, "%s\n", 
+                              exp->exp_obd->obd_uuid.uuid);
        }
        return (rc);
 }
@@ -51,16 +55,20 @@ static int rd_cache(char *page, char **start, off_t off, int count,
                     int *eof, void *data)
 {
         struct obd_device    *dev = (struct obd_device*)data;
-       struct lustre_handle *conn = &dev->u.cobd.cobd_cache;
+       struct lustre_handle *conn;
        struct obd_export    *exp;
        int    rc;
 
+        LASSERT(dev != NULL);
+        conn = &dev->u.cobd.cobd_cache;
+
        if ((dev->obd_flags & OBD_SET_UP) == 0)
                rc = snprintf (page, count, "not set up\n");
        else {
                exp = class_conn2export (conn);
                LASSERT (exp != NULL);
-               rc = snprintf(page, count, "%s\n", exp->exp_obd->obd_uuid.uuid);
+               rc = snprintf(page, count, "%s\n", 
+                              exp->exp_obd->obd_uuid.uuid);
        }
        return (rc);
 }
index 73f7c95..8d575a6 100644 (file)
 <!ENTITY % objref.attr    "uuidref CDATA #REQUIRED">
 
 <!-- main elements -->
-<!ELEMENT lustre (node | profile | mountpoint | ldlm | echoclient |
-                  mds | obd | ost | lov | lovconfig)*>
+<!ELEMENT lustre (node | profile | mountpoint | ldlm | ptlrpc |echoclient |
+                  mds | mdsdev| ost | osd | lov | lovconfig)*>
 
-<!ELEMENT node (network | profile_ref)*>
+<!ELEMENT node (network | routetbl | profile_ref)*>
 <!ATTLIST node %object.attr;
                router CDATA #IMPLIED>
                
@@ -29,9 +29,9 @@
                 lo CDATA #REQUIRED
                 hi CDATA #IMPLIED >
 
-<!ELEMENT profile (ldlm_ref | network_ref | obd_ref | ost_ref |
-                   echoclient_ref | mdsdev_ref | lov_ref |
-                   lovconfig_ref| mountpoint_ref)*>
+<!ELEMENT profile (ldlm_ref | ptlrpc_ref | network_ref | routetbl_ref |
+                   osd_ref | mdsdev_ref | lovconfig_ref|
+                   echoclient_ref | mountpoint_ref)*>
 <!ATTLIST profile %object.attr;>
 
 <!ELEMENT mountpoint (path | fileset | mds_ref | obd_ref)*>
 <!ELEMENT ldlm EMPTY>
 <!ATTLIST ldlm %object.attr;>
 
-<!ELEMENT obd (fstype | devpath | devsize | autoformat | active_ref)*>
-<!ATTLIST obd %object.attr; 
-              obdtype (obdfilter | obdecho) 'obdfilter'>
+<!ELEMENT ptlrpc EMPTY>
+<!ATTLIST ptlrpc %object.attr;>
 
-<!ELEMENT ost (network_ref | obd_ref | failover_ref)*>
+<!ELEMENT osd (fstype | devpath | devsize | autoformat | 
+               target_ref | node_ref)*>
+<!ATTLIST osd %object.attr; 
+              osdtype (obdfilter | obdecho) 'obdfilter'>
+
+<!ELEMENT ost (active_ref)*>
 <!ATTLIST ost %object.attr;>
 
-<!ELEMENT mds (active_ref)*>
+<!ELEMENT mds (active_ref | lovconfig_ref)*>
 <!ATTLIST mds %object.attr;>
 
 <!ELEMENT mdsdev (fstype | devpath | devsize | autoformat | 
-                  mds_ref | network_ref )*>
+                  target_ref | node_ref )*>
 <!ATTLIST mdsdev %object.attr;>
 
 <!ELEMENT lov (mds_ref |(obd_ref)+)*>
@@ -71,8 +75,8 @@
 <!ELEMENT fstype        %object.content;>
 <!ELEMENT nid           %object.content;>
 <!ELEMENT port          %object.content;>
-<!ELEMENT send_mem      %object.content;>
-<!ELEMENT recv_mem      %object.content;>
+<!ELEMENT sendmem      %object.content;>
+<!ELEMENT recvmem      %object.content;>
 <!ELEMENT autoformat    %object.content;>
 <!ELEMENT activetarget  %object.content;>
 <!ELEMENT devpath       %object.content;>
 <!-- object reference tag elements -->
 <!ELEMENT network_ref     %objref.content;>
 <!ATTLIST network_ref     %objref.attr;>
+<!ELEMENT routetbl_ref    %objref.content;>
+<!ATTLIST routetbl_ref    %objref.attr;>
 <!ELEMENT node_ref        %objref.content;>
 <!ATTLIST node_ref        %objref.attr;>
 <!ELEMENT profile_ref     %objref.content;>
 <!ATTLIST profile_ref     %objref.attr;>
-<!ELEMENT obd_ref         %objref.content;>
-<!ATTLIST obd_ref         %objref.attr;>
+<!ELEMENT osd_ref         %objref.content;>
+<!ATTLIST osd_ref         %objref.attr;>
 <!ELEMENT mds_ref         %objref.content;>
 <!ATTLIST mds_ref         %objref.attr;>
 <!ELEMENT mdsdev_ref      %objref.content;>
 <!ATTLIST mdsdev_ref      %objref.attr;>
+<!ELEMENT obd_ref         %objref.content;>
+<!ATTLIST obd_ref         %objref.attr;>
 <!ELEMENT ost_ref         %objref.content;>
 <!ATTLIST ost_ref         %objref.attr;>
+<!ELEMENT active_ref         %objref.content;>
+<!ATTLIST active_ref         %objref.attr;>
+<!ELEMENT target_ref         %objref.content;>
+<!ATTLIST target_ref         %objref.attr;>
 <!ELEMENT lov_ref         %objref.content;>
 <!ATTLIST lov_ref         %objref.attr;>
 <!ELEMENT lovconfig_ref   %objref.content;>
 <!ATTLIST lovconfig_ref   %objref.attr;>
 <!ELEMENT mountpoint_ref  %objref.content;>
 <!ATTLIST mountpoint_ref  %objref.attr;>
-<!ELEMENT echoclient_ref %objref.content;>
-<!ATTLIST echoclient_ref %objref.attr;>
+<!ELEMENT echoclient_ref  %objref.content;>
+<!ATTLIST echoclient_ref  %objref.attr;>
 <!ELEMENT failover_ref    %objref.content;>
 <!ATTLIST failover_ref    %objref.attr;>
 <!ELEMENT ldlm_ref        %objref.content;>
 <!ATTLIST ldlm_ref        %objref.attr;>
+<!ELEMENT ptlrpc_ref      %objref.content;>
+<!ATTLIST ptlrpc_ref      %objref.attr;>
 
 
index c7ea957..f3c1364 100644 (file)
@@ -76,8 +76,8 @@ devpath: <value-of select="devpath"/>
 <if test="devsize">
 devsize: <value-of select="devsize"/>
 </if>
-networkRef: <value-of select="network_ref/@uuidref"/>
-mdsRef: <value-of select="mds_ref/@uuidref"/>
+nodeRef: <value-of select="node_ref/@uuidref"/>
+targetRef: <value-of select="target_ref/@uuidref"/>
 <text>
 </text>
 </template>
@@ -104,13 +104,14 @@ uuid: <value-of select="@uuid"/><apply-templates/>
 </text>
 </template>
 
-<template match="obd">
+<template match="osd">
 dn: uuid=<value-of select="@uuid"/>,<value-of select="$basedn"/>
-objectClass: OBD
+objectClass: OSD
 lustreName: <value-of select="@name"/>
 uuid: <value-of select="@uuid"/>
-activeRef: <value-of select="active_ref/@uuidref"/>
-obdtype: <value-of select="@obdtype"/>
+nodeRef: <value-of select="node_ref/@uuidref"/>
+targetRef: <value-of select="target_ref/@uuidref"/>
+osdtype: <value-of select="@osdtype"/>
 <if test="fstype">
 fstype: <value-of select="fstype"/>
 </if>
@@ -163,15 +164,31 @@ uuid: <value-of select="@uuid"/>
 </text>
 </template>
 
+<template match="ptlrpc">
+dn: uuid=<value-of select="@uuid"/>,<value-of select="$basedn"/>
+objectClass: PTLRPC
+lustreName: <value-of select="@name"/>
+uuid: <value-of select="@uuid"/>
+<text>
+</text>
+</template>
 
 <template match="ldlm_ref">
 ldlmRef: <value-of select="@uuidref"/>
 </template>
 
+<template match="ptlrpc_ref">
+ptlrpcRef: <value-of select="@uuidref"/>
+</template>
+
 <template match="obd_ref">
 obdRef: <value-of select="@uuidref"/>
 </template>
 
+<template match="osd_ref">
+osdRef: <value-of select="@uuidref"/>
+</template>
+
 <template match="ost_ref">
 ostRef: <value-of select="@uuidref"/>
 </template>
index 8629444..d0cfdac 100644 (file)
@@ -1,4 +1,4 @@
 dn: fs=lustre
 fs:lustre
 objectClass: lustre
-desc: Lustre Config
+lustreDesc: Lustre Config
index d51fb40..6384d30 100644 (file)
@@ -20,6 +20,10 @@ bad_cc() {
        echo "  please get an updated compiler."
        AC_MSG_ERROR(sorry)
 }
+TMP_VERSION=`echo $CC_VERSION | cut -c 1-16`
+if test "$TMP_VERSION" = "gcc version 2.95"; then
+        bad_cc
+fi
 case "$CC_VERSION" in 
        # ost_pack_niobuf putting 64bit NTOH temporaries on the stack
        # without "sub    $0xc,%esp" to protect the stack from being
@@ -53,6 +57,17 @@ fi
 AC_SUBST(LIBREADLINE)
 AC_SUBST(HAVE_LIBREADLINE)
 
+# XXX this should be a runtime option
+AC_ARG_ENABLE(ost_recovery, [  --enable-ost-recovery: enable support for ost recovery],,
+             enable_ost_recovery="yes")
+if test "$enable_ost_recovery" = "yes" ; then
+   ENABLE_OST_RECOVERY="-DOST_RECOVERY=1"
+else 
+   HAVE_LIBREADLINE=""
+fi
+AC_SUBST(ENABLE_OST_RECOVERY)
+
+
 # Kernel build environment.
 ac_default_prefix=
 bindir='${exec_prefix}/usr/bin'
@@ -65,22 +80,18 @@ AC_ARG_ENABLE(linuxdir, [  --enable-linuxdir=[path] (deprecated) set path to Lin
 LINUX=$enable_linuxdir
 AC_SUBST(LINUX)
 
-sinclude(archdep.m4)
-
 AC_MSG_CHECKING(if you are running linux 2.5...)
 if test -e $LINUX/include/linux/namei.h ; then
-       linux25=yes
+       linux25="yes"
        AC_MSG_RESULT(yes)
 else
-       linux25=no
+       linux25="no"
        AC_MSG_RESULT(no)
 fi
 AM_CONDITIONAL(LINUX25, test x$linux25 = xyes)
 
-# Changed by Amrut Joshi on 01/13/2003
-#KINCFLAGS='-I. -I$(top_srcdir)/include -I$(PORTALS)/include -I$(LINUX)/include'
-KINCFLAGS='-I$(top_srcdir)/include -I$(PORTALS)/include -I$(LINUX)/include'
-CPPFLAGS="$KINCFLAGS $ARCHCPPFLAGS"
+sinclude(archdep.m4)
+
 
 portalsdir_def='$(top_srcdir)/../portals'
 AC_ARG_WITH(portals, [  --with-portals=[path] set path to Portals source (default=../portals)], enable_portalsdir=$withval)
@@ -104,12 +115,27 @@ AC_ARG_ENABLE(portalslib, [  --enable-portalslib=[path] (deprecated) set path to
 if ! test -z "$enable_portalslib"; then
        PORTALSLIB=${enable_portalslib}
 fi
+AC_SUBST(PORTALSLIB)
 
+AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib)
+AC_MSG_CHECKING(if you are building lib lustre)
+if test "$host_cpu" = "lib"; then
+   AC_MSG_RESULT(yes)
+   libdir='${exec_prefix}/lib/lustre'
+else
+   AC_MSG_RESULT(no)
+fi
 
-AC_SUBST(PORTALSLIB)
+if test $host_cpu != "lib" ; then 
+KINCFLAGS='-I$(top_srcdir)/include -I$(PORTALS)/include -I$(LINUX)/include'
+else
+KINCFLAGS='-I$(top_srcdir)/include -I$(PORTALS)/include'
+fi
+CPPFLAGS="$KINCFLAGS $ARCHCPPFLAGS"
 
-AC_MSG_CHECKING(if make dep has been run in kernel source)
-if test -f $LINUX/include/linux/config.h ; then
+if test $host_cpu != "lib" ; then 
+AC_MSG_CHECKING(if make dep has been run in kernel source (host $host_cpu) )
+if  test -f $LINUX/include/linux/config.h ; then
        AC_MSG_RESULT(yes)
 else
        AC_MSG_ERROR(** cannot find $LINUX/include/linux/config.h. Run make dep in $LINUX.)
@@ -122,7 +148,6 @@ else
        AC_MSG_ERROR(** cannot find $LINUX/include/linux/autoconf.h. Run make config in $LINUX.)
 fi
 
-
 AC_MSG_CHECKING(for Linux release)
 
 dnl We need to rid ourselves of the nasty [ ] quotes.
@@ -140,6 +165,7 @@ AC_SUBST(modulefsdir)
 AC_MSG_RESULT($RELEASE)
 AC_SUBST(RELEASE)
 
+fi
 # Directories for documentation and demos.
 docdir='${prefix}/usr/share/doc/$(PACKAGE)'
 AC_SUBST(docdir)
@@ -151,6 +177,7 @@ AC_SUBST(demodir)
 # AM_CONFIG_HEADER(include/config.h)
 
 AC_OUTPUT(Makefile lib/Makefile ldlm/Makefile obdecho/Makefile ptlrpc/Makefile \
+       liblustre/Makefile \
        lov/Makefile osc/Makefile mdc/Makefile mds/Makefile ost/Makefile \
        cobd/Makefile ptlbd/Makefile conf/Makefile \
        utils/Makefile utils/lconf tests/Makefile obdfilter/Makefile \
index 3fc2b66..d1de59b 100644 (file)
@@ -15,9 +15,9 @@ EXTRA_PROGRAMS = extN
 #       (or other RH < 12.5 kernels) use the "chaos22" patch instead.
 EXTN_FIXES = patch-2.4.18-chaos22
 #EXTN_FIXES = ext3-2.4.18-fixes.diff
-EXTNP = htree-ext3-2.4.18.diff linux-2.4.18ea-0.8.26.diff
+EXTNP = htree-ext3-2.4.18.diff linux-2.4.18ea-0.8.26.diff ext3-2.4-ino_t.diff
 EXTNP+= ext3-2.4.18-ino_sb_macro.diff extN-misc-fixup.diff extN-noread.diff
-EXTNP+= extN-wantedi.diff
+EXTNP+= extN-wantedi.diff extN-san.diff extN-2.4.18-ino_sb_fixup.diff
 #EXTNP+= extN-iget-debug.diff
 EXTNC = balloc.c bitmap.c dir.c file.c fsync.c ialloc.c inode.c ioctl.c
 EXTNC+= namei.c super.c symlink.c
@@ -27,8 +27,8 @@ EXTN_EXTRA += include/linux/quotaops.h
 extN_SOURCES = $(EXTNC) xattr.c # punch.c
 extN_DEPENDENCIES = patch-stamp
 EXTRA_DIST = $(EXTNP) $(EXTN_FIXES) \
-       extN-2.4.18-ino_sb_fixup.diff extN-2.4.18-exports.diff \
-       $(wildcard extN.patch-*)
+       ext3-largefile.diff extN-2.4.18-exports.diff \
+       ext3-use-after-free.diff ext3-unmount_sync.diff $(wildcard extN.patch-*)
 DISTCLEANFILES = -r $(extN_SOURCES) sed-stamp patch-stamp *.orig *.rej
 SUB=-e "s/ext3/extN/g" -e "s/EXT3/EXTN/g" -e "s/extern __inline__/static inline/"
 
@@ -80,14 +80,14 @@ sed-stamp:
        rm -rf $(extN_orig) $(extN_include_orig)
        mkdir $(extN_orig) $(extN_include_orig)
        list='$(EXTNC)'; for f in $$list; do                                  \
-          echo "creating $(extN_orig)/$$f";                            \
-          sed $(SUB) $(LINUX)/fs/ext3/$$f > $(extN_orig)/$$f; \
-        done
+          echo "creating $(extN_orig)/$$f";                                   \
+          sed $(SUB) $(LINUX)/fs/ext3/$$f > $(extN_orig)/$$f;                 \
+       done
        list='$(EXTNI)'; for i in $$list; do                                  \
           s=`echo $$i | sed "s/extN/ext3/"`;                                  \
-          echo "creating $(extN_include_orig)/$$i"; \
-          sed $(SUB) $(LINUX)/include/linux/$$s > $(extN_include_orig)/$$i; \
-        done
+          echo "creating $(extN_include_orig)/$$i";                           \
+          sed $(SUB) $(LINUX)/include/linux/$$s > $(extN_include_orig)/$$i;   \
+       done
        echo timestamp > $@
 
 
@@ -117,18 +117,15 @@ patch-stamp: sed-stamp $(EXTNP)
            grep -q extN_mark_inode_dirty && list="$(EXTN_FIXES) $$list";     \
          grep -q "if (do_sync_supers)" $(extN_orig)/super.c &&               \
            list="ext3-unmount_sync.diff $$list";                             \
+         grep -q "ext3_journal_start(inode, 2)" $(extN_orig)/inode.c ||      \
+           list="ext3-largefile.diff $$list";                                \
+         grep -q "EXPORT_SYMBOL(extN_bread)" $(extN_orig)/super.c ||         \
+           list="$$list extN-2.4.18-exports.diff";                           \
          for p in $$list; do                                                 \
            echo "applying patch $$p";                                        \
            sed $(SUB) $(srcdir)/$$p |                                        \
              (cd $(top_builddir) && patch -p1) || exit $$?;                  \
          done;                                                               \
-         echo "It is OK if the next patch says it is skipping this patch";   \
-         echo "applying patch $(srcdir)/extN-2.4.18-exports.diff";           \
-         (cd $(top_builddir) &&                                              \
-           patch -N -p1) < $(srcdir)/extN-2.4.18-exports.diff;               \
-         echo "applying patch $(srcdir)/extN-2.4.18-ino_sb_fix.diff";        \
-         (cd $(top_builddir) &&                                              \
-           patch -p1) < $(srcdir)/extN-2.4.18-ino_sb_fixup.diff || exit $$?; \
        fi
        echo timestamp > $@
 
diff --git a/lustre/extN/ext3-2.4-ino_t.diff b/lustre/extN/ext3-2.4-ino_t.diff
new file mode 100644 (file)
index 0000000..ce1bd88
--- /dev/null
@@ -0,0 +1,138 @@
+--- linux/fs/ext3/ialloc.c.orig        Sat Oct 19 11:42:23 2002
++++ linux/fs/ext3/ialloc.c     Sat Jan  4 12:14:18 2003
+@@ -64,8 +64,8 @@ static int read_inode_bitmap (struct sup
+       if (!bh) {
+               ext3_error (sb, "read_inode_bitmap",
+                           "Cannot read inode bitmap - "
+-                          "block_group = %lu, inode_bitmap = %lu",
+-                          block_group, (unsigned long) gdp->bg_inode_bitmap);
++                          "block_group = %lu, inode_bitmap = %u",
++                          block_group, gdp->bg_inode_bitmap);
+               retval = -EIO;
+       }
+       /*
+@@ -531,19 +532,19 @@ out:
+ }
+ /* Verify that we are loading a valid orphan from disk */
+-struct inode *ext3_orphan_get (struct super_block * sb, ino_t ino)
++struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino)
+ {
+-      ino_t max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count);
++      unsigned long max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count);
+       unsigned long block_group;
+       int bit;
+       int bitmap_nr;
+       struct buffer_head *bh;
+       struct inode *inode = NULL;
+-      
++
+       /* Error cases - e2fsck has already cleaned up for us */
+       if (ino > max_ino) {
+               ext3_warning(sb, __FUNCTION__,
+-                           "bad orphan ino %ld!  e2fsck was run?\n", ino);
++                           "bad orphan ino %lu!  e2fsck was run?\n", ino);
+               return NULL;
+       }
+@@ -552,7 +553,7 @@ struct inode *ext3_orphan_get (struct su
+       if ((bitmap_nr = load_inode_bitmap(sb, block_group)) < 0 ||
+           !(bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr])) {
+               ext3_warning(sb, __FUNCTION__,
+-                           "inode bitmap error for orphan %ld\n", ino);
++                           "inode bitmap error for orphan %lu\n", ino);
+               return NULL;
+       }
+@@ -563,7 +564,7 @@ struct inode *ext3_orphan_get (struct su
+       if (!ext3_test_bit(bit, bh->b_data) || !(inode = iget(sb, ino)) ||
+           is_bad_inode(inode) || NEXT_ORPHAN(inode) > max_ino) {
+               ext3_warning(sb, __FUNCTION__,
+-                           "bad orphan inode %ld!  e2fsck was run?\n", ino);
++                           "bad orphan inode %lu!  e2fsck was run?\n", ino);
+               printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%ld) = %d\n",
+                      bit, bh->b_blocknr, ext3_test_bit(bit, bh->b_data));
+               printk(KERN_NOTICE "inode=%p\n", inode);
+@@ -570,9 +571,9 @@ struct inode *ext3_orphan_get (struct su
+               if (inode) {
+                       printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
+                              is_bad_inode(inode));
+-                      printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%d\n",
++                      printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
+                              NEXT_ORPHAN(inode));
+-                      printk(KERN_NOTICE "max_ino=%ld\n", max_ino);
++                      printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
+               }
+               /* Avoid freeing blocks if we got a bad deleted inode */
+               if (inode && inode->i_nlink == 0)
+--- linux/fs/ext3/namei.c.orig Sat Oct 19 11:42:45 2002
++++ linux/fs/ext3/namei.c      Sat Jan  4 12:13:27 2003
+@@ -716,10 +716,10 @@ int ext3_orphan_del(handle_t *handle, st
+ {
+       struct list_head *prev;
+       struct ext3_sb_info *sbi;
+-      ino_t ino_next; 
++      unsigned long ino_next;
+       struct ext3_iloc iloc;
+       int err = 0;
+-      
++
+       lock_super(inode->i_sb);
+       if (list_empty(&inode->u.ext3_i.i_orphan)) {
+               unlock_super(inode->i_sb);
+@@ -730,7 +730,7 @@ int ext3_orphan_del(handle_t *handle, st
+       prev = inode->u.ext3_i.i_orphan.prev;
+       sbi = EXT3_SB(inode->i_sb);
+-      jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino);
++      jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
+       list_del(&inode->u.ext3_i.i_orphan);
+       INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
+@@ -741,13 +741,13 @@ int ext3_orphan_del(handle_t *handle, st
+        * list in memory. */
+       if (!handle)
+               goto out;
+-      
++
+       err = ext3_reserve_inode_write(handle, inode, &iloc);
+       if (err)
+               goto out_err;
+       if (prev == &sbi->s_orphan) {
+-              jbd_debug(4, "superblock will point to %ld\n", ino_next);
++              jbd_debug(4, "superblock will point to %lu\n", ino_next);
+               BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+               err = ext3_journal_get_write_access(handle, sbi->s_sbh);
+               if (err)
+@@ -758,8 +758,8 @@ int ext3_orphan_del(handle_t *handle, st
+               struct ext3_iloc iloc2;
+               struct inode *i_prev =
+                       list_entry(prev, struct inode, u.ext3_i.i_orphan);
+-              
+-              jbd_debug(4, "orphan inode %ld will point to %ld\n",
++
++              jbd_debug(4, "orphan inode %lu will point to %lu\n",
+                         i_prev->i_ino, ino_next);
+               err = ext3_reserve_inode_write(handle, i_prev, &iloc2);
+               if (err)
+@@ -774,7 +774,7 @@ int ext3_orphan_del(handle_t *handle, st
+       if (err)
+               goto out_brelse;
+-out_err:      
++out_err: 
+       ext3_std_error(inode->i_sb, err);
+ out:
+       unlock_super(inode->i_sb);
+--- linux/include/linux/ext3_fs.h.orig Thu Jan  2 16:10:24 2003
++++ linux/include/linux/ext3_fs.h      Sat Jan  4 12:25:41 2003
+@@ -622,7 +622,7 @@ extern int ext3_sync_file (struct file *
+ /* ialloc.c */
+ extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int);
+ extern void ext3_free_inode (handle_t *, struct inode *);
+-extern struct inode * ext3_orphan_get (struct super_block *, ino_t);
++extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
+ extern unsigned long ext3_count_free_inodes (struct super_block *);
+ extern void ext3_check_inodes_bitmap (struct super_block *);
+ extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
index a49d5da..cc47588 100644 (file)
        struct list_head *prev;
 +      struct ext3_inode_info *ei = EXT3_I(inode);
        struct ext3_sb_info *sbi;
-       ino_t ino_next; 
+       unsigned long ino_next;
        struct ext3_iloc iloc;
        int err = 0;
-       
        lock_super(inode->i_sb);
 -      if (list_empty(&inode->u.ext3_i.i_orphan)) {
 +      if (list_empty(&ei->i_orphan)) {
 +      prev = ei->i_orphan.prev;
        sbi = EXT3_SB(inode->i_sb);
  
-       jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino);
+       jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
  
 -      list_del(&inode->u.ext3_i.i_orphan);
 -      INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
  
        /* If we're on an error path, we may not have a valid
         * transaction handle with which to update the orphan list on
-@@ -1520,9 +1520,8 @@ int ext3_orphan_del(handle_t *handle, st
+@@ -1520,8 +1520,7 @@ int ext3_orphan_del(handle_t *handle, st
                err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
        } else {
                struct ext3_iloc iloc2;
 -              struct inode *i_prev =
 -                      list_entry(prev, struct inode, u.ext3_i.i_orphan);
--              
 +              struct inode *i_prev = orphan_list_entry(prev);
-+
-               jbd_debug(4, "orphan inode %ld will point to %ld\n",
+               jbd_debug(4, "orphan inode %lu will point to %lu\n",
                          i_prev->i_ino, ino_next);
-               err = ext3_reserve_inode_write(handle, i_prev, &iloc2);
 @@ -1695,10 +1695,10 @@ static int ext3_symlink (struct inode * 
                        goto out_no_entry;
        } else {
diff --git a/lustre/extN/ext3-largefile.diff b/lustre/extN/ext3-largefile.diff
new file mode 100644 (file)
index 0000000..db41aab
--- /dev/null
@@ -0,0 +1,23 @@
+Under rare conditions (filesystem corruption, really) it is possible
+for ext3_dirty_inode() to require _two_ blocks for the transaction: one
+for the inode and one to update the superblock - to set
+EXT3_FEATURE_RO_COMPAT_LARGE_FILE.  This causes the filesystem to go
+BUG.
+
+So reserve an additional block for that eventuality.
+
+
+ fs/ext3/inode.c |    2 +-
+ 1 files changed, 1 insertion(+), 1 deletion(-)
+
+--- 25/fs/ext3/inode.c~ext3-transaction-reserved-blocks        Sat Dec 14 18:28:21 2002
++++ 25-akpm/fs/ext3/inode.c    Sat Dec 14 18:28:21 2002
+@@ -2698,7 +2698,7 @@ void ext3_dirty_inode(struct inode *inod
+       handle_t *handle;
+       lock_kernel();
+-      handle = ext3_journal_start(inode, 1);
++      handle = ext3_journal_start(inode, 2);
+       if (IS_ERR(handle))
+               goto out;
+       if (current_handle &&
diff --git a/lustre/extN/extN-san.diff b/lustre/extN/extN-san.diff
new file mode 100644 (file)
index 0000000..4d0f277
--- /dev/null
@@ -0,0 +1,88 @@
+--- lustre/extN/inode.orig.c   2002-12-29 18:48:56.000000000 +0800
++++ lustre/extN/inode.c        2002-12-29 19:17:24.000000000 +0800
+@@ -2728,3 +2728,85 @@
+  * here, in extN_aops_journal_start() to ensure that the forthcoming "see if we
+  * need to extend" test in extN_prepare_write() succeeds.  
+  */
++
++/* Journal credits to reserve for a SAN write that allocates @nblocks blocks.
++ *
++ * for each block: 1 ind + 1 dind + 1 tind
++ * for each block: 3 bitmap blocks
++ * for each block: 3 group descriptor blocks
++ * 1 inode block
++ * 1 superblock
++ * 2 * EXTN_SINGLEDATA_TRANS_BLOCKS for the quota files
++ *   (only added when CONFIG_QUOTA is defined)
++ * ((1+1+1) * 3 * nblocks) + 1 + 1 + 2 * EXTN_SINGLEDATA_TRANS_BLOCKS
++ *
++ * XXX assuming:
++ * (1) fs logical block size == page size
++ * (2) extN in writeback mode
++ */
++static inline int extN_san_write_trans_blocks(int nblocks)
++{
++      int ret;
++      
++      /* per-block metadata, plus one inode block and one superblock */
++      ret = (1 + 1 + 1) * 3 * nblocks + 1 + 1;
++
++#ifdef CONFIG_QUOTA
++      ret += 2 * EXTN_SINGLEDATA_TRANS_BLOCKS;
++#endif
++
++      return ret;
++}
++
++/* Allocate blocks for an inode without creating any buffer/page
++ * for data I/O; set the inode size if the file is extended.
++ *
++ * @inode:    target inode
++ * @blocks:   array of logical block numbers; each entry is overwritten
++ *            in place with the physical block number that was allocated
++ * @nblocks:  how many blocks need to be allocated
++ * @newsize:  new file size; applied only if larger than inode->i_size
++ *
++ * return:    0 on success, otherwise the first error encountered.
++ *            If allocation stops part-way, the entries of @blocks before
++ *            the failing one already hold physical block numbers.
++ *
++ * XXX this assumes the fs block size == page size
++ */
++int extN_prep_san_write(struct inode *inode, long *blocks,
++                      int nblocks, loff_t newsize)
++{
++      handle_t *handle;
++      struct buffer_head bh_tmp;
++      int needed_blocks;
++      int i, ret = 0, ret2;
++
++      needed_blocks = extN_san_write_trans_blocks(nblocks);
++
++      /* the journal start/stop calls are made under the big kernel lock */
++      lock_kernel();
++      handle = extN_journal_start(inode, needed_blocks);
++      if (IS_ERR(handle)) {
++              unlock_kernel();
++              return PTR_ERR(handle);
++      }
++      unlock_kernel();
++
++      /* alloc blocks one by one */
++      for (i = 0; i < nblocks; i++) {
++              /* NOTE(review): the trailing 1 is presumably the "create"
++               * flag of extN_get_block_handle() -- confirm */
++              ret = extN_get_block_handle(handle, inode, blocks[i],
++                                              &bh_tmp, 1);
++              if (ret)
++                      break;
++
++              /* replace the logical block number with the physical one */
++              blocks[i] = bh_tmp.b_blocknr;
++      }
++
++      /* set inode size if needed */
++      if (!ret && (newsize > inode->i_size)) {
++              inode->i_size = newsize;
++              extN_mark_inode_dirty(handle, inode);
++      }
++
++      /* always stop the handle, even when allocation failed */
++      lock_kernel();
++      ret2 = extN_journal_stop(handle, inode);
++      unlock_kernel();
++
++      /* an allocation error takes precedence over a journal-stop error */
++      if (!ret)
++              ret = ret2;
++      return ret;
++}
++EXPORT_SYMBOL(extN_prep_san_write);
diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h
new file mode 100644 (file)
index 0000000..0b37021
--- /dev/null
@@ -0,0 +1,431 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <info@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * User-space Lustre headers.
+ *
+ */
+#ifndef LIBLUSTRE_H__
+#define LIBLUSTRE_H__
+
+#include <sys/mman.h>
+#include <asm/page.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/vfs.h>
+
+#include <portals/list.h>
+#include <portals/p30.h>
+
+/* definitions for liblustre */
+
+/* always adopt 2.5 definitions */
+#define LINUX_VERSION_CODE 1
+#define KERNEL_VERSION(a,b,c) 0
+
+static inline void inter_module_put(void *a)
+{
+        return;
+}
+
+extern ptl_handle_ni_t         tcpnal_ni;
+
+static inline void *inter_module_get(char *arg)
+{
+
+        if (strcmp(arg, "tcpnal_ni") == 0 )
+                return &tcpnal_ni;
+        else
+                return NULL;
+
+}
+
+
+/* cheats for now */
+
+struct work_struct {
+        void (*ws_task)(void *arg);
+        void *ws_arg;
+};
+
+static inline void prepare_work(struct work_struct *q, void (*t)(void *),
+                                void *arg)
+{
+        q->ws_task = t;
+        q->ws_arg = arg;
+        return;
+}
+
+static inline void schedule_work(struct work_struct *q)
+{
+        q->ws_task(q->ws_arg);
+}
+
+
+#define strnlen(a,b) strlen(a)
+static inline void *kmalloc(int size, int prot)
+{
+        return malloc(size);
+}
+#define vmalloc malloc
+#define vfree free
+#define kfree(a) free(a)
+#define GFP_KERNEL 1
+#define GFP_HIGHUSER 1
+#define IS_ERR(a) (abs((int)(a)) < 500 ? 1 : 0)
+#define PTR_ERR(a) ((int)(a))
+
+#define capable(foo) 1
+#define CAP_SYS_ADMIN 1
+
+typedef struct {
+        void *cwd;
+
+}mm_segment_t;
+
+typedef void *read_proc_t;
+typedef void *write_proc_t;
+
+
+/* modules */
+
+struct module {
+        int count;
+};
+
+static inline void MODULE_AUTHOR(char *name)
+{
+        printf("%s\n", name);
+}
+#define MODULE_DESCRIPTION(name) MODULE_AUTHOR(name)
+#define MODULE_LICENSE(name) MODULE_AUTHOR(name)
+
+#define THIS_MODULE NULL
+#define __init
+#define __exit
+
+/* devices */
+
+static inline int misc_register(void *foo)
+{
+        return 0;
+}
+#define misc_deregister misc_register
+
+#define __MOD_INC_USE_COUNT(m)  do {int a = 1; a++; } while (0)
+#define __MOD_DEC_USE_COUNT(m)  do {int a = 1; a++; } while (0)
+#define MOD_INC_USE_COUNT  do {int a = 1; a++; } while (0)
+#define MOD_DEC_USE_COUNT  do {int a = 1; a++; } while (0)
+
+/* module initialization */
+extern int init_obdclass(void);
+extern int ptlrpc_init(void);
+extern int ldlm_init(void);
+extern int osc_init(void);
+extern int lov_init(void);
+extern int echo_client_init(void);
+
+
+
+/* general stuff */
+#define jiffies 0
+
+#define EXPORT_SYMBOL(S)
+
+typedef int spinlock_t;
+typedef __u64 kdev_t;
+
+#define SPIN_LOCK_UNLOCKED 0
+#define spin_lock(l) do {int a = 1; a++; } while (0)
+#define spin_unlock(l) do {int a= 1; a++; } while (0)
+#define spin_lock_init(l) do {int a= 1; a++; } while (0)
+static inline void spin_lock_bh(spinlock_t *l)
+{
+        return;
+}
+static inline void spin_unlock_bh(spinlock_t *l)
+{
+        return;
+}
+/* No-op interrupt-safe lock stubs for single-threaded user space.
+ * The parameters are typed explicitly (lock pointer, saved flags) rather
+ * than left as implicit int, which is invalid since C99 and mismatched
+ * the pointer that callers pass as the first argument. */
+static inline void spin_lock_irqrestore(spinlock_t *a, unsigned long b)
+{
+        (void)a;
+        (void)b;
+}
+static inline void spin_unlock_irqrestore(spinlock_t *a, unsigned long b)
+{
+        (void)a;
+        (void)b;
+}
+static inline void spin_lock_irqsave(spinlock_t *a, unsigned long b)
+{
+        (void)a;
+        (void)b;
+}
+
+#define barrier() do {int a= 1; a++; } while (0)
+
+/* registering symbols */
+
+#define ERESTARTSYS ERESTART
+#define HZ 1
+
+/* random */
+
+/* Fill @size bytes at @ptr with pseudo-random data from rand().
+ *
+ * Every byte of the buffer is written, including buffers shorter than
+ * an int and any trailing bytes when @size is not a multiple of
+ * sizeof(int).  A non-positive @size writes nothing.
+ * This is NOT cryptographically strong; it is a user-space stand-in. */
+static inline void get_random_bytes(void *ptr, int size)
+{
+        unsigned char *p = ptr;
+        int i;
+
+        for (i = 0; i < size; i++)
+                p[i] = (unsigned char)(rand() & 0xff);
+}
+
+/* memory */
+
+static inline int copy_from_user(void *a,void *b, int c)
+{
+        memcpy(a,b,c);
+        return 0;
+}
+
+static inline int copy_to_user(void *a,void *b, int c)
+{
+        memcpy(a,b,c);
+        return 0;
+}
+
+
+/* slabs */
+typedef struct {
+         int size;
+} kmem_cache_t;
+#define SLAB_HWCACHE_ALIGN 0
+/* User-space stand-in for the kernel slab constructor.
+ *
+ * Only @objsize is recorded; @name, the alignment offset @cdum, the
+ * flags @d and the constructor/destructor callbacks @e/@f are accepted
+ * for source compatibility and ignored.
+ *
+ * Returns a malloc()ed cache descriptor, or NULL on allocation failure.
+ * Release it with kmem_cache_destroy(). */
+static inline kmem_cache_t *
+kmem_cache_create(const char *name, size_t objsize, size_t cdum,
+                  unsigned long d,
+                  void (*e)(void *, kmem_cache_t *, unsigned long),
+                  void (*f)(void *, kmem_cache_t *, unsigned long))
+{
+        kmem_cache_t *c;
+
+        (void)name;
+        (void)cdum;
+        (void)d;
+        (void)e;
+        (void)f;
+
+        c = malloc(sizeof(*c));
+        if (!c)
+                return NULL;
+        c->size = (int)objsize;
+        return c;
+}
+
+static inline int kmem_cache_destroy(kmem_cache_t *a)
+{
+        free(a);
+        return 0;
+}
+#define kmem_cache_validate(a,b) 1
+#define kmem_cache_alloc(cache, prio) malloc(cache->size)
+#define kmem_cache_free(cache, obj) OBD_FREE(obj, cache->size)
+/* Allocate one object from @cache into @lock; @size is unused here. */
+#define PORTAL_SLAB_ALLOC(lock,cache,size) do { lock = kmem_cache_alloc(cache,GFP_KERNEL); } while (0)
+/* Return @lock to @cache (previously this allocated a second object
+ * instead of freeing the first); @size is unused here. */
+#define PORTAL_SLAB_FREE(lock,cache,size) do { kmem_cache_free(cache,lock); } while (0)
+
+struct page {
+        void *addr;
+        int index;
+};
+
+#define kmap(page) (page)->addr
+#define kunmap(a) do { int foo = 1; foo++; } while (0)
+
+/* Allocate a struct page descriptor plus one PAGE_SIZE data area.
+ *
+ * @mask and @foo are accepted for source compatibility and ignored;
+ * exactly one page of memory is provided regardless of @foo.
+ * pg->index is left unset.
+ *
+ * Returns the new page, or NULL if either allocation fails.
+ * Release with __free_pages(). */
+static inline struct page *alloc_pages(int mask, int foo)
+{
+        struct page *pg = malloc(sizeof(*pg));
+
+        (void)mask;
+        (void)foo;
+
+        if (!pg)
+                return NULL;
+#ifdef MAP_ANONYMOUS
+        /* An anonymous mapping needs MAP_PRIVATE (or MAP_SHARED) and
+         * fd == -1; the page must be readable as well as writable. */
+        pg->addr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE,
+                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+        /* mmap() reports failure as MAP_FAILED, not NULL */
+        if (pg->addr == MAP_FAILED)
+                pg->addr = NULL;
+#else
+        pg->addr = malloc(PAGE_SIZE);
+#endif
+
+        if (!pg->addr) {
+                free(pg);
+                return NULL;
+        }
+        return pg;
+}
+
+static inline void __free_pages(struct page *pg, int what)
+{
+#ifdef MAP_ANONYMOUS
+        munmap(pg->addr, PAGE_SIZE);
+#else
+        free(pg->addr);
+#endif
+        free(pg);
+}
+
+/* arithmetic */
+#define do_div(a,b) (a)/(b)
+
+/* dentries / intents */
+struct lookup_intent {
+        void *it_iattr;
+};
+
+struct iattr {
+        int mode;
+};
+
+struct dentry {
+        int d_count;
+};
+struct file {
+        struct dentry *f_dentry;
+        void *private_data;
+} ;
+
+struct vfsmount {
+        void *pwd;
+};
+#define cpu_to_le32(x) ((__u32)(x))
+
+/* semaphores */
+struct semaphore {
+        int count;
+};
+
+#define down(a) do {(a)->count++;} while (0)
+#define up(a) do {(a)->count--;} while (0)
+#define sema_init(a,b) do { (a)->count = b; } while (0)
+
+typedef struct  {
+        struct list_head sleepers;
+} wait_queue_head_t;
+
+typedef struct  {
+        struct list_head sleeping;
+        void *process;
+} wait_queue_t;
+
+struct signal {
+        int signal;
+};
+
+struct task_struct {
+        int state;
+        struct signal pending;
+        char comm[32];
+        int pid;
+};
+
+extern struct task_struct *current;
+
+
+
+#define set_current_state(foo) do { current->state = foo; } while (0)
+
+/* Wait-queue entry helpers: q is a wait_queue_head_t *, p a wait_queue_t *
+ * (for init_waitqueue_entry, q is the entry and p the owning process). */
+#define init_waitqueue_entry(q,p) do { (q)->process = p; } while (0)
+/* list_add(new, head): link the entry onto the queue head's list.  The
+ * arguments used to be reversed, which linked the queue head onto the
+ * entry's uninitialised list node. */
+#define add_wait_queue(q,p) do { list_add(&(p)->sleeping, &(q)->sleepers); } while (0)
+#define del_wait_queue(p) do { list_del(&(p)->sleeping); } while (0)
+#define remove_wait_queue(q,p) do { list_del(&(p)->sleeping); } while (0)
+
+#define init_waitqueue_head(l) INIT_LIST_HEAD(&(l)->sleepers)
+/* No-op scheduling stubs.  They expand to an empty statement rather than
+ * incrementing an uninitialised local, which was undefined behaviour. */
+#define wake_up(l) do { } while (0)
+#define wait_event(l,m) do { } while (0)
+#define TASK_INTERRUPTIBLE 0
+#define TASK_UNINTERRUPTIBLE 1
+#define TASK_RUNNING 2
+
+
+#define schedule() do { } while (0)
+/* User-space stub: never sleeps and always reports 0.  The timeout @t
+ * is ignored; it is now typed explicitly instead of implicit int. */
+static inline int schedule_timeout(long t)
+{
+        (void)t;
+        return 0;
+}
+
+/* No-op stubs for kernel services that have no user-space meaning.
+ * They expand to an empty statement rather than incrementing an
+ * uninitialised local, which was undefined behaviour. */
+#define lock_kernel() do { } while (0)
+#define daemonize(l) do { } while (0)
+#define sigfillset(l) do { } while (0)
+#define recalc_sigpending(l) do { } while (0)
+#define kernel_thread(l,m,n)
+
+static inline int call_usermodehelper(char *prog, char **argv, char **evnp)
+{
+        return 0;
+}
+
+
+
+#define KERN_INFO
+
+
+
+struct timer_list {
+        struct list_head tl_list;
+        void (*function)(unsigned long unused);
+        void *data;
+        int expires;
+};
+
+static inline int timer_pending(struct timer_list *l)
+{
+        if (l->expires > jiffies)
+                return 1;
+        else
+                return 0;
+}
+
+static inline int init_timer(struct timer_list *l)
+{
+        INIT_LIST_HEAD(&l->tl_list);
+        return 0;
+}
+
+static inline void mod_timer(struct timer_list *l, int thetime)
+{
+        l->expires = thetime;
+}
+
+/* Deactivate a timer.
+ *
+ * The timer's storage is owned by the caller (init_timer() in this
+ * header does not allocate it), so it must not be free()d here: doing
+ * so corrupts the heap when the timer is embedded in another object or
+ * lives on the stack.  Clearing 'expires' makes timer_pending() in this
+ * header report the timer as inactive. */
+static inline void del_timer(struct timer_list *l)
+{
+        l->expires = 0;
+}
+
+typedef struct { volatile int counter; } atomic_t;
+
+#define atomic_read(a) ((a)->counter)
+#define atomic_set(a,b) do {(a)->counter = b; } while (0)
+#define atomic_dec_and_test(a) ((--((a)->counter)) == 0)
+#define atomic_inc(a)  (((a)->counter)++)
+#define atomic_dec(a)  do { (a)->counter--; } while (0)
+#define atomic_add(b,a)  do {(a)->counter += b;} while (0)
+#define atomic_sub(b,a)  do {(a)->counter -= b;} while (0)
+
+#define LBUG() do { sleep(1000000); } while (0)
+
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_lib.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_export.h>
+#include <linux/lustre_net.h>
+
+
+#endif
+
index 14a713c..cd8f12b 100644 (file)
 #ifndef _LPROCFS_SNMP_H
 #define _LPROCFS_SNMP_H
 
+#ifdef __KERNEL__
 #include <linux/autoconf.h>
 #include <linux/proc_fs.h>
+#endif
 
 #ifndef LPROCFS
 #ifdef  CONFIG_PROC_FS  /* Ensure that /proc is configured */
@@ -48,9 +50,6 @@ struct lprocfs_static_vars {
 /* class_obd.c */
 extern struct proc_dir_entry *proc_lustre_root;
 
-extern void lprocfs_init_vars(struct lprocfs_static_vars *var);
-extern void lprocfs_init_multi_vars(unsigned int idx, 
-                                    struct lprocfs_static_vars *var);
 
 #define LPROCFS_INIT_MULTI_VARS(array, size)                              \
 void lprocfs_init_multi_vars(unsigned int idx,                            \
@@ -63,6 +62,7 @@ void lprocfs_init_multi_vars(unsigned int idx,                            \
    x->obd_vars = glob[idx].obd_vars;                                      \
 }                                                                         \
 
+#ifdef LPROCFS
 #define LPROCFS_INIT_VARS(vclass, vinstance)           \
 void lprocfs_init_vars(struct lprocfs_static_vars *x)  \
 {                                                      \
@@ -70,7 +70,9 @@ void lprocfs_init_vars(struct lprocfs_static_vars *x)  \
         x->obd_vars = vinstance;                       \
 }                                                      \
 
-#ifdef LPROCFS
+extern void lprocfs_init_vars(struct lprocfs_static_vars *var);
+extern void lprocfs_init_multi_vars(unsigned int idx, 
+                                    struct lprocfs_static_vars *var);
 /* lprocfs_status.c */
 extern int lprocfs_add_vars(struct proc_dir_entry *root,
                             struct lprocfs_vars *var,
@@ -133,6 +135,7 @@ int fct_name(char *page, char **start, off_t off,                \
 static inline struct proc_dir_entry *
 lprocfs_register(const char *name, struct proc_dir_entry *parent,
                  struct lprocfs_vars *list, void *data) { return NULL; }
+static inline void lprocfs_init_vars(struct lprocfs_static_vars *x) { return; }
 static inline int lprocfs_add_vars(struct proc_dir_entry *root,
                                    struct lprocfs_vars *var,
                                    void *data) { return 0; }
index 9fad75f..c120225 100644 (file)
@@ -7,12 +7,13 @@
 #define _LUSTRE_DLM_H__
 
 #ifdef __KERNEL__
-
 #include <linux/proc_fs.h>
+#endif 
+
 #include <linux/lustre_lib.h>
 #include <linux/lustre_net.h>
 #include <linux/lustre_import.h>
-#include <linux/handles.h>
+#include <linux/lustre_handles.h>
 
 struct obd_ops;
 struct obd_device;
@@ -197,9 +198,6 @@ typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **,
 #define LDLM_MIN_TYPE 10
 #define LDLM_MAX_TYPE 11
 
-extern ldlm_res_compat ldlm_res_compat_table [];
-extern ldlm_res_policy ldlm_res_policy_table [];
-
 struct ldlm_resource {
         struct ldlm_namespace *lr_namespace;
         struct list_head       lr_hash;
@@ -235,8 +233,8 @@ struct ldlm_ast_work {
 
 /* Per-export ldlm state. */
 struct ldlm_export_data {
-        struct list_head        led_held_locks;
-        struct obd_import       led_import;
+        struct list_head       led_held_locks; /* protected by namespace lock */
+        struct obd_import      led_import;
 };
 
 extern struct obd_ops ldlm_obd_ops;
@@ -295,11 +293,9 @@ do {                                                                          \
         }                                                                     \
 } while (0)
 
-/* I hate hate hate hate hate this.  This cannot stay.  bug 850. -phil */
-#define LDLM_DEBUG0(lock, format) __LDLM_DEBUG(D_DLMTRACE, lock, format"%s","")
-
-#define LDLM_DEBUG(lock, format, a...) __LDLM_DEBUG(D_DLMTRACE, lock, format, a)
-#define LDLM_ERROR(lock, format, a...) __LDLM_DEBUG(D_ERROR, lock, format, a)
+#define LDLM_DEBUG(lock, format, a...) __LDLM_DEBUG(D_DLMTRACE, lock, \
+                                                    format, ## a)
+#define LDLM_ERROR(lock, format, a...) __LDLM_DEBUG(D_ERROR, lock, format, ## a)
 
 #define LDLM_DEBUG_NOLOCK(format, a...)                 \
         CDEBUG(D_DLMTRACE, "### " format "\n" , ## a)
@@ -473,7 +469,6 @@ int ldlm_cancel_lru(struct ldlm_namespace *ns);
 int mds_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
                      void *data, int flag);
 
-#endif /* __KERNEL__ */
 
 /* ioctls for trying requests */
 #define IOC_LDLM_TYPE                   'f'
index 74b8dca..694bd3e 100644 (file)
@@ -10,8 +10,6 @@
 #ifndef __EXPORT_H
 #define __EXPORT_H
 
-#ifdef __KERNEL__
-
 #include <linux/lustre_idl.h>
 #include <linux/lustre_dlm.h>
 #include <linux/lustre_mds.h>
@@ -22,10 +20,6 @@ struct lov_export_data {
         struct list_head led_open_head;
 };
 
-struct ost_export_data {
-        struct obd_uuid oed_uuid; /* client UUID */
-};
-
 struct ec_export_data { /* echo client */
         struct list_head eced_open_head;
         struct list_head eced_locks;
@@ -43,7 +37,6 @@ struct obd_export {
                 struct mds_export_data    eu_mds_data;
                 struct filter_export_data eu_filter_data;
                 struct lov_export_data    eu_lov_data;
-                struct ost_export_data    eu_ost_data;
                 struct ec_export_data     eu_ec_data;
         } u;
 };
@@ -51,11 +44,9 @@ struct obd_export {
 #define exp_mds_data    u.eu_mds_data
 #define exp_lov_data    u.eu_lov_data
 #define exp_filter_data u.eu_filter_data
-#define exp_ost_data    u.eu_ost_data
 #define exp_ec_data     u.eu_ec_data
 
 extern struct obd_export *class_conn2export(struct lustre_handle *conn);
 extern struct obd_device *class_conn2obd(struct lustre_handle *conn);
-#endif /* __KERNEL__ */
 
 #endif /* __EXPORT_H */
index 341d082..6b0cbfa 100644 (file)
@@ -57,6 +57,8 @@ struct fsfilt_operations {
                                      void *handle, fsfilt_cb_t cb_func);
         int     (* fs_statfs)(struct super_block *sb, struct obd_statfs *osfs);
         int     (* fs_sync)(struct super_block *sb);
+        int     (* fs_prep_san_write)(struct inode *inode, long *blocks,
+                                      int nblocks, loff_t newsize);
 };
 
 extern int fsfilt_register_ops(struct fsfilt_operations *fs_ops);
@@ -77,6 +79,7 @@ extern void fsfilt_put_ops(struct fsfilt_operations *fs_ops);
 static inline void *fsfilt_start(struct obd_device *obd,
                                  struct inode *inode, int op)
 {
+        ENTRY;
         return obd->obd_fsops->fs_start(inode, op);
 }
 
@@ -91,6 +94,7 @@ static inline int fsfilt_commit(struct obd_device *obd, struct inode *inode,
                                 void *handle)
 {
         return obd->obd_fsops->fs_commit(inode, handle);
+        EXIT;
 }
 
 static inline int fsfilt_setattr(struct obd_device *obd, struct dentry *dentry,
@@ -152,6 +156,15 @@ static inline int fsfilt_sync(struct obd_device *obd, struct super_block *fs)
         return obd->obd_fsops->fs_sync(fs);
 }
 
+static inline int fs_prep_san_write(struct obd_device *obd,
+                                    struct inode *inode,
+                                    long *blocks,
+                                    int nblocks,
+                                    loff_t newsize)
+{
+        return obd->obd_fsops->fs_prep_san_write(inode, blocks,
+                                                 nblocks, newsize);
+}
 #endif /* __KERNEL__ */
 
 #endif
diff --git a/lustre/include/linux/lustre_handles.h b/lustre/include/linux/lustre_handles.h
new file mode 100644 (file)
index 0000000..f644cf1
--- /dev/null
@@ -0,0 +1,39 @@
+#ifndef __LINUX_HANDLES_H_
+#define __LINUX_HANDLES_H_
+
+#ifdef __KERNEL__
+#include <asm/types.h>
+#include <asm/atomic.h>
+#include <linux/list.h>
+#endif
+
+typedef void (*portals_handle_addref_cb)(void *object);
+
+/* These handles are most easily used by having them appear at the very top of
+ * whatever object that you want to make handles for.  ie:
+ *
+ * struct ldlm_lock {
+ *         struct portals_handle handle;
+ *         ...
+ * };
+ *
+ * Now you're able to assign the results of cookie2handle directly to an
+ * ldlm_lock.  If it's not at the top, you'll want to hack up a macro that
+ * uses some offsetof() magic. */
+
+struct portals_handle {
+        struct list_head h_link;
+        __u64 h_cookie;
+        portals_handle_addref_cb h_addref;
+};
+
+/* handles.c */
+
+/* Add a handle to the hash table */
+void class_handle_hash(struct portals_handle *, portals_handle_addref_cb);
+void class_handle_unhash(struct portals_handle *);
+void *class_handle2object(__u64 cookie);
+int class_handle_init(void);
+void class_handle_cleanup(void);
+
+#endif
index 6e11240..3ef86ac 100644 (file)
@@ -29,6 +29,7 @@
 # include <asm/types.h>
 # include <linux/types.h>
 # include <linux/list.h>
+# include <linux/string.h> /* for strncpy, below */
 #else
 # define __KERNEL__
 # include <asm/types.h>
@@ -53,8 +54,8 @@ struct obd_uuid {
 
 static inline void obd_str2uuid(struct obd_uuid *uuid, char *tmp)
 {
-        strncpy(uuid->uuid, tmp, sizeof(uuid->uuid));
-        uuid->uuid[sizeof(uuid->uuid) - 1] = '\0';
+        strncpy(uuid->uuid, tmp, sizeof(*uuid));
+        uuid->uuid[sizeof(*uuid) - 1] = '\0';
 }
 
 /* FOO_REQUEST_PORTAL is for incoming requests on the FOO
@@ -64,17 +65,17 @@ static inline void obd_str2uuid(struct obd_uuid *uuid, char *tmp)
 
 #define CONNMGR_REQUEST_PORTAL  1
 #define CONNMGR_REPLY_PORTAL    2
-#define OSC_REQUEST_PORTAL      3
+//#define OSC_REQUEST_PORTAL      3
 #define OSC_REPLY_PORTAL        4
 #define OSC_BULK_PORTAL         5
 #define OST_REQUEST_PORTAL      6
-#define OST_REPLY_PORTAL        7
+//#define OST_REPLY_PORTAL        7
 #define OST_BULK_PORTAL         8
-#define MDC_REQUEST_PORTAL      9
+//#define MDC_REQUEST_PORTAL      9
 #define MDC_REPLY_PORTAL        10
-#define MDC_BULK_PORTAL         11
+//#define MDC_BULK_PORTAL         11
 #define MDS_REQUEST_PORTAL      12
-#define MDS_REPLY_PORTAL        13
+//#define MDS_REPLY_PORTAL        13
 #define MDS_BULK_PORTAL         14
 #define LDLM_CB_REQUEST_PORTAL     15
 #define LDLM_CB_REPLY_PORTAL       16
@@ -83,7 +84,8 @@ static inline void obd_str2uuid(struct obd_uuid *uuid, char *tmp)
 #define PTLBD_REQUEST_PORTAL           19
 #define PTLBD_REPLY_PORTAL             20
 #define PTLBD_BULK_PORTAL              21
-#define MDS_GETATTR_PORTAL      22
+#define MDS_SETATTR_PORTAL      22
+#define MDS_READPAGE_PORTAL     23
 
 #define SVC_KILLED               1
 #define SVC_EVENT                2
@@ -201,6 +203,9 @@ static inline void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags)
 #define OST_OPEN       11
 #define OST_CLOSE      12
 #define OST_STATFS     13
+#define OST_SAN_READ   14
+#define OST_SAN_WRITE  15
+#define OST_SYNCFS     16
 
 
 typedef uint64_t        obd_id;
@@ -221,6 +226,10 @@ typedef uint32_t        obd_count;
 #define OBD_FL_OBDMDEXISTS      (0x00000002)
 
 #define OBD_INLINESZ    60
+#define FD_OSTDATA_SIZE 32
+#if (FD_OSTDATA_SIZE > OBD_INLINESZ)
+# error FD_OSTDATA_SIZE must be smaller than OBD_INLINESZ
+#endif
 
 /* Note: 64-bit types are 64-bit aligned in structure */
 struct obdo {
@@ -282,8 +291,9 @@ struct lov_mds_md {
 #define OBD_MD_FLEASIZE (0x00020000)    /* extended attribute data */
 #define OBD_MD_LINKNAME (0x00040000)    /* symbolic link target */
 #define OBD_MD_FLHANDLE (0x00080000)    /* file handle */
+#define OBD_MD_FLCKSUM  (0x00100000)    /* bulk data checksum */
 #define OBD_MD_FLNOTOBD (~(OBD_MD_FLOBDFLG | OBD_MD_FLBLOCKS | OBD_MD_LINKNAME|\
-                           OBD_MD_FLEASIZE | OBD_MD_FLHANDLE))
+                           OBD_MD_FLEASIZE | OBD_MD_FLHANDLE | OBD_MD_FLCKSUM))
 
 struct obd_statfs {
         __u64           os_type;
@@ -319,7 +329,7 @@ struct niobuf_remote {
         __u32 len;
         __u32 xid;
         __u32 flags;
-};
+} __attribute__((packed));
 
 /* request structure for OST's */
 
@@ -334,16 +344,19 @@ struct ost_body {
  */
 
 /* opcodes */
-#define MDS_GETATTR      1
-#define MDS_GETATTR_NAME 2
-#define MDS_CLOSE        3
-#define MDS_REINT        4
-#define MDS_READPAGE     6
-#define MDS_CONNECT      7
-#define MDS_DISCONNECT   8
-#define MDS_GETSTATUS    9
-#define MDS_STATFS       10
-#define MDS_GETLOVINFO   11
+#define MDS_GETATTR      33
+#define MDS_GETATTR_NAME 34
+#define MDS_CLOSE        35
+#define MDS_REINT        36
+#define MDS_READPAGE     37
+#define MDS_CONNECT      38
+#define MDS_DISCONNECT   39
+#define MDS_GETSTATUS    40
+#define MDS_STATFS       41
+#define MDS_GETLOVINFO   42
+/*
+ * Do not exceed 63 
+ */
 
 #define REINT_SETATTR  1
 #define REINT_CREATE   2
@@ -360,8 +373,6 @@ struct ost_body {
 #define IT_OPEN_CREATE  (1 << 4)
 #define IT_OPEN_OPEN    (1 << 5)
 
-#define IT_UNLINK (1<<8)
-
 #define REINT_OPCODE_MASK 0xff /* opcodes must fit into this mask */
 #define REINT_REPLAYING 0x1000 /* masked into the opcode to indicate replay */
 
@@ -489,6 +500,8 @@ struct mds_rec_rename {
         __u32           rn_fsuid;
         __u32           rn_fsgid;
         __u32           rn_cap;
+        __u32           rn_suppgid1;
+        __u32           rn_suppgid2;
         struct ll_fid   rn_fid1;
         struct ll_fid   rn_fid2;
 };
index 36cd54f..4fc2581 100644 (file)
 #ifndef __IMPORT_H
 #define __IMPORT_H
 
-#ifdef __KERNEL__
 
 #define IMP_INVALID       1
 #define IMP_REPLAYABLE    2
 
-typedef int (*import_recover_t)(struct obd_import *imp, int phase);
 
+struct obd_import;
+typedef int (*import_recover_t)(struct obd_import *imp, int phase);
 #include <linux/lustre_idl.h>
 
 struct obd_import {
@@ -36,18 +36,15 @@ struct obd_import {
         struct obd_device        *imp_obd;
         int                       imp_flags;
         int                       imp_level;
-        __u64                     imp_last_xid;
-        __u64                     imp_last_bulk_xid;
         __u64                     imp_max_transno;
         __u64                     imp_peer_committed_transno;
 
-        /* Protects flags, level, last_xid, *_list */
+        /* Protects flags, level, *_list */
         spinlock_t                imp_lock;
 };
 
 extern struct obd_import *class_conn2cliimp(struct lustre_handle *);
 extern struct obd_import *class_conn2ldlmimp(struct lustre_handle *);
 
-#endif /* __KERNEL__ */
 
 #endif /* __IMPORT_H */
index 54750c0..41c67ff 100644 (file)
 #endif
 #endif
 
-#ifdef __KERNEL__
-/* l_net.c */
+/* target.c */
 struct ptlrpc_request;
 struct obd_device;
 struct recovd_data;
 struct recovd_obd;
 struct obd_export;
 #include <linux/lustre_ha.h>
+#include <linux/lustre_net.h>
+
 
-int target_handle_connect(struct ptlrpc_request *req);
+int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler);
 int target_handle_disconnect(struct ptlrpc_request *req);
 int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp,
                             struct obd_uuid *cluuid);
+int target_revoke_connection(struct recovd_data *rd, int phase);
+
+#define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 * HZ / 2) /* *waves hands* */
+void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler);
+void target_abort_recovery(void *data);
+int target_queue_recovery_request(struct ptlrpc_request *req,
+                                  struct obd_device *obd);
+int target_queue_final_reply(struct ptlrpc_request *req, int rc);
+
+/* client.c */
 int client_obd_connect(struct lustre_handle *conn, struct obd_device *obd,
                        struct obd_uuid *cluuid, struct recovd_obd *recovd,
                        ptlrpc_recovery_cb_t recover);
 int client_obd_disconnect(struct lustre_handle *conn);
 int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf);
+int client_sanobd_setup(struct obd_device *obddev, obd_count len, void *buf);
 int client_obd_cleanup(struct obd_device * obddev);
 struct client_obd *client_conn2cli(struct lustre_handle *conn);
 struct obd_device *client_tgtuuid2obd(struct obd_uuid *tgtuuid);
 
-int target_revoke_connection(struct recovd_data *rd, int phase);
-
+/* statfs_pack.c */
 int obd_self_statfs(struct obd_device *dev, struct statfs *sfs);
 
 /* l_lock.c */
@@ -113,6 +124,8 @@ int lustre_fread(struct file *file, char *str, int len, loff_t *off);
 int lustre_fwrite(struct file *file, const char *str, int len, loff_t *off);
 int lustre_fsync(struct file *file);
 
+#ifdef __KERNEL__
+
 static inline void l_dput(struct dentry *de)
 {
         if (!de || IS_ERR(de))
@@ -148,7 +161,7 @@ static inline void ldlm_object2handle(void *object, struct lustre_handle *handle
 /*
  *   OBD IOCTLS
  */
-#define OBD_IOCTL_VERSION 0x00010001
+#define OBD_IOCTL_VERSION 0x00010002
 
 struct obd_ioctl_data {
         uint32_t ioc_len;
@@ -165,7 +178,10 @@ struct obd_ioctl_data {
         obd_size         ioc_count;
         obd_off          ioc_offset;
         uint32_t         ioc_dev;
-        uint32_t         ____padding;
+        uint32_t         ioc_command;
+
+        uint64_t ioc_nid;
+        uint32_t ioc_nal;
 
         /* buffers the kernel will treat as user pointers */
         uint32_t ioc_plen1;
@@ -260,8 +276,9 @@ static inline int obd_ioctl_is_invalid(struct obd_ioctl_data *data)
                 printk("OBD ioctl: plen2 set but NULL pointer\n");
                 return 1;
         }
-        if (obd_ioctl_packlen(data) != data->ioc_len ) {
-                printk("OBD ioctl: packlen exceeds ioc_len\n");
+        if (obd_ioctl_packlen(data) != data->ioc_len) {
+                printk("OBD ioctl: packlen exceeds ioc_len (%d != %d)\n",
+                       obd_ioctl_packlen(data), data->ioc_len);
                 return 1;
         }
 #if 0
@@ -344,7 +361,7 @@ static inline int obd_ioctl_unpack(struct obd_ioctl_data *data, char *pbuf,
 
         return 0;
 }
-#else
+#endif
 
 #include <linux/obd_support.h>
 
@@ -415,7 +432,6 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg)
         EXIT;
         return 0;
 }
-#endif
 
 #define OBD_IOC_CREATE                 _IOR ('f', 101, long)
 #define OBD_IOC_SETUP                  _IOW ('f', 102, long)
@@ -464,12 +480,34 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg)
 
 #define OBD_GET_VERSION                _IOWR ('f', 144, long)
 
+#define OBD_IOC_ADD_UUID               _IOWR ('f', 145, long)
+#define OBD_IOC_DEL_UUID               _IOWR ('f', 146, long)
+#define OBD_IOC_CLOSE_UUID             _IOWR ('f', 147, long)
+
 #define ECHO_IOC_GET_STRIPE            _IOWR('f', 200, long)
 #define ECHO_IOC_SET_STRIPE            _IOWR('f', 201, long)
 #define ECHO_IOC_ENQUEUE               _IOWR('f', 202, long)
 #define ECHO_IOC_CANCEL                _IOWR('f', 203, long)
 
 
+#define CHECKSUM_BULK 0
+
+#if CHECKSUM_BULK
+/* Fold the bytes of a buffer into a running checksum.
+ *
+ * cksum: running checksum, updated in place
+ * addr:  start of the buffer
+ * len:   number of bytes to sum; nothing is read when len <= 0
+ */
+static inline void ost_checksum(__u64 *cksum, void *addr, int len)
+{
+        unsigned char *bytes = (unsigned char *)addr;
+        __u64          total = 0;
+        int            i;
+
+        /* byte-at-a-time on purpose: the sum is independent of byte order */
+        for (i = 0; i < len; i++)
+                total += bytes[i];
+
+        *cksum = (*cksum << 2) + total;
+}
+#else
+#define ost_checksum(cksum, addr, len) do {} while (0)
+#endif
+
 /*
  * l_wait_event is a flexible sleeping function, permitting simple caller
  * configuration of interrupt and timeout sensitivity along with actions to
@@ -537,11 +575,17 @@ struct l_wait_info {
         lwi_cb_data:    data                                                   \
 })
 
+/* l_sigismember: signal-set membership test usable in and out of the kernel.
+ * In the kernel it is simply sigismember().  Outside the kernel it ANDs the
+ * word at a with b.
+ * NOTE(review): the userspace form treats b as a bit mask, not as a signal
+ * number the way sigismember() does -- confirm this is what the userspace
+ * build expects. */
+#ifdef __KERNEL__
+#define l_sigismember sigismember
+#else
+/* b is parenthesized so that an expression argument is masked as a whole */
+#define l_sigismember(a, b) (*(a) & (b))
+#endif
+
 /* XXX this should be one mask-check */
 #define l_killable_pending(task)                                               \
-(sigismember(&(task->pending.signal), SIGKILL) ||                              \
- sigismember(&(task->pending.signal), SIGINT) ||                               \
- sigismember(&(task->pending.signal), SIGTERM))
+(l_sigismember(&(task->pending.signal), SIGKILL) ||                              \
l_sigismember(&(task->pending.signal), SIGINT) ||                               \
l_sigismember(&(task->pending.signal), SIGTERM))
 
 #define __l_wait_event(wq, condition, info, ret)                               \
 do {                                                                           \
index 0c56fcd..0c29c79 100644 (file)
 #include <linux/lustre_mds.h>
 #include <linux/lustre_ha.h>
 
+
 extern kmem_cache_t *ll_file_data_slab;
 struct ll_file_data {
         struct lustre_handle fd_mdshandle;
-        struct lustre_handle fd_osthandle;
         struct ptlrpc_request *fd_req;
+        char fd_ostdata[FD_OSTDATA_SIZE];
         __u32 fd_flags;
 };
 
@@ -51,12 +52,10 @@ struct ll_read_inode2_cookie {
         struct lov_mds_md *lic_lmm;
 };
 
-#define LL_INLINESZ      60
 struct ll_inode_info {
         struct lov_stripe_md *lli_smd;
         char                 *lli_symlink_name;
         struct semaphore      lli_open_sem;
-        atomic_t              lli_open_count; /* see ll_file_release */
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
         struct inode          lli_vfs_inode;
 #endif
@@ -245,13 +244,13 @@ extern struct file_operations ll_file_operations;
 extern struct inode_operations ll_file_inode_operations;
 extern struct inode_operations ll_special_inode_operations;
 struct ldlm_lock;
-int ll_lock_callback(struct ldlm_lock *, struct ldlm_lock_desc *, void *data, int flag);
+int ll_lock_callback(struct ldlm_lock *, struct ldlm_lock_desc *, void *data,
+                     int flag);
 int ll_size_lock(struct inode *, struct lov_stripe_md *, obd_off start,
                  int mode, struct lustre_handle *);
 int ll_size_unlock(struct inode *, struct lov_stripe_md *, int mode,
                    struct lustre_handle *);
-int ll_file_size(struct inode *inode, struct lov_stripe_md *md,
-                 struct lustre_handle *);
+int ll_file_size(struct inode *inode, struct lov_stripe_md *md, char *ostdata);
 int ll_create_objects(struct super_block *sb, obd_id id, uid_t uid,
                       gid_t gid, struct lov_stripe_md **lsmp);
 
@@ -262,6 +261,7 @@ void ll_truncate(struct inode *inode);
 
 /* super.c */
 void ll_update_inode(struct inode *, struct mds_body *, struct lov_mds_md *);
+int ll_setattr_raw(struct inode *inode, struct iattr *attr);
 
 /* symlink.c */
 extern struct inode_operations ll_fast_symlink_inode_operations;
index 133f7af..0a881b1 100644 (file)
@@ -27,8 +27,8 @@
 #define _LUSTRE_MDS_H
 
 #ifdef __KERNEL__
-
 #include <linux/fs.h>
+#endif
 #include <linux/kp30.h>
 #include <linux/lustre_idl.h>
 
@@ -45,11 +45,12 @@ struct ll_file_data;
 #define LUSTRE_MDT_NAME "mdt"
 #define LUSTRE_MDC_NAME "mdc"
 
-struct mdc_rpc_lock { 
+struct mdc_rpc_lock {
         struct semaphore rpcl_sem;
         struct lookup_intent *rpcl_it;
 };
 extern struct mdc_rpc_lock mdc_rpc_lock;
+extern struct mdc_rpc_lock mdc_setattr_lock;
 
 static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck)
 {
@@ -108,7 +109,8 @@ struct mds_update_record {
         __u32 ur_gid;
         __u64 ur_time;
         __u32 ur_flags;
-        __u32 ur_suppgid;
+        __u32 ur_suppgid1;
+        __u32 ur_suppgid2;
 };
 
 #define MDS_LR_CLIENT  8192
@@ -117,13 +119,12 @@ struct mds_update_record {
 #define MDS_CLIENT_SLOTS 17
 
 #define MDS_MOUNT_RECOV 2
-#define MDS_RECOVERY_TIMEOUT (obd_timeout * 5 * HZ / 2) /* *waves hands* */
 
 /* Data stored per server at the head of the last_rcvd file.  In le32 order. */
 struct mds_server_data {
         __u8 msd_uuid[37];      /* server UUID */
         __u8 uuid_padding[3];   /* unused */
-        __u64 msd_last_rcvd;    /* last completed transaction ID */
+        __u64 msd_last_transno; /* last completed transaction ID */
         __u64 msd_mount_count;  /* MDS incarnation number */
         __u8 padding[512 - 56];
 };
@@ -132,10 +133,12 @@ struct mds_server_data {
 struct mds_client_data {
         __u8 mcd_uuid[37];      /* client UUID */
         __u8 uuid_padding[3];   /* unused */
-        __u64 mcd_last_rcvd;    /* last completed transaction ID */
         __u64 mcd_mount_count;  /* MDS incarnation number */
-        __u64 mcd_last_xid;     /* client RPC xid for the last transaction */
-        __u8 padding[MDS_LR_SIZE - 64];
+        __u64 mcd_last_transno; /* last completed transaction ID */
+        __u64 mcd_last_xid;     /* xid for the last transaction */
+        __u32 mcd_last_result;  /* result from last RPC */
+        __u32 mcd_last_data;    /* per-op data (disposition for open &c.) */
+        __u8 padding[MDS_LR_SIZE - 58];
 };
 
 /* In-memory access to client data from MDS struct */
@@ -144,15 +147,14 @@ struct mds_export_data {
         spinlock_t              med_open_lock;
         struct mds_client_data *med_mcd;
         int                     med_off;
-        __u64                   med_last_xid;
-        struct lustre_msg      *med_last_reply;
-        int                     med_last_replen;
+        struct ptlrpc_request  *med_outstanding_reply;
 };
 
 /* file data for open files on MDS */
 struct mds_file_data {
         struct list_head     mfd_list;
         __u64                mfd_servercookie;
+        __u64                mfd_xid;
         struct file         *mfd_file;
 };
 
@@ -212,6 +214,8 @@ struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid,
 int mds_reint(struct ptlrpc_request *req, int offset, struct lustre_handle *);
 int mds_pack_md(struct obd_device *mds, struct lustre_msg *msg,
                 int offset, struct mds_body *body, struct inode *inode);
+void mds_steal_ack_locks(struct mds_export_data *med,
+                         struct ptlrpc_request *req);
 
 /* mds/mds_fs.c */
 int mds_fs_setup(struct obd_device *obddev, struct vfsmount *mnt);
@@ -268,7 +272,6 @@ int mds_client_add(struct mds_obd *mds, struct mds_export_data *med,
                    int cl_off);
 int mds_client_free(struct obd_export *exp);
 
-#endif /* __KERNEL__ */
 
 /* ioctls for trying requests */
 #define IOC_REQUEST_TYPE                   'f'
@@ -282,4 +285,18 @@ int mds_client_free(struct obd_export *exp);
 #define IOC_REQUEST_CLOSE               _IOWR('f', 35, long)
 #define IOC_REQUEST_MAX_NR               35
 
+/* MDS_CHECK_RESENT(req, reconstruct): handle a request flagged MSG_RESENT.
+ *
+ * When the flag is set on req->rq_reqmsg, the client's mds_client_data is
+ * taken from the request's export.  If its mcd_last_xid equals req->rq_xid,
+ * the `reconstruct` statement is executed and the *enclosing function*
+ * leaves through RETURN(0).  Otherwise a D_HA debug message is logged and
+ * execution falls through.
+ *
+ * Caveats: this expands to a bare { } block rather than do { } while (0),
+ * evaluates req more than once, and declares a local named mcd.
+ */
+#define MDS_CHECK_RESENT(req, reconstruct)                                     \
+{                                                                              \
+        if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) {               \
+                struct mds_client_data *mcd =                                  \
+                        req->rq_export->exp_mds_data.med_mcd;                  \
+                if (mcd->mcd_last_xid == req->rq_xid) {                        \
+                        reconstruct;                                           \
+                        RETURN(0);                                             \
+                }                                                              \
+                DEBUG_REQ(D_HA, req, "no reply for RESENT req (have "LPD64")", \
+                          mcd->mcd_last_xid);                                  \
+        }                                                                      \
+}
+
 #endif
index e2c9db3..8c50212 100644 (file)
 #ifndef _LUSTRE_NET_H
 #define _LUSTRE_NET_H
 
+#ifdef __KERNEL__
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 #include <linux/tqueue.h>
+#else
+#include <linux/workqueue.h>
+#endif
+#endif 
+
 #include <linux/kp30.h>
 // #include <linux/obd.h>
 #include <portals/p30.h>
 
 #define CONN_INVALID 1
 
+/* A peer as seen by ptlrpc: a portals NID paired with a pointer to a
+ * struct ptlrpc_ni.  Takes the place of struct lustre_peer in
+ * ptlrpc_connection.c_peer, ptlrpc_request.rq_peer and
+ * ptlrpc_get_connection().
+ * NOTE(review): peer_ni presumably points into ptlrpc_interfaces[] --
+ * confirm in rpc/events.c. */
+struct ptlrpc_peer {
+        ptl_nid_t         peer_nid;
+        struct ptlrpc_ni *peer_ni;
+};
+
 struct ptlrpc_connection {
         struct list_head        c_link;
-        struct lustre_peer      c_peer;
+        struct ptlrpc_peer      c_peer;
         struct obd_uuid         c_local_uuid;  /* XXX do we need this? */
         struct obd_uuid         c_remote_uuid;
 
@@ -134,19 +147,21 @@ struct ptlrpc_client {
 };
 
 /* state flags of requests */
-#define PTL_RPC_FL_INTR      (1 << 0)
+#define PTL_RPC_FL_INTR      (1 << 0)  /* reply wait was interrupted by user */
 #define PTL_RPC_FL_REPLIED   (1 << 1)  /* reply was received */
-#define PTL_RPC_FL_SENT      (1 << 2)
-#define PTL_BULK_FL_SENT     (1 << 3)
-#define PTL_BULK_FL_RCVD     (1 << 4)
-#define PTL_RPC_FL_ERR       (1 << 5)
-#define PTL_RPC_FL_TIMEOUT   (1 << 6)
-#define PTL_RPC_FL_RESEND    (1 << 7)
-#define PTL_RPC_FL_RESTART   (1 << 8)  /* operation must be restarted */
-#define PTL_RPC_FL_FINISHED  (1 << 9)
+#define PTL_RPC_FL_SENT      (1 << 2)  /* request was sent */
+#define PTL_RPC_FL_WANT_ACK  (1 << 3)  /* reply is awaiting an ACK */
+#define PTL_BULK_FL_SENT     (1 << 4)  /* outgoing bulk was sent */
+#define PTL_BULK_FL_RCVD     (1 << 5)  /* incoming bulk was received */
+#define PTL_RPC_FL_ERR       (1 << 6)  /* request failed due to RPC error */
+#define PTL_RPC_FL_TIMEOUT   (1 << 7)  /* request timed out waiting for reply */
+#define PTL_RPC_FL_RESEND    (1 << 8)  /* retransmit the request */
+#define PTL_RPC_FL_RESTART   (1 << 9)  /* operation must be restarted */
 #define PTL_RPC_FL_RETAIN    (1 << 10) /* retain for replay after reply */
 #define PTL_RPC_FL_REPLAY    (1 << 11) /* replay upon recovery */
 #define PTL_RPC_FL_ALLOCREP  (1 << 12) /* reply buffer allocated */
+#define PTL_RPC_FL_NO_RESEND (1 << 13) /* don't automatically resend this req */
+#define PTL_RPC_FL_RESENT    (1 << 14) /* server rcvd resend of this req */
 
 struct ptlrpc_request {
         int rq_type; /* one of PTL_RPC_MSG_* */
@@ -168,8 +183,7 @@ struct ptlrpc_request {
         __u64 rq_xid;
 
         int rq_level;
-        //        void * rq_reply_handle;
-        wait_queue_head_t rq_wait_for_rep;
+        wait_queue_head_t rq_wait_for_rep; /* XXX also _for_ack */
 
         /* incoming reply */
         ptl_md_t rq_reply_md;
@@ -178,7 +192,7 @@ struct ptlrpc_request {
         /* outgoing req/rep */
         ptl_md_t rq_req_md;
 
-        struct lustre_peer rq_peer; /* XXX see service.c can this be factored away? */
+        struct ptlrpc_peer rq_peer; /* XXX see service.c can this be factored away? */
         struct obd_export *rq_export;
         struct ptlrpc_connection *rq_connection;
         struct obd_import *rq_import;
@@ -186,6 +200,12 @@ struct ptlrpc_request {
 
         void (*rq_replay_cb)(struct ptlrpc_request *);
         void  *rq_replay_data;
+
+        /* Only used on the server side for tracking acks. */
+        struct ptlrpc_req_ack_lock {
+                struct lustre_handle lock;
+                __u32                mode;
+        } rq_ack_locks[4];
 };
 
 #define DEBUG_REQ(level, req, fmt, args...)                                    \
@@ -259,41 +279,57 @@ struct ptlrpc_thread {
 
 struct ptlrpc_request_buffer_desc {
         struct list_head       rqbd_list;
-        struct ptlrpc_service *rqbd_service;
+        struct ptlrpc_srv_ni  *rqbd_srv_ni;
         ptl_handle_me_t        rqbd_me_h;
         atomic_t               rqbd_refcount;
         char                  *rqbd_buffer;
 };
 
+/* State kept per network interface: a name, the portals NI handle, and a
+ * set of event-queue handles named after the traffic they serve (request
+ * out, reply in/out, bulk put/get source/sink).  Instances are exported as
+ * the ptlrpc_interfaces[] array.
+ * NOTE(review): which callback is bound to each event queue is not visible
+ * here -- see rpc/events.c. */
+struct ptlrpc_ni {
+        /* Generic interface state */
+        char                   *pni_name;
+        ptl_handle_ni_t         pni_ni_h;
+        ptl_handle_eq_t         pni_request_out_eq_h;
+        ptl_handle_eq_t         pni_reply_in_eq_h;
+        ptl_handle_eq_t         pni_reply_out_eq_h;
+        ptl_handle_eq_t         pni_bulk_put_source_eq_h;
+        ptl_handle_eq_t         pni_bulk_put_sink_eq_h;
+        ptl_handle_eq_t         pni_bulk_get_source_eq_h;
+        ptl_handle_eq_t         pni_bulk_get_sink_eq_h;
+};
+
+struct ptlrpc_srv_ni {
+        /* Interface-specific service state */
+        struct ptlrpc_service  *sni_service;    /* owning service */
+        struct ptlrpc_ni       *sni_ni;         /* network interface */
+        ptl_handle_eq_t         sni_eq_h;       /* event queue handle */
+        struct list_head        sni_rqbds;      /* all the request buffer descriptors */
+        __u32                   sni_nrqbds;     /* # request buffers */
+        atomic_t                sni_nrqbds_receiving; /* # request buffers posted */
+};
+
 struct ptlrpc_service {
         time_t srv_time;
         time_t srv_timeout;
 
-        /* incoming request buffers */
-        /* FIXME: perhaps a list of EQs, if multiple NIs are used? */
-
+        struct list_head srv_ni_list;          /* list of interfaces */
         __u32            srv_max_req_size;     /* biggest request to receive */
         __u32            srv_buf_size;         /* # bytes in a request buffer */
-        struct list_head srv_rqbds;            /* all the request buffer descriptors */
-        __u32            srv_nrqbds;           /* # request buffers */
-        atomic_t         srv_nrqbds_receiving; /* # request buffers posted for input */
 
         __u32 srv_req_portal;
         __u32 srv_rep_portal;
 
         __u32 srv_xid;
 
-        /* event queue */
-        ptl_handle_eq_t srv_eq_h;
-
-        struct lustre_peer srv_self;
-
         wait_queue_head_t srv_waitq; /* all threads sleep on this */
 
         spinlock_t srv_lock;
         struct list_head srv_threads;
         int (*srv_handler)(struct ptlrpc_request *req);
         char *srv_name;  /* only statically allocated strings here; we don't clean them */
+
+        int                  srv_interface_rover;
+        struct ptlrpc_srv_ni srv_interfaces[0];
 };
 
 static inline void ptlrpc_hdl2req(struct ptlrpc_request *req,
@@ -307,9 +343,14 @@ typedef void (*bulk_callback_t)(struct ptlrpc_bulk_desc *, void *);
 
 typedef int (*svc_handler_t)(struct ptlrpc_request *req);
 
+/* rpc/events.c */
+extern struct ptlrpc_ni ptlrpc_interfaces[];
+extern int              ptlrpc_ninterfaces;
+extern int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, struct ptlrpc_peer *peer);
+
 /* rpc/connection.c */
 void ptlrpc_readdress_connection(struct ptlrpc_connection *, struct obd_uuid *uuid);
-struct ptlrpc_connection *ptlrpc_get_connection(struct lustre_peer *peer,
+struct ptlrpc_connection *ptlrpc_get_connection(struct ptlrpc_peer *peer,
                                                 struct obd_uuid *uuid);
 int ptlrpc_put_connection(struct ptlrpc_connection *c);
 struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *);
@@ -326,6 +367,7 @@ int ptlrpc_register_bulk_get(struct ptlrpc_bulk_desc *);
 int ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *bulk);
 struct obd_brw_set *obd_brw_set_new(void);
 void obd_brw_set_add(struct obd_brw_set *, struct ptlrpc_bulk_desc *);
+void obd_brw_set_del(struct ptlrpc_bulk_desc *);
 void obd_brw_set_free(struct obd_brw_set *);
 
 int ptlrpc_reply(struct ptlrpc_service *svc, struct ptlrpc_request *req);
@@ -346,6 +388,7 @@ int ll_brw_sync_wait(struct obd_brw_set *, int phase);
 int ptlrpc_queue_wait(struct ptlrpc_request *req);
 void ptlrpc_continue_req(struct ptlrpc_request *req);
 int ptlrpc_replay_req(struct ptlrpc_request *req);
+int ptlrpc_abort(struct ptlrpc_request *req);
 void ptlrpc_restart_req(struct ptlrpc_request *req);
 void ptlrpc_abort_inflight(struct obd_import *imp, int dying_import);
 
@@ -364,8 +407,7 @@ void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
 /* rpc/service.c */
 struct ptlrpc_service *
 ptlrpc_init_svc(__u32 nevents, __u32 nbufs, __u32 bufsize, __u32 max_req_size,
-                int req_portal, int rep_portal,
-                struct obd_uuid *uuid, svc_handler_t, char *name);
+                int req_portal, int rep_portal, svc_handler_t, char *name);
 void ptlrpc_stop_all_threads(struct ptlrpc_service *svc);
 int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
                         char *name);
@@ -385,6 +427,9 @@ int lustre_msg_size(int count, int *lengths);
 int lustre_unpack_msg(struct lustre_msg *m, int len);
 void *lustre_msg_buf(struct lustre_msg *m, int n);
 
+/* rpc/rpc.c */
+__u32 ptlrpc_next_xid(void);
+
 static inline void ptlrpc_bulk_decref(struct ptlrpc_bulk_desc *desc)
 {
         CDEBUG(D_PAGE, "%p -> %d\n", desc, atomic_read(&desc->bd_refcount) - 1);
index acc59c2..f3163fe 100644 (file)
@@ -38,6 +38,8 @@ struct lov_stripe_md {
 #ifdef __KERNEL__
 # include <linux/fs.h>
 # include <linux/list.h>
+# include <linux/sched.h> /* for struct task_struct, for current.h */
+# include <asm/current.h> /* for smp_lock.h */
 # include <linux/smp_lock.h>
 # include <linux/proc_fs.h>
 
@@ -45,6 +47,7 @@ struct lov_stripe_md {
 # include <linux/lustre_idl.h>
 # include <linux/lustre_mds.h>
 # include <linux/lustre_export.h>
+#endif
 
 struct obd_type {
         struct list_head typ_chain;
@@ -72,7 +75,8 @@ struct obd_ucred {
         __u32 ouc_fsuid;
         __u32 ouc_fsgid;
         __u32 ouc_cap;
-        __u32 ouc_suppgid;
+        __u32 ouc_suppgid1;
+        __u32 ouc_suppgid2;
 };
 
 #define OBD_RUN_CTXT_MAGIC      0xC0FFEEAA
@@ -98,6 +102,12 @@ struct obd_run_ctxt {
 
 struct ost_server_data;
 
+#define FILTER_TRANSNO_SEM
+
+#ifndef OST_RECOVERY
+#undef FILTER_TRANSNO_SEM
+#endif
+
 struct filter_obd {
         char *fo_fstype;
         struct super_block *fo_sb;
@@ -105,18 +115,23 @@ struct filter_obd {
         struct obd_run_ctxt fo_ctxt;
         struct dentry *fo_dentry_O;
         struct dentry *fo_dentry_O_mode[16];
+        struct dentry **fo_dentry_O_sub;
         spinlock_t fo_objidlock;        /* protects fo_lastobjid increment */
+#ifdef FILTER_TRANSNO_SEM
         struct semaphore fo_transno_sem;
+#else
+        spinlock_t fo_translock;        /* protects fsd_last_rcvd increment */
+#endif
         struct file *fo_rcvd_filp;
         struct filter_server_data *fo_fsd;
+        unsigned long *fo_last_rcvd_slots;
 
-        __u64 fo_next_recovery_transno;
-        int   fo_recoverable_clients;
         struct file_operations *fo_fop;
         struct inode_operations *fo_iop;
         struct address_space_operations *fo_aops;
         struct list_head fo_export_list;
         spinlock_t fo_fddlock;          /* protects setting dentry->d_fsdata */
+        int fo_subdir_count;
 };
 
 struct mds_server_data;
@@ -130,11 +145,13 @@ struct client_obd {
          * call obd_size_wiremd() all the time. */
         int                  cl_max_mds_easize;
         struct obd_device   *cl_containing_lov;
+        kdev_t               cl_sandev;
 };
 
 struct mds_obd {
         struct ptlrpc_service           *mds_service;
-        struct ptlrpc_service           *mds_getattr_service;
+        struct ptlrpc_service           *mds_setattr_service;
+        struct ptlrpc_service           *mds_readpage_service;
 
         struct super_block              *mds_sb;
         struct vfsmount                 *mds_vfsmnt;
@@ -145,21 +162,12 @@ struct mds_obd {
 
         int                              mds_max_mdsize;
         struct file                     *mds_rcvd_filp;
-        struct semaphore                 mds_transno_sem;
-        __u64                            mds_last_rcvd;
+        spinlock_t                       mds_transno_lock;
+        __u64                            mds_last_transno;
         __u64                            mds_mount_count;
         struct ll_fid                    mds_rootfid;
         struct mds_server_data          *mds_server_data;
 
-        wait_queue_head_t                mds_next_transno_waitq;
-        __u64                            mds_next_recovery_transno;
-        int                              mds_recoverable_clients;
-        struct list_head                 mds_recovery_queue;
-        struct list_head                 mds_delayed_reply_queue;
-        spinlock_t                       mds_processing_task_lock;
-        pid_t                            mds_processing_task;
-        struct timer_list                mds_recovery_timer;
-        
         int                              mds_has_lov_desc;
         struct lov_desc                  mds_lov_desc;
 };
@@ -267,12 +275,13 @@ struct niobuf_local {
         struct dentry *dentry;
 };
 
+/* Don't conflict with on-wire flags OBD_BRW_WRITE, etc */
+#define N_LOCAL_TEMP_PAGE 0x10000000
+
 struct obd_trans_info {
         __u64     oti_transno;
 };
 
-#define N_LOCAL_TEMP_PAGE 0x00000001
-
 /* corresponds to one of the obd's */
 struct obd_device {
         struct obd_type *obd_type;
@@ -292,6 +301,18 @@ struct obd_device {
         spinlock_t obd_dev_lock;
         __u64                  obd_last_committed;
         struct fsfilt_operations *obd_fsops;
+
+        /* XXX encapsulate all this recovery data into one struct */
+        svc_handler_t                    obd_recovery_handler;
+        int                              obd_recoverable_clients;
+        spinlock_t                       obd_processing_task_lock;
+        pid_t                            obd_processing_task;
+        __u64                            obd_next_recovery_transno;
+        wait_queue_head_t                obd_next_transno_waitq;
+        struct timer_list                obd_recovery_timer;
+        struct list_head                 obd_recovery_queue;
+        struct list_head                 obd_delayed_reply_queue;
+
         union {
                 struct ext2_obd ext2;
                 struct filter_obd filter;
@@ -334,6 +355,7 @@ struct obd_ops {
 
 
         int (*o_statfs)(struct lustre_handle *conn, struct obd_statfs *osfs);
+        int (*o_syncfs)(struct lustre_handle *conn);
         int (*o_packmd)(struct lustre_handle *, struct lov_mds_md **wire_tgt,
                         struct lov_stripe_md *mem_src);
         int (*o_unpackmd)(struct lustre_handle *,
@@ -355,7 +377,7 @@ struct obd_ops {
                        struct lov_stripe_md *ea, struct obd_trans_info *oti);
         int (*o_brw)(int rw, struct lustre_handle *conn,
                      struct lov_stripe_md *ea, obd_count oa_bufs,
-                     struct brw_page *pgarr, struct obd_brw_set *, 
+                     struct brw_page *pgarr, struct obd_brw_set *,
                      struct obd_trans_info *oti);
         int (*o_punch)(struct lustre_handle *conn, struct obdo *tgt,
                        struct lov_stripe_md *ea, obd_size count,
@@ -388,6 +410,8 @@ struct obd_ops {
                         __u32 mode, struct lustre_handle *);
         int (*o_cancel_unused)(struct lustre_handle *, struct lov_stripe_md *,
                                int local_only);
+        int (*o_san_preprw)(int cmd, struct lustre_handle *conn,
+                            int objcount, struct obd_ioobj *obj,
+                            int niocount, struct niobuf_remote *remote);
 };
-#endif /* __KERNEL */
 #endif /* __OBD_H */
index 8e160ad..f626bab 100644 (file)
 #include <linux/types.h>
 #include <linux/fs.h>
 #include <linux/time.h>
+#endif 
 
 #include <linux/obd_support.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_net.h>
 #include <linux/obd.h>
 #include <linux/lustre_lib.h>
 #include <linux/lustre_idl.h>
 #include <linux/lustre_mds.h>
 #include <linux/lustre_dlm.h>
 #include <linux/lprocfs_status.h>
-#endif
+
 
 /* OBD Device Declarations */
 #define MAX_OBD_DEVICES 128
@@ -56,8 +59,9 @@ extern struct obd_device obd_dev[MAX_OBD_DEVICES];
 #define OBD_NO_TRANSNO     0x20 /* XXX needs better name */
 
 /* OBD Operations Declarations */
+extern struct obd_device *class_conn2obd(struct lustre_handle *);
+extern struct obd_export *class_conn2export(struct lustre_handle *);
 
-#ifdef __KERNEL__
 static inline int obd_check_conn(struct lustre_handle *conn)
 {
         struct obd_device *obd;
@@ -65,6 +69,7 @@ static inline int obd_check_conn(struct lustre_handle *conn)
                 CERROR("NULL conn\n");
                 RETURN(-ENOTCONN);
         }
+
         obd = class_conn2obd(conn);
         if (!obd) {
                 CERROR("NULL obd\n");
@@ -408,6 +413,19 @@ static inline int obd_statfs(struct lustre_handle *conn,struct obd_statfs *osfs)
         RETURN(rc);
 }
 
+/* Call the syncfs method of the OBD device behind conn and return its
+ * result.
+ * NOTE(review): OBD_CHECK_SETUP and OBD_CHECK_OP are defined outside this
+ * view; presumably they set exp from conn and return early when the device
+ * is not set up or lacks the method -- confirm against their definitions. */
+static inline int obd_syncfs(struct lustre_handle *conn)
+{
+        struct obd_export *exp;
+        int rc;
+        ENTRY;
+
+        OBD_CHECK_SETUP(conn, exp);
+        OBD_CHECK_OP(exp->exp_obd, syncfs);
+
+        rc = OBP(exp->exp_obd, syncfs)(conn);
+        RETURN(rc);
+}
+
 static inline int obd_punch(struct lustre_handle *conn, struct obdo *oa,
                             struct lov_stripe_md *ea, obd_size start,
                             obd_size end, struct obd_trans_info *oti)
@@ -542,7 +560,21 @@ static inline int obd_cancel_unused(struct lustre_handle *conn,
         RETURN(rc);
 }
 
-#endif
+/* Call the san_preprw method of the OBD device behind conn, passing every
+ * argument through unchanged, and return the method's result.
+ * NOTE(review): OBD_CHECK_SETUP and OBD_CHECK_OP are defined outside this
+ * view; presumably they set exp from conn and return early on failure. */
+static inline int obd_san_preprw(int cmd, struct lustre_handle *conn,
+                                 int objcount, struct obd_ioobj *obj,
+                                 int niocount, struct niobuf_remote *remote)
+{
+        struct obd_export *exp;
+        int rc;
+        ENTRY;  /* pairs with RETURN below, as in obd_syncfs */
+
+        OBD_CHECK_SETUP(conn, exp);
+        /* check the method that is actually invoked below, not preprw */
+        OBD_CHECK_OP(exp->exp_obd, san_preprw);
+
+        rc = OBP(exp->exp_obd, san_preprw)(cmd, conn, objcount, obj,
+                                           niocount, remote);
+        RETURN(rc);
+}
+
 
 /* OBD Metadata Support */
 
@@ -554,24 +586,6 @@ static inline struct lustre_handle *obdo_handle(struct obdo *oa)
         return (struct lustre_handle *)&oa->o_inline;
 }
 
-static inline void obd_oa2handle(struct lustre_handle *handle, struct obdo *oa)
-{
-        if (oa->o_valid |= OBD_MD_FLHANDLE) {
-                struct lustre_handle *oa_handle = obdo_handle(oa);
-                memcpy(handle, oa_handle, sizeof(*handle));
-        }
-}
-
-static inline void obd_handle2oa(struct obdo *oa, struct lustre_handle *handle)
-{
-        if (handle && handle->addr) {
-                struct lustre_handle *oa_handle = obdo_handle(oa);
-                memcpy(oa_handle, handle, sizeof(*handle));
-                oa->o_valid |= OBD_MD_FLHANDLE;
-        }
-}
-
-#ifdef __KERNEL__
 /* support routines */
 extern kmem_cache_t *obdo_cachep;
 static inline struct obdo *obdo_alloc(void)
@@ -593,10 +607,12 @@ static inline void obdo_free(struct obdo *oa)
         kmem_cache_free(obdo_cachep, oa);
 }
 
+#ifdef __KERNEL__
 static inline void obdo_from_iattr(struct obdo *oa, struct iattr *attr)
 {
         unsigned int ia_valid = attr->ia_valid;
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         if (ia_valid & ATTR_ATIME) {
                 oa->o_atime = attr->ia_atime;
                 oa->o_valid |= OBD_MD_FLATIME;
@@ -609,6 +625,21 @@ static inline void obdo_from_iattr(struct obdo *oa, struct iattr *attr)
                 oa->o_ctime = attr->ia_ctime;
                 oa->o_valid |= OBD_MD_FLCTIME;
         }
+#else
+        if (ia_valid & ATTR_ATIME) {
+                oa->o_atime = attr->ia_atime.tv_sec;
+                oa->o_valid |= OBD_MD_FLATIME;
+        }
+        if (ia_valid & ATTR_MTIME) {
+                oa->o_mtime = attr->ia_mtime.tv_sec;
+                oa->o_valid |= OBD_MD_FLMTIME;
+        }
+        if (ia_valid & ATTR_CTIME) {
+                oa->o_ctime = attr->ia_ctime.tv_sec;
+                oa->o_valid |= OBD_MD_FLCTIME;
+        }
+#endif
+
         if (ia_valid & ATTR_SIZE) {
                 oa->o_size = attr->ia_size;
                 oa->o_valid |= OBD_MD_FLSIZE;
@@ -634,6 +665,7 @@ static inline void iattr_from_obdo(struct iattr *attr, struct obdo *oa,
                                    obd_flag valid)
 {
         memset(attr, 0, sizeof(*attr));
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         if (valid & OBD_MD_FLATIME) {
                 attr->ia_atime = oa->o_atime;
                 attr->ia_valid |= ATTR_ATIME;
@@ -646,6 +678,20 @@ static inline void iattr_from_obdo(struct iattr *attr, struct obdo *oa,
                 attr->ia_ctime = oa->o_ctime;
                 attr->ia_valid |= ATTR_CTIME;
         }
+#else
+        if (valid & OBD_MD_FLATIME) {
+                attr->ia_atime.tv_sec = oa->o_atime;
+                attr->ia_valid |= ATTR_ATIME;
+        }
+        if (valid & OBD_MD_FLMTIME) {
+                attr->ia_mtime.tv_sec = oa->o_mtime;
+                attr->ia_valid |= ATTR_MTIME;
+        }
+        if (valid & OBD_MD_FLCTIME) {
+                attr->ia_ctime.tv_sec = oa->o_ctime;
+                attr->ia_valid |= ATTR_CTIME;
+        }
+#endif
         if (valid & OBD_MD_FLSIZE) {
                 attr->ia_size = oa->o_size;
                 attr->ia_valid |= ATTR_SIZE;
@@ -683,12 +729,21 @@ static inline void iattr_from_obdo(struct iattr *attr, struct obdo *oa,
 static inline void obdo_from_inode(struct obdo *dst, struct inode *src,
                                    obd_flag valid)
 {
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         if (valid & OBD_MD_FLATIME)
                 dst->o_atime = src->i_atime;
         if (valid & OBD_MD_FLMTIME)
                 dst->o_mtime = src->i_mtime;
         if (valid & OBD_MD_FLCTIME)
                 dst->o_ctime = src->i_ctime;
+#else
+        if (valid & OBD_MD_FLATIME)
+                dst->o_atime = src->i_atime.tv_sec;
+        if (valid & OBD_MD_FLMTIME)
+                dst->o_mtime = src->i_mtime.tv_sec;
+        if (valid & OBD_MD_FLCTIME)
+                dst->o_ctime = src->i_ctime.tv_sec;
+#endif
         if (valid & OBD_MD_FLSIZE)
                 dst->o_size = src->i_size;
         if (valid & OBD_MD_FLBLOCKS)   /* allocation of space */
@@ -720,12 +775,21 @@ static inline void obdo_to_inode(struct inode *dst, struct obdo *src,
 {
         valid &= src->o_valid;
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         if (valid & OBD_MD_FLATIME)
                 dst->i_atime = src->o_atime;
         if (valid & OBD_MD_FLMTIME)
                 dst->i_mtime = src->o_mtime;
         if (valid & OBD_MD_FLCTIME && src->o_ctime > dst->i_ctime)
                 dst->i_ctime = src->o_ctime;
+#else
+        if (valid & OBD_MD_FLATIME)
+                dst->i_atime.tv_sec = src->o_atime;
+        if (valid & OBD_MD_FLMTIME)
+                dst->i_mtime.tv_sec = src->o_mtime;
+        if (valid & OBD_MD_FLCTIME && src->o_ctime > dst->i_ctime.tv_sec)
+                dst->i_ctime.tv_sec = src->o_ctime;
+#endif
         if (valid & OBD_MD_FLSIZE)
                 dst->i_size = src->o_size;
         if (valid & OBD_MD_FLBLOCKS) /* allocation of space */
@@ -841,7 +905,6 @@ static inline int obdo_cmp_md(struct obdo *dst, struct obdo *src,
 }
 
 
-#ifdef __KERNEL__
 /* I'm as embarrassed about this as you are.
  *
  * <shaver> // XXX do not look into _superhack with remaining eye
@@ -895,7 +958,6 @@ struct obd_class_user_conn {
         struct lustre_handle   ocuc_conn;
 };
 
-#endif
 
 /* sysctl.c */
 extern void obd_sysctl_init (void);
@@ -905,4 +967,12 @@ extern void obd_sysctl_clean (void);
 typedef __u8 class_uuid_t[16];
 //int class_uuid_parse(struct obd_uuid in, class_uuid_t out);
 void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out);
+
+/* lustre_peer.c    */
+int lustre_uuid_to_peer(char *uuid, struct lustre_peer *peer);
+int class_add_uuid(char *uuid, __u64 nid, __u32 nal);
+int class_del_uuid (char *uuid);
+void class_init_uuidlist(void);
+void class_exit_uuidlist(void);
+
 #endif /* __LINUX_OBD_CLASS_H */
index 16a4d03..26850d8 100644 (file)
@@ -32,6 +32,8 @@
 #define FILTER_LR_CLIENT_START   8192
 #define FILTER_LR_CLIENT_SIZE    128
 
+#define FILTER_SUBDIR_COUNT      32            /* set to zero for no subdirs */
+
 #define FILTER_MOUNT_RECOV 2
 #define FILTER_RECOVERY_TIMEOUT (obd_timeout * 5 * HZ / 2) /* *waves hands* */
 
 struct filter_server_data {
         __u8  fsd_uuid[37];        /* server UUID */
         __u8  fsd_uuid_padding[3]; /* unused */
-        __u64 fsd_last_objid;      /* last completed transaction ID */
+        __u64 fsd_last_objid;      /* last created object ID */
         __u64 fsd_last_rcvd;       /* last completed transaction ID */
         __u64 fsd_mount_count;     /* FILTER incarnation number */
-        __u8  fsd_padding[FILTER_LR_SERVER_SIZE - 64]; /*  */
+        __u32 fsd_feature_compat;  /* compatible feature flags */
+        __u32 fsd_feature_rocompat;/* read-only compatible feature flags */
+        __u32 fsd_feature_incompat;/* incompatible feature flags */
+        __u32 fsd_server_size;     /* size of server data area */
+        __u32 fsd_client_start;    /* start of per-client data area */
+        __u16 fsd_client_size;     /* size of per-client data area */
+        __u16 fsd_subdir_count;    /* number of subdirectories for objects */
+        __u8  fsd_padding[FILTER_LR_SERVER_SIZE - 88];
 };
 
 /* Data stored per client in the last_rcvd file.  In le32 order. */
@@ -52,15 +61,20 @@ struct filter_client_data {
         __u64 fcd_last_rcvd;       /* last completed transaction ID */
         __u64 fcd_mount_count;     /* FILTER incarnation number */
         __u64 fcd_last_xid;        /* client RPC xid for the last transaction */
-        __u8  fcd_padding[FILTER_LR_CLIENT_SIZE - 64]; 
+        __u8  fcd_padding[FILTER_LR_CLIENT_SIZE - 64];
 };
 
+#ifndef OBD_FILTER_SAN_DEVICENAME
+#define OBD_FILTER_SAN_DEVICENAME "sanobdfilter"
+#endif
+
 /* In-memory access to client data from OST struct */
 struct filter_export_data {
         struct list_head  fed_open_head; /* files to close on disconnect */
         spinlock_t        fed_lock;      /* protects fed_open_head */
         struct filter_client_data  *fed_fcd;
-        int               fed_lr_off;
+        loff_t            fed_lr_off;
+        int               fed_lr_idx;
 };
 
 /* file data for open files on OST */
@@ -71,6 +85,7 @@ struct filter_file_data {
 };
 
 struct filter_dentry_data {
+        obd_id           fdd_objid;
         atomic_t         fdd_open_count;
         int              fdd_flags;
 };
index acdac15..ff3e689 100644 (file)
@@ -5,8 +5,6 @@
 #ifndef _OBD_LOV_H__
 #define _OBD_LOV_H__
 
-#ifdef __KERNEL__
-
 #define OBD_LOV_DEVICENAME "lov"
 
 void lov_unpackdesc(struct lov_desc *ld);
@@ -16,7 +14,6 @@ static inline int lov_stripe_md_size(int stripes)
 {
         return sizeof(struct lov_stripe_md) + stripes*sizeof(struct lov_oinfo);
 }
-#endif
 
 static inline int lov_mds_md_size(int stripes)
 {
index 5de0a25..f8d1486 100644 (file)
@@ -31,6 +31,8 @@
 
 #define LUSTRE_OST_NAME "ost"
 #define LUSTRE_OSC_NAME "osc"
+#define LUSTRE_SANOSC_NAME "sanosc"
+#define LUSTRE_SANOST_NAME "sanost"
 
 /* ost/ost_pack.c */
 void ost_pack_niobuf(void **tmp, __u64 offset, __u32 len, __u32 flags,
index 69e4126..a4d676d 100644 (file)
 #ifndef _OBD_SUPPORT
 #define _OBD_SUPPORT
 
+#ifdef __KERNEL__
 #include <linux/config.h>
 #include <linux/autoconf.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#else
+
+#endif
 #include <linux/kp30.h>
 
 /* global variables */
@@ -88,6 +92,7 @@ extern unsigned long obd_sync_filter;
 #define OBD_FAIL_OST_HANDLE_UNPACK       0x20d
 #define OBD_FAIL_OST_BRW_WRITE_BULK      0x20e
 #define OBD_FAIL_OST_BRW_READ_BULK       0x20f
+#define OBD_FAIL_OST_SYNCFS_NET          0x210
 
 #define OBD_FAIL_LDLM                    0x300
 #define OBD_FAIL_LDLM_NAMESPACE_NEW      0x301
@@ -103,6 +108,9 @@ extern unsigned long obd_sync_filter;
 #define OBD_FAIL_OSC_LOCK_BL_AST         0x403
 #define OBD_FAIL_OSC_LOCK_CP_AST         0x404
 
+#define OBD_FAIL_PTLRPC                  0x500
+#define OBD_FAIL_PTLRPC_ACK              0x501
+
 /* preparation for a more advanced failure testbed (not functional yet) */
 #define OBD_FAIL_MASK_SYS    0x0000FF00
 #define OBD_FAIL_MASK_LOC    (0x000000FF | OBD_FAIL_MASK_SYS)
@@ -127,10 +135,12 @@ do {                                                                         \
         }                                                                    \
 } while(0)
 
+#define fixme() CDEBUG(D_OTHER, "FIXME\n");
+
+#ifdef __KERNEL__
 #include <linux/types.h>
 #include <linux/blkdev.h>
 
-#define fixme() CDEBUG(D_OTHER, "FIXME\n");
 
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
 #define ll_bdevname(a) __bdevname((a))
@@ -140,9 +150,11 @@ do {                                                                         \
 #define ll_bdevname(a) bdevname((a))
 #endif
 
+
 static inline void OBD_FAIL_WRITE(int id, kdev_t dev)
 {
         if (OBD_FAIL_CHECK(id)) {
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 #ifdef CONFIG_DEV_RDONLY
                 CERROR("obd_fail_loc=%x, fail write operation on %s\n",
                        id, ll_bdevname(dev));
@@ -151,11 +163,23 @@ static inline void OBD_FAIL_WRITE(int id, kdev_t dev)
                 CERROR("obd_fail_loc=%x, can't fail write operation on %s\n",
                        id, ll_bdevname(dev));
 #endif
+#else
+#ifdef CONFIG_DEV_RDONLY
+                CERROR("obd_fail_loc=%x, fail write operation on %s\n",
+                       id, ll_bdevname(dev.value));
+                dev_set_rdonly(dev, 2);
+#else
+                CERROR("obd_fail_loc=%x, can't fail write operation on %s\n",
+                       id, ll_bdevname(dev.value));
+#endif
+#endif
                 /* We set FAIL_ONCE because we never "un-fail" a device */
                 obd_fail_loc |= OBD_FAILED | OBD_FAIL_ONCE;
         }
 }
 
+#endif  /* __KERNEL__ */
+
 #define OBD_ALLOC(ptr, size)                                            \
 do {                                                                    \
         void *lptr;                                                     \
@@ -177,9 +201,9 @@ do {                                                                    \
 } while (0)
 
 #ifdef CONFIG_DEBUG_SLAB
-#define POISON(lptr, s) do {} while (0)
+#define POISON(lptr, c, s) do {} while (0)
 #else
-#define POISON(lptr, s) memset(lptr, 0x5a, s)
+#define POISON(lptr, c, s) memset(lptr, c, s)
 #endif
 
 #define OBD_FREE(ptr, size)                                             \
@@ -187,7 +211,7 @@ do {                                                                    \
         void *lptr = (ptr);                                             \
         int s = (size);                                                 \
         LASSERT(lptr);                                                  \
-        POISON(lptr, s);                                                \
+        POISON(lptr, 0x5a, s);                                          \
         kfree(lptr);                                                    \
         atomic_sub(s, &obd_memory);                                     \
         CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %d).\n",     \
index 00eba97..8df0d82 100644 (file)
@@ -1,6 +1,14 @@
---- linux-chaos/fs/inode.c.b_io_export Wed Jan 29 16:56:15 2003
-+++ linux-chaos/fs/inode.c     Wed Jan 29 16:56:27 2003
-@@ -66,7 +66,8 @@
+--- linux/fs/inode.c.b_io      2003-02-18 16:39:16.000000000 -0800
++++ linux/fs/inode.c   2003-02-18 16:39:45.000000000 -0800
+@@ -5,6 +5,7 @@
+  */
+ #include <linux/config.h>
++#include <linux/module.h>
+ #include <linux/fs.h>
+ #include <linux/string.h>
+ #include <linux/mm.h>
+@@ -66,7 +67,8 @@
   * NOTE! You also have to own the lock if you change
   * the i_state of an inode while it is in use..
   */
@@ -10,8 +18,8 @@
  
  /*
   * Statistics gathering..
---- linux-chaos/fs/Makefile.b_io_export        Wed Jan 29 16:56:45 2003
-+++ linux-chaos/fs/Makefile    Wed Jan 29 16:56:53 2003
+--- linux/fs/Makefile.b_io     2003-02-18 16:39:16.000000000 -0800
++++ linux/fs/Makefile  2003-02-18 16:39:37.000000000 -0800
 @@ -7,7 +7,7 @@
  
  O_TARGET := fs.o
  mod-subdirs :=        nls
  
  obj-y :=      open.o read_write.o devices.o file_table.o buffer.o \
---- linux-chaos/mm/filemap.c.b_io_export       Wed Jan 29 16:50:39 2003
-+++ linux-chaos/mm/filemap.c   Wed Jan 29 16:51:11 2003
-@@ -65,6 +65,7 @@
-  *                    pagecache_lock
+--- linux/mm/vmscan.c.b_io     2003-02-18 16:39:16.000000000 -0800
++++ linux/mm/vmscan.c  2003-02-18 16:40:01.000000000 -0800
+@@ -14,6 +14,8 @@
+  *  Multiqueue VM started 5.8.00, Rik van Riel.
   */
- spinlock_cacheline_t pagemap_lru_lock_cacheline = {SPIN_LOCK_UNLOCKED};
-+EXPORT_SYMBOL(pagemap_lru_lock_cacheline);
  
- #define CLUSTER_PAGES         (1 << page_cluster)
- #define CLUSTER_OFFSET(x)     (((x) >> page_cluster) << page_cluster)
---- linux-chaos/mm/vmscan.c.b_io_export        Wed Jan 29 16:51:58 2003
-+++ linux-chaos/mm/vmscan.c    Wed Jan 29 16:55:16 2003
-@@ -839,6 +839,7 @@
++#include <linux/config.h>
++#include <linux/module.h>
+ #include <linux/slab.h>
+ #include <linux/kernel_stat.h>
+ #include <linux/swap.h>
+@@ -837,6 +839,7 @@
        set_current_state(TASK_RUNNING);
        remove_wait_queue(&kswapd_done, &wait);
  }
  
  static void wakeup_memwaiters(void)
  {
---- linux-chaos/mm/Makefile.b_io_export        Wed Jan 29 16:52:46 2003
-+++ linux-chaos/mm/Makefile    Wed Jan 29 16:54:23 2003
+--- linux/mm/Makefile.b_io     2003-02-18 16:39:16.000000000 -0800
++++ linux/mm/Makefile  2003-02-18 16:39:37.000000000 -0800
 @@ -9,7 +9,7 @@
  
  O_TARGET := mm.o
  
 -export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o
-+export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o vmscan.c
++export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o vmscan.o
  
  obj-y  := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
            vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
index 5e88d35..71d372f 100644 (file)
@@ -1,31 +1,19 @@
-# This is a BitKeeper generated patch for the following project:
-# Project Name: Linux kernel tree
-# This patch format is intended for GNU patch command version 2.5 or higher.
-# This patch includes the following deltas:
-#                 ChangeSet    1.810   -> 1.811  
-#            kernel/ksyms.c    1.149   -> 1.150  
-#       fs/driverfs/inode.c    1.52    -> 1.53   
-#        include/linux/fs.h    1.175   -> 1.176  
-#      include/linux/namei.h   1.3     -> 1.4    
-#                fs/namei.c    1.56    -> 1.57   
-#             fs/nfsd/vfs.c    1.44    -> 1.45   
-#      arch/um/kernel/mem.c    1.5     -> 1.6    
-#        net/unix/af_unix.c    1.29    -> 1.30   
-#                 mm/slab.c    1.33    -> 1.34   
-#          fs/sysfs/inode.c    1.55    -> 1.56   
-#      include/linux/slab.h    1.13    -> 1.14   
-#      include/linux/dcache.h  1.19    -> 1.20   
-#
-# The following is the BitKeeper ChangeSet Log
-# --------------------------------------------
-# 02/10/20     braam@clusterfs.com     1.811
-# Changes for Lustre
-# --------------------------------------------
-#
-diff -Nru a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
---- a/arch/um/kernel/mem.c     Sun Dec  8 02:49:38 2002
-+++ b/arch/um/kernel/mem.c     Sun Dec  8 02:49:38 2002
-@@ -656,6 +656,22 @@
+ arch/um/kernel/mem.c   |   18 +++++++++++-
+ fs/namei.c             |   71 +++++++++++++++++++++++++++++++++++--------------
+ fs/nfsd/vfs.c          |    2 -
+ fs/sysfs/inode.c       |    2 -
+ include/linux/dcache.h |   27 ++++++++++++++++++
+ include/linux/fs.h     |   20 +++++++++++++
+ include/linux/namei.h  |    3 +-
+ include/linux/slab.h   |    1 
+ kernel/ksyms.c         |    7 ++++
+ mm/slab.c              |    5 +++
+ net/unix/af_unix.c     |    2 -
+ 11 files changed, 132 insertions(+), 26 deletions(-)
+
+--- linux-2.5.59/arch/um/kernel/mem.c~lustre-2.5       2003-02-22 21:56:58.000000000 +0800
++++ linux-2.5.59-root/arch/um/kernel/mem.c     2003-02-22 21:56:58.000000000 +0800
+@@ -639,6 +639,22 @@ struct page *pte_mem_map(pte_t pte)
        return(phys_mem_map(pte_val(pte)));
  }
  
@@ -48,7 +36,7 @@ diff -Nru a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
  struct mem_region *page_region(struct page *page, int *index_out)
  {
        int i;
-@@ -743,7 +759,7 @@
+@@ -726,7 +742,7 @@ extern unsigned long region_pa(void *vir
                   (addr <= region->start + region->len))
                        return(mk_phys(addr - region->start, i));
        }
@@ -57,22 +45,9 @@ diff -Nru a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
        return(0);
  }
  
-diff -Nru a/fs/driverfs/inode.c b/fs/driverfs/inode.c
---- a/fs/driverfs/inode.c      Sun Dec  8 02:49:38 2002
-+++ b/fs/driverfs/inode.c      Sun Dec  8 02:49:38 2002
-@@ -523,7 +523,7 @@
-       qstr.name = name;
-       qstr.len = strlen(name);
-       qstr.hash = full_name_hash(name,qstr.len);
--      return lookup_hash(&qstr,parent);
-+      return lookup_hash(&qstr,parent, NULL);
- }
- /**
-diff -Nru a/fs/namei.c b/fs/namei.c
---- a/fs/namei.c       Sun Dec  8 02:49:38 2002
-+++ b/fs/namei.c       Sun Dec  8 02:49:38 2002
-@@ -265,6 +265,9 @@
+--- linux-2.5.59/fs/namei.c~lustre-2.5 2003-02-22 21:56:58.000000000 +0800
++++ linux-2.5.59-root/fs/namei.c       2003-02-22 21:56:58.000000000 +0800
+@@ -265,6 +265,9 @@ int deny_write_access(struct file * file
  
  void path_release(struct nameidata *nd)
  {
@@ -82,7 +57,7 @@ diff -Nru a/fs/namei.c b/fs/namei.c
        dput(nd->dentry);
        mntput(nd->mnt);
  }
-@@ -273,10 +276,18 @@
+@@ -273,10 +276,18 @@ void path_release(struct nameidata *nd)
   * Internal lookup() using the new generic dcache.
   * SMP-safe
   */
@@ -102,7 +77,7 @@ diff -Nru a/fs/namei.c b/fs/namei.c
        if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
                if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) {
                        dput(dentry);
-@@ -351,7 +362,7 @@
+@@ -351,7 +362,7 @@ ok:
   * make sure that nobody added the entry to the dcache in the meantime..
   * SMP-safe
   */
@@ -111,7 +86,7 @@ diff -Nru a/fs/namei.c b/fs/namei.c
  {
        struct dentry * result;
        struct inode *dir = parent->d_inode;
-@@ -369,7 +380,10 @@
+@@ -369,7 +380,10 @@ static struct dentry * real_lookup(struc
                struct dentry * dentry = d_alloc(parent, name);
                result = ERR_PTR(-ENOMEM);
                if (dentry) {
@@ -123,7 +98,7 @@ diff -Nru a/fs/namei.c b/fs/namei.c
                        if (result)
                                dput(dentry);
                        else {
-@@ -391,6 +405,12 @@
+@@ -391,6 +405,12 @@ static struct dentry * real_lookup(struc
                        dput(result);
                        result = ERR_PTR(-ENOENT);
                }
@@ -136,7 +111,7 @@ diff -Nru a/fs/namei.c b/fs/namei.c
        }
        return result;
  }
-@@ -534,7 +554,7 @@
+@@ -534,7 +554,7 @@ dcache_miss:
        unlock_nd(nd);
  
  need_lookup:
@@ -145,7 +120,7 @@ diff -Nru a/fs/namei.c b/fs/namei.c
        if (IS_ERR(dentry))
                goto fail;
        mntget(mnt);
-@@ -684,7 +704,7 @@
+@@ -684,7 +704,7 @@ int link_path_walk(const char * name, st
                        nd->dentry = next.dentry;
                }
                err = -ENOTDIR; 
@@ -154,7 +129,7 @@ diff -Nru a/fs/namei.c b/fs/namei.c
                        break;
                continue;
                /* here ends the main loop */
-@@ -737,7 +757,8 @@
+@@ -737,7 +757,8 @@ last_component:
                        break;
                if (lookup_flags & LOOKUP_DIRECTORY) {
                        err = -ENOTDIR; 
@@ -164,7 +139,7 @@ diff -Nru a/fs/namei.c b/fs/namei.c
                                break;
                }
                goto return_base;
-@@ -886,7 +907,8 @@
+@@ -886,7 +907,8 @@ int path_lookup(const char *name, unsign
   * needs parent already locked. Doesn't follow mounts.
   * SMP-safe.
   */
@@ -174,7 +149,7 @@ diff -Nru a/fs/namei.c b/fs/namei.c
  {
        struct dentry * dentry;
        struct inode *inode;
-@@ -909,13 +931,16 @@
+@@ -909,13 +931,16 @@ struct dentry * lookup_hash(struct qstr 
                        goto out;
        }
  
@@ -192,8 +167,8 @@ diff -Nru a/fs/namei.c b/fs/namei.c
 +                        dentry = inode->i_op->lookup(inode, new);
                if (!dentry) {
                        dentry = new;
-                       security_ops->inode_post_lookup(inode, dentry);
-@@ -927,7 +952,7 @@
+                       security_inode_post_lookup(inode, dentry);
+@@ -927,7 +952,7 @@ out:
  }
  
  /* SMP-safe */
@@ -202,7 +177,7 @@ diff -Nru a/fs/namei.c b/fs/namei.c
  {
        unsigned long hash;
        struct qstr this;
-@@ -947,11 +972,16 @@
+@@ -947,11 +972,16 @@ struct dentry * lookup_one_len(const cha
        }
        this.hash = end_name_hash(hash);
  
@@ -220,7 +195,7 @@ diff -Nru a/fs/namei.c b/fs/namei.c
  /*
   *    namei()
   *
-@@ -1268,7 +1298,7 @@
+@@ -1268,7 +1298,7 @@ int open_namei(const char * pathname, in
  
        dir = nd->dentry;
        down(&dir->d_inode->i_sem);
@@ -229,7 +204,7 @@ diff -Nru a/fs/namei.c b/fs/namei.c
  
  do_last:
        error = PTR_ERR(dentry);
-@@ -1370,7 +1400,7 @@
+@@ -1371,7 +1401,7 @@ do_link:
        }
        dir = nd->dentry;
        down(&dir->d_inode->i_sem);
@@ -238,7 +213,7 @@ diff -Nru a/fs/namei.c b/fs/namei.c
        putname(nd->last.name);
        goto do_last;
  }
-@@ -1384,7 +1414,7 @@
+@@ -1385,7 +1415,7 @@ static struct dentry *lookup_create(stru
        dentry = ERR_PTR(-EEXIST);
        if (nd->last_type != LAST_NORM)
                goto fail;
@@ -247,7 +222,7 @@ diff -Nru a/fs/namei.c b/fs/namei.c
        if (IS_ERR(dentry))
                goto fail;
        if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
-@@ -1614,7 +1644,7 @@
+@@ -1617,7 +1647,7 @@ asmlinkage long sys_rmdir(const char * p
                        goto exit1;
        }
        down(&nd.dentry->d_inode->i_sem);
@@ -256,7 +231,7 @@ diff -Nru a/fs/namei.c b/fs/namei.c
        error = PTR_ERR(dentry);
        if (!IS_ERR(dentry)) {
                error = vfs_rmdir(nd.dentry->d_inode, dentry);
-@@ -1675,7 +1705,7 @@
+@@ -1677,7 +1707,7 @@ asmlinkage long sys_unlink(const char * 
        if (nd.last_type != LAST_NORM)
                goto exit1;
        down(&nd.dentry->d_inode->i_sem);
@@ -265,7 +240,7 @@ diff -Nru a/fs/namei.c b/fs/namei.c
        error = PTR_ERR(dentry);
        if (!IS_ERR(dentry)) {
                /* Why not before? Because we want correct error value */
-@@ -1949,7 +1979,8 @@
+@@ -1951,7 +1981,8 @@ int vfs_rename_other(struct inode *old_d
  }
  
  int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -275,7 +250,7 @@ diff -Nru a/fs/namei.c b/fs/namei.c
  {
        int error;
        int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
-@@ -2020,7 +2051,7 @@
+@@ -2022,7 +2053,7 @@ static inline int do_rename(const char *
  
        trap = lock_rename(new_dir, old_dir);
  
@@ -284,7 +259,7 @@ diff -Nru a/fs/namei.c b/fs/namei.c
        error = PTR_ERR(old_dentry);
        if (IS_ERR(old_dentry))
                goto exit3;
-@@ -2040,7 +2071,7 @@
+@@ -2042,7 +2073,7 @@ static inline int do_rename(const char *
        error = -EINVAL;
        if (old_dentry == trap)
                goto exit4;
@@ -293,7 +268,7 @@ diff -Nru a/fs/namei.c b/fs/namei.c
        error = PTR_ERR(new_dentry);
        if (IS_ERR(new_dentry))
                goto exit4;
-@@ -2050,7 +2081,7 @@
+@@ -2052,7 +2083,7 @@ static inline int do_rename(const char *
                goto exit5;
  
        error = vfs_rename(old_dir->d_inode, old_dentry,
@@ -302,10 +277,9 @@ diff -Nru a/fs/namei.c b/fs/namei.c
  exit5:
        dput(new_dentry);
  exit4:
-diff -Nru a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
---- a/fs/nfsd/vfs.c    Sun Dec  8 02:49:38 2002
-+++ b/fs/nfsd/vfs.c    Sun Dec  8 02:49:38 2002
-@@ -1292,7 +1292,7 @@
+--- linux-2.5.59/fs/nfsd/vfs.c~lustre-2.5      2003-02-22 21:56:58.000000000 +0800
++++ linux-2.5.59-root/fs/nfsd/vfs.c    2003-02-22 21:56:58.000000000 +0800
+@@ -1337,7 +1337,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru
                        err = nfserr_perm;
        } else
  #endif
@@ -314,10 +288,9 @@ diff -Nru a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
        if (!err && EX_ISSYNC(tfhp->fh_export)) {
                nfsd_sync_dir(tdentry);
                nfsd_sync_dir(fdentry);
-diff -Nru a/fs/sysfs/inode.c b/fs/sysfs/inode.c
---- a/fs/sysfs/inode.c Sun Dec  8 02:49:39 2002
-+++ b/fs/sysfs/inode.c Sun Dec  8 02:49:39 2002
-@@ -471,7 +471,7 @@
+--- linux-2.5.59/fs/sysfs/inode.c~lustre-2.5   2003-02-22 21:56:58.000000000 +0800
++++ linux-2.5.59-root/fs/sysfs/inode.c 2003-02-22 21:56:58.000000000 +0800
+@@ -539,7 +539,7 @@ static struct dentry * get_dentry(struct
        qstr.name = name;
        qstr.len = strlen(name);
        qstr.hash = full_name_hash(name,qstr.len);
@@ -325,36 +298,38 @@ diff -Nru a/fs/sysfs/inode.c b/fs/sysfs/inode.c
 +      return lookup_hash(&qstr,parent,NULL);
  }
  
- /**
-diff -Nru a/include/linux/dcache.h b/include/linux/dcache.h
---- a/include/linux/dcache.h   Sun Dec  8 02:49:39 2002
-+++ b/include/linux/dcache.h   Sun Dec  8 02:49:39 2002
-@@ -9,6 +9,24 @@
- #include <linux/spinlock.h>
- #include <asm/page.h>                 /* for BUG() */
-+#define IT_OPEN  (1)
-+#define IT_CREAT  (1<<1)
-+#define IT_MKDIR  (1<<2)
-+#define IT_LINK  (1<<3)
-+#define IT_LINK2  (1<<4)
-+#define IT_SYMLINK  (1<<5)
-+#define IT_UNLINK  (1<<6)
-+#define IT_RMDIR  (1<<7)
-+#define IT_RENAME  (1<<8)
-+#define IT_RENAME2  (1<<9)
-+#define IT_READDIR  (1<<10)
-+#define IT_GETATTR  (1<<11)
-+#define IT_SETATTR  (1<<12)
-+#define IT_READLINK  (1<<13)
-+#define IT_MKNOD  (1<<14)
-+#define IT_LOOKUP  (1<<15)
+--- linux-2.5.59/include/linux/dcache.h~lustre-2.5     2003-02-22 21:56:58.000000000 +0800
++++ linux-2.5.59-root/include/linux/dcache.h   2003-02-22 22:02:55.000000000 +0800
+@@ -11,6 +11,27 @@
+ struct vfsmount;
++#define IT_OPEN     (1)
++#define IT_CREAT    (1<<1)
++#define IT_READDIR  (1<<2)
++#define IT_GETATTR  (1<<3)
++#define IT_LOOKUP   (1<<4)
++#define IT_UNLINK   (1<<5)
++
++
++struct lookup_intent {
++       int it_op;
++       int it_mode;
++       int it_flags;
++       int it_disposition;
++       int it_status;
++       struct iattr *it_iattr;
++       __u64 it_lock_handle[2];
++       int it_lock_mode;
++       void *it_data;
++};
 +
 +
  /*
   * linux/include/linux/dcache.h
   *
-@@ -30,6 +48,8 @@
+@@ -32,6 +53,8 @@ struct qstr {
        unsigned int hash;
  };
  
@@ -363,7 +338,7 @@ diff -Nru a/include/linux/dcache.h b/include/linux/dcache.h
  struct dentry_stat_t {
        int nr_dentry;
        int nr_unused;
-@@ -79,6 +99,7 @@
+@@ -81,6 +104,7 @@ struct dentry {
        struct list_head d_subdirs;     /* our children */
        struct list_head d_alias;       /* inode alias list */
        int d_mounted;
@@ -371,7 +346,7 @@ diff -Nru a/include/linux/dcache.h b/include/linux/dcache.h
        struct qstr d_name;
        unsigned long d_time;           /* used by d_revalidate */
        struct dentry_operations  *d_op;
-@@ -96,6 +117,8 @@
+@@ -100,6 +124,8 @@ struct dentry_operations {
        int (*d_delete)(struct dentry *);
        void (*d_release)(struct dentry *);
        void (*d_iput)(struct dentry *, struct inode *);
@@ -380,10 +355,27 @@ diff -Nru a/include/linux/dcache.h b/include/linux/dcache.h
  };
  
  /* the dentry parameter passed to d_hash and d_compare is the parent
-diff -Nru a/include/linux/fs.h b/include/linux/fs.h
---- a/include/linux/fs.h       Sun Dec  8 02:49:38 2002
-+++ b/include/linux/fs.h       Sun Dec  8 02:49:38 2002
-@@ -700,7 +700,7 @@
+@@ -139,6 +165,7 @@ d_iput:            no              no              yes
+       */
+ #define DCACHE_REFERENCED     0x0008  /* Recently used, don't discard. */
++#define DCACHE_LUSTRE_INVALID         0x0010  /* Lustre invalidated */
+ extern spinlock_t dcache_lock;
+ extern rwlock_t dparent_lock;
+--- linux-2.5.59/include/linux/fs.h~lustre-2.5 2003-02-22 21:56:58.000000000 +0800
++++ linux-2.5.59-root/include/linux/fs.h       2003-02-22 22:52:58.000000000 +0800
+@@ -234,6 +234,9 @@ typedef int (get_blocks_t)(struct inode 
+ #define ATTR_ATTR_FLAG        1024
+ #define ATTR_KILL_SUID        2048
+ #define ATTR_KILL_SGID        4096
++#define ATTR_RAW              8192    /* the file system, not the VFS, will massage attrs */
++#define ATTR_FROM_OPEN        16384    /* called from the open path, e.g. for O_TRUNC */
++
+ /*
+  * This is the Inode Attributes structure, used for notify_change().  It
+@@ -676,7 +679,7 @@ extern int vfs_symlink(struct inode *, s
  extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
  extern int vfs_rmdir(struct inode *, struct dentry *);
  extern int vfs_unlink(struct inode *, struct dentry *);
@@ -392,45 +384,51 @@ diff -Nru a/include/linux/fs.h b/include/linux/fs.h
  
  /*
   * File types
-@@ -769,6 +769,8 @@
+@@ -762,19 +765,33 @@ struct file_operations {
  struct inode_operations {
        int (*create) (struct inode *,struct dentry *,int);
        struct dentry * (*lookup) (struct inode *,struct dentry *);
 +      struct dentry * (*lookup2) (struct inode *,struct dentry *, 
 +                                    struct lookup_intent *);
        int (*link) (struct dentry *,struct inode *,struct dentry *);
++      int (*link2) (struct inode *,struct inode *, const char *, int);
        int (*unlink) (struct inode *,struct dentry *);
++      int (*unlink2) (struct inode *, const char *, int);
        int (*symlink) (struct inode *,struct dentry *,const char *);
-@@ -995,6 +997,7 @@
++      int (*symlink2) (struct inode *, const char *, int, const char *);
+       int (*mkdir) (struct inode *,struct dentry *,int);
++      int (*mkdir2) (struct inode *, const char *, int,int);
+       int (*rmdir) (struct inode *,struct dentry *);
++      int (*rmdir2) (struct inode *, const char *, int);
+       int (*mknod) (struct inode *,struct dentry *,int,dev_t);
++      int (*mknod2) (struct inode *, const char *, int,int,int);
+       int (*rename) (struct inode *, struct dentry *,
+                       struct inode *, struct dentry *);
++      int (*rename2) (struct inode *, struct inode *,
++                      const char *oldname, int oldlen,
++                      const char *newname, int newlen);
+       int (*readlink) (struct dentry *, char *,int);
+       int (*follow_link) (struct dentry *, struct nameidata *);
++      int (*follow_link2) (struct dentry *, struct nameidata *,
++                              struct lookup_intent *it);
+       void (*truncate) (struct inode *);
+       int (*permission) (struct inode *, int);
+       int (*setattr) (struct dentry *, struct iattr *);
++      int (*setattr_raw) (struct inode *, struct iattr *);
+       int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
+       int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
+       ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
+@@ -987,6 +1004,7 @@ extern int register_filesystem(struct fi
  extern int unregister_filesystem(struct file_system_type *);
  extern struct vfsmount *kern_mount(struct file_system_type *);
  extern int may_umount(struct vfsmount *);
 +struct vfsmount *do_kern_mount(const char *type, int flags, char *name, void *data);
  extern long do_mount(char *, char *, char *, unsigned long, void *);
  
- #define kern_umount mntput
-diff -Nru a/include/linux/namei.h b/include/linux/namei.h
---- a/include/linux/namei.h    Sun Dec  8 02:49:38 2002
-+++ b/include/linux/namei.h    Sun Dec  8 02:49:38 2002
-@@ -5,6 +5,17 @@
- struct vfsmount;
-+struct lookup_intent {
-+      int it_op;
-+      int it_mode;
-+      int it_disposition;
-+      int it_status;
-+      struct iattr *it_iattr;
-+      __u64 it_lock_handle[2];
-+      int it_lock_mode;
-+      void *it_data;
-+};
-+
- struct nameidata {
-       struct dentry   *dentry;
-       struct vfsmount *mnt;
-@@ -13,6 +24,7 @@
+ extern int vfs_statfs(struct super_block *, struct statfs *);
+--- linux-2.5.59/include/linux/namei.h~lustre-2.5      2003-02-22 21:56:58.000000000 +0800
++++ linux-2.5.59-root/include/linux/namei.h    2003-02-22 21:56:58.000000000 +0800
+@@ -13,6 +13,7 @@ struct nameidata {
        int             last_type;
        struct dentry   *old_dentry;
        struct vfsmount *old_mnt;
@@ -438,7 +436,7 @@ diff -Nru a/include/linux/namei.h b/include/linux/namei.h
  };
  
  /*
-@@ -46,7 +58,7 @@
+@@ -46,7 +47,7 @@ extern int FASTCALL(link_path_walk(const
  extern void path_release(struct nameidata *);
  
  extern struct dentry * lookup_one_len(const char *, struct dentry *, int);
@@ -447,10 +445,9 @@ diff -Nru a/include/linux/namei.h b/include/linux/namei.h
  
  extern int follow_down(struct vfsmount **, struct dentry **);
  extern int follow_up(struct vfsmount **, struct dentry **);
-diff -Nru a/include/linux/slab.h b/include/linux/slab.h
---- a/include/linux/slab.h     Sun Dec  8 02:49:39 2002
-+++ b/include/linux/slab.h     Sun Dec  8 02:49:39 2002
-@@ -56,6 +56,7 @@
+--- linux-2.5.59/include/linux/slab.h~lustre-2.5       2003-02-22 21:56:58.000000000 +0800
++++ linux-2.5.59-root/include/linux/slab.h     2003-02-22 21:56:58.000000000 +0800
+@@ -56,6 +56,7 @@ extern int kmem_cache_destroy(kmem_cache
  extern int kmem_cache_shrink(kmem_cache_t *);
  extern void *kmem_cache_alloc(kmem_cache_t *, int);
  extern void kmem_cache_free(kmem_cache_t *, void *);
@@ -458,90 +455,46 @@ diff -Nru a/include/linux/slab.h b/include/linux/slab.h
  extern unsigned int kmem_cache_size(kmem_cache_t *);
  
  extern void *kmalloc(size_t, int);
-diff -Nru a/kernel/ksyms.c b/kernel/ksyms.c
---- a/kernel/ksyms.c   Sun Dec  8 02:49:38 2002
-+++ b/kernel/ksyms.c   Sun Dec  8 02:49:38 2002
-@@ -365,6 +365,13 @@
- EXPORT_SYMBOL(tty_get_baud_rate);
- EXPORT_SYMBOL(do_SAK);
+--- linux-2.5.59/kernel/ksyms.c~lustre-2.5     2003-02-22 21:56:58.000000000 +0800
++++ linux-2.5.59-root/kernel/ksyms.c   2003-02-22 21:56:58.000000000 +0800
+@@ -376,6 +376,7 @@ EXPORT_SYMBOL(unregister_filesystem);
+ EXPORT_SYMBOL(kern_mount);
+ EXPORT_SYMBOL(__mntput);
+ EXPORT_SYMBOL(may_umount);
++EXPORT_SYMBOL(reparent_to_init);
+ /* executable format registration */
+ EXPORT_SYMBOL(register_binfmt);
+@@ -406,6 +407,12 @@ EXPORT_SYMBOL(request_irq);
+ EXPORT_SYMBOL(free_irq);
+ EXPORT_SYMBOL(irq_stat);
  
 +/* lustre */
-+EXPORT_SYMBOL(panic_notifier_list);
-+//EXPORT_SYMBOL(pagecache_lock_cacheline);
 +EXPORT_SYMBOL(do_kern_mount);
 +EXPORT_SYMBOL(exit_files);
 +EXPORT_SYMBOL(kmem_cache_validate);
 +
- /* filesystem registration */
- EXPORT_SYMBOL(register_filesystem);
- EXPORT_SYMBOL(unregister_filesystem);
-diff -Nru a/mm/slab.c b/mm/slab.c
---- a/mm/slab.c        Sun Dec  8 02:49:39 2002
-+++ b/mm/slab.c        Sun Dec  8 02:49:39 2002
-@@ -1236,6 +1236,59 @@
-  * Called with the cache-lock held.
-  */
-+extern struct page *check_get_page(unsigned long kaddr);
-+struct page *page_mem_map(struct page *page);
-+static int kmem_check_cache_obj (kmem_cache_t * cachep,
-+                               slab_t *slabp, void * objp)
-+{
-+      int i;
-+      unsigned int objnr;
-+
-+#if DEBUG
-+      if (cachep->flags & SLAB_RED_ZONE) {
-+              objp -= BYTES_PER_WORD;
-+              if ( *(unsigned long *)objp != RED_MAGIC2)
-+                      /* Either write before start, or a double free. */
-+                      return 0;
-+              if (*(unsigned long *)(objp+cachep->objsize -
-+                              BYTES_PER_WORD) != RED_MAGIC2)
-+                      /* Either write past end, or a double free. */
-+                      return 0;
-+      }
-+#endif
-+
-+      objnr = (objp-slabp->s_mem)/cachep->objsize;
-+      if (objnr >= cachep->num)
-+              return 0;
-+      if (objp != slabp->s_mem + objnr*cachep->objsize)
-+              return 0;
-+
-+      /* Check slab's freelist to see if this obj is there. */
-+      for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
-+              if (i == objnr)
-+                      return 0;
-+      }
-+      return 1;
-+}
-+
 +
+ /* waitqueue handling */
+ EXPORT_SYMBOL(add_wait_queue);
+ EXPORT_SYMBOL(add_wait_queue_exclusive);
+--- linux-2.5.59/mm/slab.c~lustre-2.5  2003-02-22 21:56:58.000000000 +0800
++++ linux-2.5.59-root/mm/slab.c        2003-02-22 21:56:58.000000000 +0800
+@@ -1793,6 +1793,11 @@ static inline void __cache_free (kmem_ca
+       }
+ }
 +int kmem_cache_validate(kmem_cache_t *cachep, void *objp)
 +{
-+      struct page *page = check_get_page((unsigned long)objp);
-+
-+      if (!page_mem_map(page))
-+              return 0;
-+
-+      if (!PageSlab(page))
-+              return 0;
-+
-+      /* XXX check for freed slab objects ? */
-+      if (!kmem_check_cache_obj(cachep, GET_PAGE_SLAB(page), objp))
-+              return 0;
-+
-+      return (cachep == GET_PAGE_CACHE(page));
++      return 1;
 +}
 +
- #if DEBUG
- static int kmem_extra_free_checks (kmem_cache_t * cachep,
-                       slab_t *slabp, void * objp)
-diff -Nru a/net/unix/af_unix.c b/net/unix/af_unix.c
---- a/net/unix/af_unix.c       Sun Dec  8 02:49:38 2002
-+++ b/net/unix/af_unix.c       Sun Dec  8 02:49:38 2002
-@@ -715,7 +715,7 @@
+ /**
+  * kmem_cache_alloc - Allocate an object
+  * @cachep: The cache to allocate from.
+--- linux-2.5.59/net/unix/af_unix.c~lustre-2.5 2003-02-22 21:56:58.000000000 +0800
++++ linux-2.5.59-root/net/unix/af_unix.c       2003-02-22 21:56:58.000000000 +0800
+@@ -719,7 +719,7 @@ static int unix_bind(struct socket *sock
                /*
                 * Do the final lookup.
                 */
@@ -550,3 +503,5 @@ diff -Nru a/net/unix/af_unix.c b/net/unix/af_unix.c
                err = PTR_ERR(dentry);
                if (IS_ERR(dentry))
                        goto out_mknod_unlock;
+
+_
index 9ed43cf..d7b6dce 100644 (file)
@@ -5,8 +5,8 @@
  1 files changed, 1 insertion(+)
 
 --- /dev/null  Fri Aug 30 17:31:37 2002
-+++ linux-2.4.18-18.8.0-l7-root/include/linux/lustre_version.h Mon Jan 20 12:24:45 2003
++++ linux-2.4.18-18.8.0-l12-braam/include/linux/lustre_version.h       Thu Feb 13 07:58:33 2003
 @@ -0,0 +1 @@
-+#define LUSTRE_KERNEL_VERSION 10
++#define LUSTRE_KERNEL_VERSION 13
 
 _
index 7384675..5c1f090 100644 (file)
@@ -1,15 +1,16 @@
- fs/dcache.c            |    8 +
- fs/namei.c             |  288 ++++++++++++++++++++++++++++++++++++++++---------
+ fs/dcache.c            |   20 ++
+ fs/exec.c              |   18 +-
+ fs/namei.c             |  338 ++++++++++++++++++++++++++++++++++++++++---------
  fs/nfsd/vfs.c          |    2 
- fs/open.c              |   53 +++++++--
- fs/stat.c              |    9 +
- include/linux/dcache.h |   25 ++++
- include/linux/fs.h     |   22 +++
+ fs/open.c              |  120 +++++++++++++++--
+ fs/stat.c              |    8 -
+ include/linux/dcache.h |   28 ++++
+ include/linux/fs.h     |   27 +++
  kernel/ksyms.c         |    1 
- 8 files changed, 345 insertions(+), 63 deletions(-)
+ 9 files changed, 478 insertions(+), 84 deletions(-)
 
---- linux-2.4.18-49chaos-lustre9/fs/dcache.c~vfs_intent-2.4.18-18      Wed Jan 29 12:43:32 2003
-+++ linux-2.4.18-49chaos-lustre9-root/fs/dcache.c      Wed Jan 29 12:43:32 2003
+--- linux-2.4.18-18.8.0-l12/fs/dcache.c~vfs_intent-2.4.18-18   Wed Feb 26 16:54:17 2003
++++ linux-2.4.18-18.8.0-l12-phil/fs/dcache.c   Wed Feb 26 17:31:36 2003
 @@ -186,6 +186,13 @@ int d_invalidate(struct dentry * dentry)
                spin_unlock(&dcache_lock);
                return 0;
        INIT_LIST_HEAD(&dentry->d_hash);
        INIT_LIST_HEAD(&dentry->d_lru);
        INIT_LIST_HEAD(&dentry->d_subdirs);
---- linux-2.4.18-49chaos-lustre9/fs/namei.c~vfs_intent-2.4.18-18       Wed Jan 29 12:43:32 2003
-+++ linux-2.4.18-49chaos-lustre9-root/fs/namei.c       Wed Feb  5 16:23:06 2003
+@@ -859,13 +867,19 @@ void d_delete(struct dentry * dentry)
+  * Adds a dentry to the hash according to its name.
+  */
+  
+-void d_rehash(struct dentry * entry)
++void __d_rehash(struct dentry * entry, int lock)
+ {
+       struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash);
+       if (!list_empty(&entry->d_hash)) BUG();
+-      spin_lock(&dcache_lock);
++      if (lock) spin_lock(&dcache_lock);
+       list_add(&entry->d_hash, list);
+-      spin_unlock(&dcache_lock);
++      if (lock) spin_unlock(&dcache_lock);
++}
++EXPORT_SYMBOL(__d_rehash);
++
++void d_rehash(struct dentry * entry)
++{
++      __d_rehash(entry, 1);
+ }
+ #define do_switch(x,y) do { \
+--- linux-2.4.18-18.8.0-l12/fs/namei.c~vfs_intent-2.4.18-18    Wed Feb 26 16:54:17 2003
++++ linux-2.4.18-18.8.0-l12-phil/fs/namei.c    Wed Feb 26 16:54:17 2003
 @@ -94,6 +94,13 @@
   * XEmacs seems to be relying on it...
   */
        path_release(nd);
        return -ELOOP;
  }
-@@ -449,7 +482,8 @@ static inline void follow_dotdot(struct 
+@@ -381,15 +414,26 @@ int follow_up(struct vfsmount **mnt, str
+       return __follow_up(mnt, dentry);
+ }
+-static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry)
++static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry,
++                              struct lookup_intent *it)
+ {
+       struct vfsmount *mounted;
+       spin_lock(&dcache_lock);
+       mounted = lookup_mnt(*mnt, *dentry);
+       if (mounted) {
++              int opc = 0, mode = 0;
+               *mnt = mntget(mounted);
+               spin_unlock(&dcache_lock);
++              if (it) {
++                      opc = it->it_op;
++                      mode = it->it_mode;
++              }
++              intent_release(*dentry, it);
++              if (it) {
++                      it->it_op = opc;
++                      it->it_mode = mode;
++              }
+               dput(*dentry);
+               mntput(mounted->mnt_parent);
+               *dentry = dget(mounted->mnt_root);
+@@ -401,7 +445,7 @@ static inline int __follow_down(struct v
+ int follow_down(struct vfsmount **mnt, struct dentry **dentry)
+ {
+-      return __follow_down(mnt,dentry);
++      return __follow_down(mnt,dentry,NULL);
+ }
+  
+ static inline void follow_dotdot(struct nameidata *nd)
+@@ -437,7 +481,7 @@ static inline void follow_dotdot(struct 
+               mntput(nd->mnt);
+               nd->mnt = parent;
+       }
+-      while (d_mountpoint(nd->dentry) && __follow_down(&nd->mnt, &nd->dentry))
++      while (d_mountpoint(nd->dentry) && __follow_down(&nd->mnt, &nd->dentry, NULL))
+               ;
+ }
+@@ -449,7 +493,8 @@ static inline void follow_dotdot(struct 
   *
   * We expect 'base' to be positive and a directory.
   */
  {
        struct dentry *dentry;
        struct inode *inode;
-@@ -526,12 +560,12 @@ int link_path_walk(const char * name, st
+@@ -526,18 +571,18 @@ int link_path_walk(const char * name, st
                                break;
                }
                /* This does the actual lookups.. */
                        err = PTR_ERR(dentry);
                        if (IS_ERR(dentry))
                                break;
-@@ -548,8 +582,8 @@ int link_path_walk(const char * name, st
+               }
+               /* Check mountpoints.. */
+-              while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry))
++              while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry, NULL))
+                       ;
+               err = -ENOENT;
+@@ -548,8 +593,8 @@ int link_path_walk(const char * name, st
                if (!inode->i_op)
                        goto out_dput;
  
                        dput(dentry);
                        if (err)
                                goto return_err;
-@@ -565,7 +599,7 @@ int link_path_walk(const char * name, st
+@@ -565,7 +610,7 @@ int link_path_walk(const char * name, st
                        nd->dentry = dentry;
                }
                err = -ENOTDIR; 
                        break;
                continue;
                /* here ends the main loop */
-@@ -592,12 +626,12 @@ last_component:
+@@ -592,22 +637,23 @@ last_component:
                        if (err < 0)
                                break;
                }
                        err = PTR_ERR(dentry);
                        if (IS_ERR(dentry))
                                break;
-@@ -606,8 +640,9 @@ last_component:
+               }
+-              while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry))
++              while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry, it))
                        ;
                inode = dentry->d_inode;
                if ((lookup_flags & LOOKUP_FOLLOW)
                        dput(dentry);
                        if (err)
                                goto return_err;
-@@ -621,7 +656,8 @@ last_component:
+@@ -621,7 +667,8 @@ last_component:
                        goto no_inode;
                if (lookup_flags & LOOKUP_DIRECTORY) {
                        err = -ENOTDIR; 
                                break;
                }
                goto return_base;
-@@ -658,15 +694,28 @@ out_dput:
+@@ -658,15 +705,28 @@ out_dput:
                dput(dentry);
                break;
        }
  }
  
  /* SMP-safe */
-@@ -751,6 +800,17 @@ walk_init_root(const char *name, struct 
+@@ -751,6 +811,17 @@ walk_init_root(const char *name, struct 
  }
  
  /* SMP-safe */
  int path_lookup(const char *path, unsigned flags, struct nameidata *nd)
  {
        int error = 0;
-@@ -779,7 +839,8 @@ int path_init(const char *name, unsigned
+@@ -779,7 +850,8 @@ int path_init(const char *name, unsigned
   * needs parent already locked. Doesn't follow mounts.
   * SMP-safe.
   */
  {
        struct dentry * dentry;
        struct inode *inode;
-@@ -802,13 +863,16 @@ struct dentry * lookup_hash(struct qstr 
+@@ -802,13 +874,16 @@ struct dentry * lookup_hash(struct qstr 
                        goto out;
        }
  
                dentry = inode->i_op->lookup(inode, new);
                unlock_kernel();
                if (!dentry)
-@@ -820,6 +884,12 @@ out:
+@@ -820,6 +895,12 @@ out:
        return dentry;
  }
  
  /* SMP-safe */
  struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
  {
-@@ -841,7 +911,7 @@ struct dentry * lookup_one_len(const cha
+@@ -841,7 +922,7 @@ struct dentry * lookup_one_len(const cha
        }
        this.hash = end_name_hash(hash);
  
  access:
        return ERR_PTR(-EACCES);
  }
-@@ -872,6 +942,23 @@ int __user_walk(const char *name, unsign
+@@ -872,6 +953,23 @@ int __user_walk(const char *name, unsign
        return err;
  }
  
  /*
   * It's inline, so penalty for filesystems that don't use sticky bit is
   * minimal.
-@@ -1045,14 +1132,17 @@ int may_open(struct nameidata *nd, int a
+@@ -1045,14 +1143,17 @@ int may_open(struct nameidata *nd, int a
          return get_lease(inode, flag);
  }
  
        int count = 0;
  
        if ((flag+1) & O_ACCMODE)
-@@ -1066,7 +1156,7 @@ struct file *filp_open(const char * path
+@@ -1066,7 +1167,7 @@ struct file *filp_open(const char * path
         * The simplest case - just a plain lookup.
         */
        if (!(flag & O_CREAT)) {
                if (error)
                        return ERR_PTR(error);
                dentry = nd.dentry;
-@@ -1076,6 +1166,8 @@ struct file *filp_open(const char * path
+@@ -1076,6 +1177,8 @@ struct file *filp_open(const char * path
        /*
         * Create - we need to know the parent.
         */
        error = path_lookup(pathname, LOOKUP_PARENT, &nd);
        if (error)
                return ERR_PTR(error);
-@@ -1091,7 +1183,7 @@ struct file *filp_open(const char * path
+@@ -1091,7 +1194,7 @@ struct file *filp_open(const char * path
  
        dir = nd.dentry;
        down(&dir->d_inode->i_sem);
  
  do_last:
        error = PTR_ERR(dentry);
-@@ -1100,6 +1192,7 @@ do_last:
+@@ -1100,6 +1203,7 @@ do_last:
                goto exit;
        }
  
        /* Negative dentry, just create the file */
        if (!dentry->d_inode) {
                error = vfs_create(dir->d_inode, dentry,
-@@ -1134,7 +1227,8 @@ do_last:
+@@ -1129,12 +1233,13 @@ do_last:
+               error = -ELOOP;
+               if (flag & O_NOFOLLOW)
+                       goto exit_dput;
+-              while (__follow_down(&nd.mnt,&dentry) && d_mountpoint(dentry));
++              while (__follow_down(&nd.mnt,&dentry,&it) && d_mountpoint(dentry));
+       }
        error = -ENOENT;
        if (!dentry->d_inode)
                goto exit_dput;
                goto do_link;
  
        dput(nd.dentry);
-@@ -1149,11 +1243,13 @@ ok:
+@@ -1149,11 +1254,13 @@ ok:
        if (!S_ISREG(nd.dentry->d_inode->i_mode))
                open_flags &= ~O_TRUNC;
  
        path_release(&nd);
        return ERR_PTR(error);
  
-@@ -1172,7 +1268,12 @@ do_link:
+@@ -1172,10 +1279,15 @@ do_link:
         * are done. Procfs-like symlinks just set LAST_BIND.
         */
        UPDATE_ATIME(dentry->d_inode);
 +              intent_release(dentry, &it);
        dput(dentry);
        if (error)
-               return error;
-@@ -1194,13 +1295,15 @@ do_link:
+-              return error;
++              return ERR_PTR(error);
+       if (nd.last_type == LAST_BIND) {
+               dentry = nd.dentry;
+               goto ok;
+@@ -1194,13 +1306,15 @@ do_link:
        }
        dir = nd.dentry;
        down(&dir->d_inode->i_sem);
  {
        struct dentry *dentry;
  
-@@ -1208,7 +1311,7 @@ static struct dentry *lookup_create(stru
+@@ -1208,7 +1322,7 @@ static struct dentry *lookup_create(stru
        dentry = ERR_PTR(-EEXIST);
        if (nd->last_type != LAST_NORM)
                goto fail;
        if (IS_ERR(dentry))
                goto fail;
        if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
-@@ -1264,7 +1367,19 @@ asmlinkage long sys_mknod(const char * f
+@@ -1264,7 +1378,19 @@ asmlinkage long sys_mknod(const char * f
        error = path_lookup(tmp, LOOKUP_PARENT, &nd);
        if (error)
                goto out;
 +                                 nd.last.name,
 +                                 nd.last.len,
 +                                 mode, dev);
-+              /* the file system want to use normal vfs path now */
++              /* the file system wants to use normal vfs path now */
 +              if (error != -EOPNOTSUPP)
 +                      goto out2;
 +      }
        error = PTR_ERR(dentry);
  
        mode &= ~current->fs->umask;
-@@ -1285,6 +1400,7 @@ asmlinkage long sys_mknod(const char * f
+@@ -1285,6 +1411,7 @@ asmlinkage long sys_mknod(const char * f
                dput(dentry);
        }
        up(&nd.dentry->d_inode->i_sem);
        path_release(&nd);
  out:
        putname(tmp);
-@@ -1332,7 +1448,17 @@ asmlinkage long sys_mkdir(const char * p
+@@ -1332,7 +1459,17 @@ asmlinkage long sys_mkdir(const char * p
                error = path_lookup(tmp, LOOKUP_PARENT, &nd);
                if (error)
                        goto out;
 +                                         nd.last.name,
 +                                         nd.last.len,
 +                                         mode);
-+                      /* the file system want to use normal vfs path now */
++                      /* the file system wants to use normal vfs path now */
 +                      if (error != -EOPNOTSUPP)
 +                              goto out2;
 +              }
                error = PTR_ERR(dentry);
                if (!IS_ERR(dentry)) {
                        error = vfs_mkdir(nd.dentry->d_inode, dentry,
-@@ -1340,6 +1466,7 @@ asmlinkage long sys_mkdir(const char * p
+@@ -1340,6 +1477,7 @@ asmlinkage long sys_mkdir(const char * p
                        dput(dentry);
                }
                up(&nd.dentry->d_inode->i_sem);
                path_release(&nd);
  out:
                putname(tmp);
-@@ -1440,8 +1567,17 @@ asmlinkage long sys_rmdir(const char * p
+@@ -1440,8 +1578,33 @@ asmlinkage long sys_rmdir(const char * p
                        error = -EBUSY;
                        goto exit1;
        }
 +      if (nd.dentry->d_inode->i_op->rmdir2) {
 +              struct inode_operations *op = nd.dentry->d_inode->i_op;
++              struct dentry *last;
++
++              down(&nd.dentry->d_inode->i_sem);
++              last = lookup_hash_it(&nd.last, nd.dentry, NULL);
++              up(&nd.dentry->d_inode->i_sem);
++              if (IS_ERR(last)) {
++                      error = PTR_ERR(last);
++                      goto exit1;
++              }
++              if (d_mountpoint(last)) {
++                      dput(last);
++                      error = -EBUSY;
++                      goto exit1;
++              }
++              dput(last);
++
 +              error = op->rmdir2(nd.dentry->d_inode,
 +                                 nd.last.name,
 +                                 nd.last.len);
-+              /* the file system want to use normal vfs path now */
++              /* the file system wants to use normal vfs path now */
 +              if (error != -EOPNOTSUPP)
 +                      goto exit1;
 +      }
        error = PTR_ERR(dentry);
        if (!IS_ERR(dentry)) {
                error = vfs_rmdir(nd.dentry->d_inode, dentry);
-@@ -1499,8 +1635,17 @@ asmlinkage long sys_unlink(const char * 
+@@ -1499,8 +1662,17 @@ asmlinkage long sys_unlink(const char * 
        error = -EISDIR;
        if (nd.last_type != LAST_NORM)
                goto exit1;
 +              error = op->unlink2(nd.dentry->d_inode,
 +                                  nd.last.name,
 +                                  nd.last.len);
-+              /* the file system want to use normal vfs path now */
++              /* the file system wants to use normal vfs path now */
 +              if (error != -EOPNOTSUPP)
 +                      goto exit1;
 +      }
        error = PTR_ERR(dentry);
        if (!IS_ERR(dentry)) {
                /* Why not before? Because we want correct error value */
-@@ -1567,15 +1712,26 @@ asmlinkage long sys_symlink(const char *
+@@ -1567,15 +1739,26 @@ asmlinkage long sys_symlink(const char *
                error = path_lookup(to, LOOKUP_PARENT, &nd);
                if (error)
                        goto out;
 +                                           nd.last.name,
 +                                           nd.last.len,
 +                                           from);
-+                      /* the file system want to use normal vfs path now */
++                      /* the file system wants to use normal vfs path now */
 +                      if (error != -EOPNOTSUPP)
 +                              goto out2;
 +              }
                putname(to);
        }
        putname(from);
-@@ -1642,7 +1798,7 @@ asmlinkage long sys_link(const char * ol
+@@ -1642,7 +1825,7 @@ asmlinkage long sys_link(const char * ol
                struct dentry *new_dentry;
                struct nameidata nd, old_nd;
  
                if (error)
                        goto exit;
                error = path_lookup(to, LOOKUP_PARENT, &nd);
-@@ -1651,7 +1807,17 @@ asmlinkage long sys_link(const char * ol
+@@ -1651,7 +1834,17 @@ asmlinkage long sys_link(const char * ol
                error = -EXDEV;
                if (old_nd.mnt != nd.mnt)
                        goto out_release;
 +                                        nd.dentry->d_inode,
 +                                        nd.last.name,
 +                                        nd.last.len);
-+                      /* the file system want to use normal vfs path now */
++                      /* the file system wants to use normal vfs path now */
 +                      if (error != -EOPNOTSUPP)
 +                              goto out_release;
 +              }
                error = PTR_ERR(new_dentry);
                if (!IS_ERR(new_dentry)) {
                        error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
-@@ -1695,7 +1861,8 @@ exit:
+@@ -1695,7 +1888,8 @@ exit:
   *       locking].
   */
  int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
  {
        int error;
        struct inode *target;
-@@ -1753,6 +1920,7 @@ int vfs_rename_dir(struct inode *old_dir
+@@ -1753,6 +1947,7 @@ int vfs_rename_dir(struct inode *old_dir
                error = -EBUSY;
        else 
                error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
        if (target) {
                if (!error)
                        target->i_flags |= S_DEAD;
-@@ -1774,7 +1942,8 @@ out_unlock:
+@@ -1774,7 +1969,8 @@ out_unlock:
  }
  
  int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
  {
        int error;
  
-@@ -1805,6 +1974,7 @@ int vfs_rename_other(struct inode *old_d
+@@ -1805,6 +2001,7 @@ int vfs_rename_other(struct inode *old_d
                error = -EBUSY;
        else
                error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
        double_up(&old_dir->i_zombie, &new_dir->i_zombie);
        if (error)
                return error;
-@@ -1816,13 +1986,14 @@ int vfs_rename_other(struct inode *old_d
+@@ -1816,13 +2013,14 @@ int vfs_rename_other(struct inode *old_d
  }
  
  int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (!error) {
                if (old_dir == new_dir)
                        inode_dir_notify(old_dir, DN_RENAME);
-@@ -1862,9 +2033,23 @@ static inline int do_rename(const char *
-       if (newnd.last_type != LAST_NORM)
-               goto exit2;
+@@ -1864,7 +2062,7 @@ static inline int do_rename(const char *
  
-+      if (old_dir->d_inode->i_op->rename2) {
-+              lock_kernel();
-+              error = old_dir->d_inode->i_op->rename2(old_dir->d_inode,
-+                                                      new_dir->d_inode,
-+                                                      oldnd.last.name,
-+                                                      oldnd.last.len,
-+                                                      newnd.last.name,
-+                                                      newnd.last.len);
-+              unlock_kernel();
-+              /* the file system want to use normal vfs path now */
-+              if (error != -EOPNOTSUPP)
-+                      goto exit2;
-+      }
-+
        double_lock(new_dir, old_dir);
  
 -      old_dentry = lookup_hash(&oldnd.last, old_dir);
        error = PTR_ERR(old_dentry);
        if (IS_ERR(old_dentry))
                goto exit3;
-@@ -1880,14 +2065,14 @@ static inline int do_rename(const char *
+@@ -1880,16 +2078,37 @@ static inline int do_rename(const char *
                if (newnd.last.name[newnd.last.len])
                        goto exit4;
        }
        if (IS_ERR(new_dentry))
                goto exit4;
  
++      if (old_dir->d_inode->i_op->rename2) {
++              lock_kernel();
++              /* don't rename mount point. mds will take care of
++               * the rest sanity checking */
++              if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) {
++                      error = -EBUSY;
++                      goto exit5;
++              }
++
++              error = old_dir->d_inode->i_op->rename2(old_dir->d_inode,
++                                                      new_dir->d_inode,
++                                                      oldnd.last.name,
++                                                      oldnd.last.len,
++                                                      newnd.last.name,
++                                                      newnd.last.len);
++              unlock_kernel();
++              /* the file system wants to use normal vfs path now */
++              if (error != -EOPNOTSUPP)
++                      goto exit5;
++      }
++
        lock_kernel();
        error = vfs_rename(old_dir->d_inode, old_dentry,
 -                                 new_dir->d_inode, new_dentry);
 +                                 new_dir->d_inode, new_dentry, NULL);
        unlock_kernel();
+-
++exit5:
        dput(new_dentry);
-@@ -1940,7 +2125,8 @@ out:
+ exit4:
+       dput(old_dentry);
+@@ -1940,7 +2159,8 @@ out:
  }
  
  static inline int
  {
        int res = 0;
        char *name;
-@@ -1953,7 +2139,7 @@ __vfs_follow_link(struct nameidata *nd, 
+@@ -1953,7 +2173,7 @@ __vfs_follow_link(struct nameidata *nd, 
                        /* weird __emul_prefix() stuff did it */
                        goto out;
        }
  out:
        if (current->link_count || res || nd->last_type!=LAST_NORM)
                return res;
-@@ -1975,7 +2161,13 @@ fail:
+@@ -1975,7 +2195,13 @@ fail:
  
  int vfs_follow_link(struct nameidata *nd, const char *link)
  {
  }
  
  /* get the link contents into pagecache */
-@@ -2017,7 +2209,7 @@ int page_follow_link(struct dentry *dent
+@@ -2017,7 +2243,7 @@ int page_follow_link(struct dentry *dent
  {
        struct page *page = NULL;
        char *s = page_getlink(dentry, &page);
        if (page) {
                kunmap(page);
                page_cache_release(page);
---- linux-2.4.18-49chaos-lustre9/fs/nfsd/vfs.c~vfs_intent-2.4.18-18    Wed Jan 29 12:43:32 2003
-+++ linux-2.4.18-49chaos-lustre9-root/fs/nfsd/vfs.c    Wed Jan 29 12:43:32 2003
+--- linux-2.4.18-18.8.0-l12/fs/nfsd/vfs.c~vfs_intent-2.4.18-18 Wed Feb 26 16:54:17 2003
++++ linux-2.4.18-18.8.0-l12-phil/fs/nfsd/vfs.c Wed Feb 26 16:54:17 2003
 @@ -1298,7 +1298,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru
                        err = nfserr_perm;
        } else
        unlock_kernel();
        if (!err && EX_ISSYNC(tfhp->fh_export)) {
                nfsd_sync_dir(tdentry);
---- linux-2.4.18-49chaos-lustre9/fs/open.c~vfs_intent-2.4.18-18        Wed Jan 29 12:43:32 2003
-+++ linux-2.4.18-49chaos-lustre9-root/fs/open.c        Wed Jan 29 12:43:32 2003
-@@ -19,6 +19,9 @@
+--- linux-2.4.18-18.8.0-l12/fs/open.c~vfs_intent-2.4.18-18     Wed Feb 26 16:54:17 2003
++++ linux-2.4.18-18.8.0-l12-phil/fs/open.c     Wed Feb 26 16:54:17 2003
+@@ -19,6 +19,8 @@
  #include <asm/uaccess.h>
  
  #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
 +extern int path_walk_it(const char *name, struct nameidata *nd,
 +                      struct lookup_intent *it);
-+extern void intent_release(struct dentry *de, struct lookup_intent *it);
  
  int vfs_statfs(struct super_block *sb, struct statfs *buf)
  {
-@@ -118,12 +121,13 @@ static inline long do_sys_truncate(const
+@@ -95,9 +97,10 @@ void fd_install(unsigned int fd, struct 
+       write_unlock(&files->file_lock);
+ }
+-int do_truncate(struct dentry *dentry, loff_t length)
++int do_truncate(struct dentry *dentry, loff_t length, int called_from_open)
+ {
+       struct inode *inode = dentry->d_inode;
++      struct inode_operations *op = dentry->d_inode->i_op;
+       int error;
+       struct iattr newattrs;
+@@ -108,7 +111,14 @@ int do_truncate(struct dentry *dentry, l
+       down(&inode->i_sem);
+       newattrs.ia_size = length;
+       newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
+-      error = notify_change(dentry, &newattrs);
++      if (called_from_open)
++              newattrs.ia_valid |= ATTR_FROM_OPEN;
++      if (op->setattr_raw) {
++              newattrs.ia_valid |= ATTR_RAW;
++              newattrs.ia_ctime = CURRENT_TIME;
++              error = op->setattr_raw(inode, &newattrs);
++      } else 
++              error = notify_change(dentry, &newattrs);
+       up(&inode->i_sem);
+       return error;
+ }
+@@ -118,12 +128,13 @@ static inline long do_sys_truncate(const
        struct nameidata nd;
        struct inode * inode;
        int error;
-+      struct lookup_intent it = { .it_op = IT_TRUNC };
++      struct lookup_intent it = { .it_op = IT_GETATTR };
  
        error = -EINVAL;
        if (length < 0) /* sorry, but loff_t says... */
        if (error)
                goto out;
        inode = nd.dentry->d_inode;
-@@ -168,6 +172,7 @@ static inline long do_sys_truncate(const
+@@ -163,11 +174,13 @@ static inline long do_sys_truncate(const
+       error = locks_verify_truncate(inode, NULL, length);
+       if (!error) {
+               DQUOT_INIT(inode);
+-              error = do_truncate(nd.dentry, length);
++              intent_release(nd.dentry, &it);
++              error = do_truncate(nd.dentry, length, 0);
+       }
        put_write_access(inode);
  
  dput_and_out:
        path_release(&nd);
  out:
        return error;
-@@ -259,8 +264,9 @@ asmlinkage long sys_utime(char * filenam
-       struct nameidata nd;
+@@ -215,7 +228,7 @@ static inline long do_sys_ftruncate(unsi
+       error = locks_verify_truncate(inode, file, length);
+       if (!error)
+-              error = do_truncate(dentry, length);
++              error = do_truncate(dentry, length, 0);
+ out_putf:
+       fput(file);
+ out:
+@@ -260,11 +273,13 @@ asmlinkage long sys_utime(char * filenam
        struct inode * inode;
        struct iattr newattrs;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
  
 -      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
++      error = user_path_walk_it(filename, &nd, NULL);
        if (error)
                goto out;
        inode = nd.dentry->d_inode;
-@@ -286,6 +292,7 @@ asmlinkage long sys_utime(char * filenam
++      /* this is safe without a Lustre lock because it only depends
++         on the super block */
+       error = -EROFS;
+       if (IS_RDONLY(inode))
+               goto dput_and_out;
+@@ -279,11 +294,29 @@ asmlinkage long sys_utime(char * filenam
+                       goto dput_and_out;
+               newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET;
+-      } else {
++      }
++
++      if (inode->i_op->setattr_raw) {
++              struct inode_operations *op = nd.dentry->d_inode->i_op;
++
++              newattrs.ia_valid |= ATTR_RAW;
++              error = op->setattr_raw(inode, &newattrs);
++              /* the file system wants to use normal vfs path now */
++              if (error != -EOPNOTSUPP)
++                      goto dput_and_out;
++      }
++
++      error = -EROFS;
++      if (IS_RDONLY(inode))
++              goto dput_and_out;
++
++      error = -EPERM;
++      if (!times) {
+               if (current->fsuid != inode->i_uid &&
+                   (error = permission(inode,MAY_WRITE)) != 0)
+                       goto dput_and_out;
        }
++
        error = notify_change(nd.dentry, &newattrs);
  dput_and_out:
-+      intent_release(nd.dentry, &it);
        path_release(&nd);
- out:
-       return error;
-@@ -303,8 +310,9 @@ asmlinkage long sys_utimes(char * filena
-       struct nameidata nd;
+@@ -304,12 +337,14 @@ asmlinkage long sys_utimes(char * filena
        struct inode * inode;
        struct iattr newattrs;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
  
 -      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
++      error = user_path_walk_it(filename, &nd, NULL);
  
        if (error)
                goto out;
-@@ -331,6 +339,7 @@ asmlinkage long sys_utimes(char * filena
-       }
-       error = notify_change(nd.dentry, &newattrs);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -347,6 +356,7 @@ asmlinkage long sys_access(const char * 
+       inode = nd.dentry->d_inode;
++      /* this is safe without a Lustre lock because it only depends
++         on the super block */
+       error = -EROFS;
+       if (IS_RDONLY(inode))
+               goto dput_and_out;
+@@ -324,7 +359,20 @@ asmlinkage long sys_utimes(char * filena
+               newattrs.ia_atime = times[0].tv_sec;
+               newattrs.ia_mtime = times[1].tv_sec;
+               newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET;
+-      } else {
++      }
++
++      if (inode->i_op->setattr_raw) {
++              struct inode_operations *op = nd.dentry->d_inode->i_op;
++
++              newattrs.ia_valid |= ATTR_RAW;
++              error = op->setattr_raw(inode, &newattrs);
++              /* the file system wants to use normal vfs path now */
++              if (error != -EOPNOTSUPP)
++                      goto dput_and_out;
++      }
++
++      error = -EPERM;
++      if (!utimes) {
+               if (current->fsuid != inode->i_uid &&
+                   (error = permission(inode,MAY_WRITE)) != 0)
+                       goto dput_and_out;
+@@ -347,6 +395,7 @@ asmlinkage long sys_access(const char * 
        int old_fsuid, old_fsgid;
        kernel_cap_t old_cap;
        int res;
  
        if (mode & ~S_IRWXO)    /* where's F_OK, X_OK, W_OK, R_OK? */
                return -EINVAL;
-@@ -364,13 +374,14 @@ asmlinkage long sys_access(const char * 
+@@ -364,13 +413,14 @@ asmlinkage long sys_access(const char * 
        else
                current->cap_effective = current->cap_permitted;
  
                path_release(&nd);
        }
  
-@@ -385,8 +396,11 @@ asmlinkage long sys_chdir(const char * f
+@@ -385,8 +435,11 @@ asmlinkage long sys_chdir(const char * f
  {
        int error;
        struct nameidata nd;
        if (error)
                goto out;
  
-@@ -397,6 +411,7 @@ asmlinkage long sys_chdir(const char * f
+@@ -397,6 +450,7 @@ asmlinkage long sys_chdir(const char * f
        set_fs_pwd(current->fs, nd.mnt, nd.dentry);
  
  dput_and_out:
        path_release(&nd);
  out:
        return error;
-@@ -436,9 +451,10 @@ asmlinkage long sys_chroot(const char * 
+@@ -436,9 +490,10 @@ asmlinkage long sys_chroot(const char * 
  {
        int error;
        struct nameidata nd;
        if (error)
                goto out;
  
-@@ -454,6 +470,7 @@ asmlinkage long sys_chroot(const char * 
+@@ -454,6 +509,7 @@ asmlinkage long sys_chroot(const char * 
        set_fs_altroot();
        error = 0;
  dput_and_out:
        path_release(&nd);
  out:
        return error;
-@@ -498,8 +515,9 @@ asmlinkage long sys_chmod(const char * f
-       struct inode * inode;
-       int error;
-       struct iattr newattrs;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
+@@ -508,6 +564,18 @@ asmlinkage long sys_chmod(const char * f
+       if (IS_RDONLY(inode))
+               goto dput_and_out;
  
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (error)
++      if (inode->i_op->setattr_raw) {
++              struct inode_operations *op = nd.dentry->d_inode->i_op;
++
++              newattrs.ia_mode = mode;
++              newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
++              newattrs.ia_valid |= ATTR_RAW;
++              error = op->setattr_raw(inode, &newattrs);
++              /* the file system wants to use normal vfs path now */
++              if (error != -EOPNOTSUPP)
++                      goto dput_and_out;
++      }
++
+       error = -EPERM;
+       if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+               goto dput_and_out;
+@@ -538,6 +606,20 @@ static int chown_common(struct dentry * 
+       error = -EROFS;
+       if (IS_RDONLY(inode))
                goto out;
-       inode = nd.dentry->d_inode;
-@@ -519,6 +537,7 @@ asmlinkage long sys_chmod(const char * f
-       error = notify_change(nd.dentry, &newattrs);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -588,10 +607,12 @@ asmlinkage long sys_chown(const char * f
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (!error) {
-               error = chown_common(nd.dentry, user, group);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -601,10 +622,12 @@ asmlinkage long sys_lchown(const char * 
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
--      error = user_path_walk_link(filename, &nd);
-+      error = user_path_walk_link_it(filename, &nd, &it);
-       if (!error) {
-               error = chown_common(nd.dentry, user, group);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -628,7 +651,8 @@ extern ssize_t do_readahead(struct file 
++
++      if (inode->i_op->setattr_raw) {
++              struct inode_operations *op = dentry->d_inode->i_op;
++
++              newattrs.ia_uid = user;
++              newattrs.ia_gid = group;
++              newattrs.ia_valid = ATTR_UID | ATTR_GID;
++              newattrs.ia_valid |= ATTR_RAW;
++              error = op->setattr_raw(inode, &newattrs);
++              /* the file system wants to use normal vfs path now */
++              if (error != -EOPNOTSUPP)
++                      return error;
++      }
++
+       error = -EPERM;
+       if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+               goto out;
+@@ -628,7 +710,8 @@ extern ssize_t do_readahead(struct file 
  /* for files over a certains size it doesn't pay to do readahead on open */
  #define READAHEAD_CUTOFF 48000
  
  {
        struct file * f;
        struct inode *inode;
-@@ -693,6 +717,7 @@ struct file *dentry_open(struct dentry *
+@@ -649,7 +732,7 @@ struct file *dentry_open(struct dentry *
+               error = locks_verify_locked(inode);
+               if (!error) {
+                       DQUOT_INIT(inode);
+-                      error = do_truncate(dentry, 0);
++                      error = do_truncate(dentry, 0, 1);
+               }
+               if (error || !(f->f_mode & FMODE_WRITE))
+                       put_write_access(inode);
+@@ -693,6 +776,7 @@ struct file *dentry_open(struct dentry *
                do_readahead(f, 0, (48 * 1024) >> PAGE_SHIFT);
        
  
        return f;
  
  cleanup_all:
-@@ -707,11 +732,17 @@ cleanup_all:
+@@ -707,11 +791,17 @@ cleanup_all:
  cleanup_file:
        put_filp(f);
  cleanup_dentry:
  /*
   * Find an empty file descriptor entry, and mark it busy.
   */
---- linux-2.4.18-49chaos-lustre9/fs/stat.c~vfs_intent-2.4.18-18        Wed Jan 29 12:43:32 2003
-+++ linux-2.4.18-49chaos-lustre9-root/fs/stat.c        Wed Jan 29 12:43:32 2003
-@@ -13,6 +13,7 @@
- #include <asm/uaccess.h>
-+extern void intent_release(struct dentry *de, struct lookup_intent *it);
- /*
-  * Revalidate the inode. This is required for proper NFS attribute caching.
-  */
-@@ -104,10 +105,12 @@ int vfs_stat(char *name, struct kstat *s
+--- linux-2.4.18-18.8.0-l12/fs/stat.c~vfs_intent-2.4.18-18     Wed Feb 26 16:54:17 2003
++++ linux-2.4.18-18.8.0-l12-phil/fs/stat.c     Wed Feb 26 16:54:17 2003
+@@ -104,10 +104,12 @@ int vfs_stat(char *name, struct kstat *s
  {
        struct nameidata nd;
        int error;
                path_release(&nd);
        }
        return error;
-@@ -117,10 +120,12 @@ int vfs_lstat(char *name, struct kstat *
+@@ -117,10 +119,12 @@ int vfs_lstat(char *name, struct kstat *
  {
        struct nameidata nd;
        int error;
                path_release(&nd);
        }
        return error;
---- linux-2.4.18-49chaos-lustre9/include/linux/dcache.h~vfs_intent-2.4.18-18   Wed Jan 29 12:43:32 2003
-+++ linux-2.4.18-49chaos-lustre9-root/include/linux/dcache.h   Wed Jan 29 12:43:32 2003
-@@ -6,6 +6,27 @@
+--- linux-2.4.18-18.8.0-l12/fs/exec.c~vfs_intent-2.4.18-18     Wed Feb 26 16:54:17 2003
++++ linux-2.4.18-18.8.0-l12-phil/fs/exec.c     Wed Feb 26 16:54:17 2003
+@@ -103,13 +103,18 @@ static inline void put_binfmt(struct lin
+  *
+  * Also note that we take the address to load from from the file itself.
+  */
++extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
++                          int flags, struct lookup_intent *it);
++int path_lookup_it(const char *path, unsigned flags, struct nameidata *nd,
++                 struct lookup_intent *it);
+ asmlinkage long sys_uselib(const char * library)
+ {
+       struct file * file;
+       struct nameidata nd;
+       int error;
++      struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = O_RDONLY };
+-      error = user_path_walk(library, &nd);
++      error = user_path_walk_it(library, &nd, &it);
+       if (error)
+               goto out;
+@@ -121,7 +126,8 @@ asmlinkage long sys_uselib(const char * 
+       if (error)
+               goto exit;
+-      file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++      file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it);
++      intent_release(nd.dentry, &it);
+       error = PTR_ERR(file);
+       if (IS_ERR(file))
+               goto out;
+@@ -350,8 +356,9 @@ struct file *open_exec(const char *name)
+       struct inode *inode;
+       struct file *file;
+       int err = 0;
++      struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = O_RDONLY };
+-      err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd);
++      err = path_lookup_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd, &it);
+       file = ERR_PTR(err);
+       if (!err) {
+               inode = nd.dentry->d_inode;
+@@ -363,7 +370,8 @@ struct file *open_exec(const char *name)
+                               err = -EACCES;
+                       file = ERR_PTR(err);
+                       if (!err) {
+-                              file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++                              file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it);
++                              intent_release(nd.dentry, &it);
+                               if (!IS_ERR(file)) {
+                                       err = deny_write_access(file);
+                                       if (err) {
+@@ -976,7 +984,7 @@ int do_coredump(long signr, struct pt_re
+               goto close_fail;
+       if (!file->f_op->write)
+               goto close_fail;
+-      if (do_truncate(file->f_dentry, 0) != 0)
++      if (do_truncate(file->f_dentry, 0, 0) != 0)
+               goto close_fail;
+       retval = binfmt->core_dump(signr, regs, file);
+--- linux-2.4.18-18.8.0-l12/include/linux/dcache.h~vfs_intent-2.4.18-18        Wed Feb 26 16:54:17 2003
++++ linux-2.4.18-18.8.0-l12-phil/include/linux/dcache.h        Wed Feb 26 17:01:30 2003
+@@ -6,6 +6,25 @@
  #include <asm/atomic.h>
  #include <linux/mount.h>
  
 +#define IT_CREAT    (1<<1)
 +#define IT_READDIR  (1<<2)
 +#define IT_GETATTR  (1<<3)
-+#define IT_SETATTR  (1<<4)
-+#define IT_TRUNC    (1<<5)
-+#define IT_READLINK (1<<6)
-+#define IT_LOOKUP   (1<<7)
++#define IT_LOOKUP   (1<<4)
++#define IT_UNLINK   (1<<5)
 +
 +struct lookup_intent {
 +      int it_op;
  /*
   * linux/include/linux/dcache.h
   *
-@@ -78,6 +99,7 @@ struct dentry {
+@@ -78,6 +97,7 @@ struct dentry {
        unsigned long d_time;           /* used by d_revalidate */
        struct dentry_operations  *d_op;
        struct super_block * d_sb;      /* The root of the dentry tree */
        unsigned long d_vfs_flags;
        void * d_fsdata;                /* fs-specific data */
        void * d_extra_attributes;      /* TUX-specific data */
-@@ -91,6 +113,8 @@ struct dentry_operations {
+@@ -91,8 +111,15 @@ struct dentry_operations {
        int (*d_delete)(struct dentry *);
        void (*d_release)(struct dentry *);
        void (*d_iput)(struct dentry *, struct inode *);
 +      void (*d_intent_release)(struct dentry *, struct lookup_intent *);
  };
  
++/* defined in fs/namei.c */
++extern void intent_release(struct dentry *de, struct lookup_intent *it);
++/* defined in fs/dcache.c */
++extern void __d_rehash(struct dentry * entry, int lock);
++
  /* the dentry parameter passed to d_hash and d_compare is the parent
-@@ -124,6 +148,7 @@ d_iput:            no              no              yes
+  * directory of the entries to be compared. It is used in case these
+  * functions need any directory specific information for determining
+@@ -124,6 +151,7 @@ d_iput:            no              no              yes
                                         * s_nfsd_free_path semaphore will be down
                                         */
  #define DCACHE_REFERENCED     0x0008  /* Recently used, don't discard. */
  
  extern spinlock_t dcache_lock;
  
---- linux-2.4.18-49chaos-lustre9/include/linux/fs.h~vfs_intent-2.4.18-18       Wed Jan 29 12:43:32 2003
-+++ linux-2.4.18-49chaos-lustre9-root/include/linux/fs.h       Wed Jan 29 12:43:32 2003
-@@ -576,6 +576,7 @@ struct file {
+--- linux-2.4.18-18.8.0-l12/include/linux/fs.h~vfs_intent-2.4.18-18    Wed Feb 26 16:54:17 2003
++++ linux-2.4.18-18.8.0-l12-phil/include/linux/fs.h    Wed Feb 26 17:31:42 2003
+@@ -338,6 +338,8 @@ extern void set_bh_page(struct buffer_he
+ #define ATTR_MTIME_SET        256
+ #define ATTR_FORCE    512     /* Not a change, but a change it */
+ #define ATTR_ATTR_FLAG        1024
++#define ATTR_RAW      2048    /* file system, not vfs will massage attrs */
++#define ATTR_FROM_OPEN        4096    /* called from open path, ie O_TRUNC */
+ /*
+  * This is the Inode Attributes structure, used for notify_change().  It
+@@ -576,6 +578,7 @@ struct file {
  
        /* needed for tty driver, and maybe others */
        void                    *private_data;
  
        /* preallocated helper kiobuf to speedup O_DIRECT */
        struct kiobuf           *f_iobuf;
-@@ -836,7 +837,9 @@ extern int vfs_symlink(struct inode *, s
+@@ -836,7 +839,9 @@ extern int vfs_symlink(struct inode *, s
  extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
  extern int vfs_rmdir(struct inode *, struct dentry *);
  extern int vfs_unlink(struct inode *, struct dentry *);
  
  /*
   * File types
-@@ -897,16 +900,28 @@ struct file_operations {
+@@ -897,20 +902,33 @@ struct file_operations {
  struct inode_operations {
        int (*create) (struct inode *,struct dentry *,int);
        struct dentry * (*lookup) (struct inode *,struct dentry *);
        void (*truncate) (struct inode *);
        int (*permission) (struct inode *, int);
        int (*revalidate) (struct dentry *);
-@@ -1383,6 +1398,7 @@ typedef int (*read_actor_t)(read_descrip
+       int (*setattr) (struct dentry *, struct iattr *);
++      int (*setattr_raw) (struct inode *, struct iattr *);
+       int (*getattr) (struct dentry *, struct iattr *);
+ };
+@@ -1112,7 +1130,7 @@ static inline int get_lease(struct inode
+ asmlinkage long sys_open(const char *, int, int);
+ asmlinkage long sys_close(unsigned int);      /* yes, it's really unsigned */
+-extern int do_truncate(struct dentry *, loff_t start);
++extern int do_truncate(struct dentry *, loff_t start, int called_from_open);
+ extern struct file *filp_open(const char *, int, int);
+ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
+@@ -1381,6 +1399,7 @@ typedef int (*read_actor_t)(read_descrip
  extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
  
  extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *));
  extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *));
  extern int FASTCALL(path_walk(const char *, struct nameidata *));
  extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *));
-@@ -1394,6 +1410,8 @@ extern struct dentry * lookup_one_len(co
+@@ -1392,6 +1411,8 @@ extern struct dentry * lookup_one_len(co
  extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
  #define user_path_walk(name,nd)        __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd)
  #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd)
  
  extern void inode_init_once(struct inode *);
  extern void iput(struct inode *);
-@@ -1494,6 +1512,8 @@ extern struct file_operations generic_ro
+@@ -1492,6 +1513,8 @@ extern struct file_operations generic_ro
  
  extern int vfs_readlink(struct dentry *, char *, int, const char *);
  extern int vfs_follow_link(struct nameidata *, const char *);
  extern int page_readlink(struct dentry *, char *, int);
  extern int page_follow_link(struct dentry *, struct nameidata *);
  extern struct inode_operations page_symlink_inode_operations;
---- linux-2.4.18-49chaos-lustre9/kernel/ksyms.c~vfs_intent-2.4.18-18   Wed Jan 29 12:43:32 2003
-+++ linux-2.4.18-49chaos-lustre9-root/kernel/ksyms.c   Wed Jan 29 12:43:32 2003
-@@ -294,6 +294,7 @@ EXPORT_SYMBOL(read_cache_page);
+--- linux-2.4.18-18.8.0-l12/kernel/ksyms.c~vfs_intent-2.4.18-18        Wed Feb 26 16:54:17 2003
++++ linux-2.4.18-18.8.0-l12-phil/kernel/ksyms.c        Wed Feb 26 16:54:17 2003
+@@ -293,6 +293,7 @@ EXPORT_SYMBOL(read_cache_page);
  EXPORT_SYMBOL(set_page_dirty);
  EXPORT_SYMBOL(vfs_readlink);
  EXPORT_SYMBOL(vfs_follow_link);
  EXPORT_SYMBOL(page_readlink);
  EXPORT_SYMBOL(page_follow_link);
  EXPORT_SYMBOL(page_symlink_inode_operations);
-
-_
index 63f09b3..fa0998a 100644 (file)
@@ -1,15 +1,16 @@
- fs/dcache.c            |    3 
- fs/namei.c             |  306 ++++++++++++++++++++++++++++++++++++++++---------
+ fs/dcache.c            |    8 
+ fs/namei.c             |  335 +++++++++++++++++----
  fs/nfsd/vfs.c          |    2 
- fs/open.c              |   63 +++++++---
- fs/stat.c              |   29 +++-
- include/linux/dcache.h |   31 ++++
- include/linux/fs.h     |   22 +++
+ fs/open.c              |  142 +++++++-
+ fs/stat.c              |   24 +
+ include/linux/dcache.h |   26 +
+ include/linux/fs.h     |   2+
  kernel/ksyms.c         |    1 
- 8 files changed, 384 insertions(+), 73 deletions(-)
-
---- linux-2.4.19-hp2_pnnl2/fs/dcache.c~vfs_intent_hp   Sun Jan 19 19:04:47 2003
-+++ linux-2.4.19-hp2_pnnl2-root/fs/dcache.c    Sun Jan 19 19:04:47 2003
+ fs/exec.c                   |   18 -
+ 9 files changed, 487 insertions(+), 96 deletions(-)
+--- linux-2.4.19-hp2_pnnl4/fs/dcache.c~vfs_intent_hp   Sun Jan 19 19:04:47 2003
++++ linux-2.4.19-hp2_pnnl4-root/fs/dcache.c    Sun Jan 19 19:04:47 2003
 @@ -186,6 +188,13 @@ int d_invalidate(struct dentry * dentry)
                spin_unlock(&dcache_lock);
                return 0;
        INIT_LIST_HEAD(&dentry->d_hash);
        INIT_LIST_HEAD(&dentry->d_lru);
        INIT_LIST_HEAD(&dentry->d_subdirs);
---- linux-2.4.19-hp2_pnnl2/fs/namei.c~vfs_intent_hp    Sun Jan 19 19:04:47 2003
-+++ linux-2.4.19-hp2_pnnl2-root/fs/namei.c     Sun Jan 19 19:35:55 2003
+@@ -859,13 +867,19 @@ void d_delete(struct dentry * dentry)
+  * Adds a dentry to the hash according to its name.
+  */
+  
+-void d_rehash(struct dentry * entry)
++void __d_rehash(struct dentry * entry, int lock)
+ {
+       struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash);
+       if (!list_empty(&entry->d_hash)) BUG();
+-      spin_lock(&dcache_lock);
++      if (lock) spin_lock(&dcache_lock);
+       list_add(&entry->d_hash, list);
+-      spin_unlock(&dcache_lock);
++      if (lock) spin_unlock(&dcache_lock);
++}
++EXPORT_SYMBOL(__d_rehash);
++
++void d_rehash(struct dentry * entry)
++{
++      __d_rehash(entry, 1);
+ }
+ #define do_switch(x,y) do { \
+--- linux-2.4.19-hp2_pnnl4/fs/namei.c~vfs_intent_hp    Sun Jan 19 19:04:47 2003
++++ linux-2.4.19-hp2_pnnl4-root/fs/namei.c     Sun Jan 19 19:35:55 2003
 @@ -94,6 +97,13 @@
   * XEmacs seems to be relying on it...
   */
        path_release(nd);
        return -ELOOP;
  }
+@@ -381,15 +416,26 @@ int follow_up(struct vfsmount **mnt, str
+       return __follow_up(mnt, dentry);
+ }
+-static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry)
++static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry,
++                              struct lookup_intent *it)
+ {
+       struct vfsmount *mounted;
+       spin_lock(&dcache_lock);
+       mounted = lookup_mnt(*mnt, *dentry);
+       if (mounted) {
++              int opc = 0, mode = 0;
+               *mnt = mntget(mounted);
+               spin_unlock(&dcache_lock);
++              if (it) {
++                      opc = it->it_op;
++                      mode = it->it_mode;
++              }
++              intent_release(*dentry, it);
++              if (it) {
++                      it->it_op = opc;
++                      it->it_mode = mode;
++              }
+               dput(*dentry);
+               mntput(mounted->mnt_parent);
+               *dentry = dget(mounted->mnt_root);
+@@ -401,7 +447,7 @@ static inline int __follow_down(struct v
+ int follow_down(struct vfsmount **mnt, struct dentry **dentry)
+ {
+-      return __follow_down(mnt,dentry);
++      return __follow_down(mnt,dentry,NULL);
+ }
+  
+ static inline void follow_dotdot(struct nameidata *nd)
+@@ -437,7 +483,7 @@ static inline void follow_dotdot(struct 
+               mntput(nd->mnt);
+               nd->mnt = parent;
+       }
+-      while (d_mountpoint(nd->dentry) && __follow_down(&nd->mnt, &nd->dentry))
++      while (d_mountpoint(nd->dentry) && __follow_down(&nd->mnt, &nd->dentry, NULL))
+               ;
+ }
 @@ -447,7 +482,8 @@ static inline void follow_dotdot(struct 
   *
   * We expect 'base' to be positive and a directory.
  {
        struct dentry *dentry;
        struct inode *inode;
-@@ -520,9 +556,9 @@ int link_path_walk(const char * name, st
+@@ -520,15 +556,15 @@ int link_path_walk(const char * name, st
                                break;
                }
                /* This does the actual lookups.. */
                        err = PTR_ERR(dentry);
                        if (IS_ERR(dentry))
                                break;
+               }
+               /* Check mountpoints.. */
+-              while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry))
++              while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry, NULL))
+                       ;
+               err = -ENOENT;
 @@ -539,8 +575,8 @@ int link_path_walk(const char * name, st
                if (!inode->i_op)
                        goto out_dput;
                        break;
                continue;
                /* here ends the main loop */
-@@ -583,9 +619,9 @@ last_component:
+@@ -583,19 +619,20 @@ last_component:
                        if (err < 0)
                                break;
                }
                        err = PTR_ERR(dentry);
                        if (IS_ERR(dentry))
                                break;
-@@ -594,8 +630,9 @@ last_component:
+               }
+-              while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry))
++              while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry, it))
                        ;
                inode = dentry->d_inode;
                if ((lookup_flags & LOOKUP_FOLLOW)
        /* Negative dentry, just create the file */
        if (!dentry->d_inode) {
                if (!IS_POSIXACL(dir->d_inode))
-@@ -1071,7 +1156,8 @@ do_last:
+@@ -1066,12 +1151,13 @@ do_last:
+               error = -ELOOP;
+               if (flag & O_NOFOLLOW)
+                       goto exit_dput;
+-              while (__follow_down(&nd->mnt,&dentry) && d_mountpoint(dentry));
++              while (__follow_down(&nd->mnt,&dentry,it) && d_mountpoint(dentry));
+       }
        error = -ENOENT;
        if (!dentry->d_inode)
                goto exit_dput;
                goto do_link;
  
        dput(nd->dentry);
+@@ -1145,7 +1231,7 @@ do_last:
+               if (!error) {
+                       DQUOT_INIT(inode);
+                       
+-                      error = do_truncate(dentry, 0);
++                      error = do_truncate(dentry, 0, 1);
+               }
+               put_write_access(inode);
+               if (error)
 @@ -1157,8 +1243,10 @@ ok:
        return 0;
  
 +                                 nd.last.name,
 +                                 nd.last.len,
 +                                 mode, dev);
-+              /* the file system want to use normal vfs path now */
++              /* the file system wants to use normal vfs path now */
 +              if (error != -EOPNOTSUPP)
 +                      goto out2;
 +      }
 +                                         nd.last.name,
 +                                         nd.last.len,
 +                                         mode);
-+                      /* the file system want to use normal vfs path now */
++                      /* the file system wants to use normal vfs path now */
 +                      if (error != -EOPNOTSUPP)
 +                              goto out2;
 +              }
                path_release(&nd);
  out:
                putname(tmp);
-@@ -1450,8 +1578,17 @@ asmlinkage long sys_rmdir(const char * p
+@@ -1450,8 +1578,33 @@ asmlinkage long sys_rmdir(const char * p
                        error = -EBUSY;
                        goto exit1;
        }
 +      if (nd.dentry->d_inode->i_op->rmdir2) {
 +              struct inode_operations *op = nd.dentry->d_inode->i_op;
++              struct dentry *last;
++
++              down(&nd.dentry->d_inode->i_sem);
++              last = lookup_hash_it(&nd.last, nd.dentry, NULL);
++              up(&nd.dentry->d_inode->i_sem);
++              if (IS_ERR(last)) {
++                      error = PTR_ERR(last);
++                      goto exit1;
++              }
++              if (d_mountpoint(last)) {
++                      dput(last);
++                      error = -EBUSY;
++                      goto exit1;
++              }
++              dput(last);
++
 +              error = op->rmdir2(nd.dentry->d_inode,
 +                                 nd.last.name,
 +                                 nd.last.len);
-+              /* the file system want to use normal vfs path now */
++              /* the file system wants to use normal vfs path now */
 +              if (error != -EOPNOTSUPP)
 +                      goto exit1;
 +      }
 +              error = op->unlink2(nd.dentry->d_inode,
 +                                  nd.last.name,
 +                                  nd.last.len);
-+              /* the file system want to use normal vfs path now */
++              /* the file system wants to use normal vfs path now */
 +              if (error != -EOPNOTSUPP)
 +                      goto exit1;
 +      }
                if (error)
                        goto out;
 -              dentry = lookup_create(&nd, 0);
-+              if (nd.dentry->d_inode->i_op->symlink2) {
++              if (nd.dentry->d_inode->i_op->symlink2) {
 +                      struct inode_operations *op = nd.dentry->d_inode->i_op;
 +                      error = op->symlink2(nd.dentry->d_inode,
 +                                           nd.last.name,
 +                                           nd.last.len,
 +                                           from);
-+                      /* the file system want to use normal vfs path now */
++                      /* the file system wants to use normal vfs path now */
 +                      if (error != -EOPNOTSUPP)
 +                              goto out2;
 +              }
 +                                        nd.dentry->d_inode,
 +                                        nd.last.name,
 +                                        nd.last.len);
-+                      /* the file system want to use normal vfs path now */
++                      /* the file system wants to use normal vfs path now */
 +                      if (error != -EOPNOTSUPP)
 +                              goto out_release;
 +              }
        if (!error) {
                if (old_dir == new_dir)
                        inode_dir_notify(old_dir, DN_RENAME);
-@@ -1886,9 +2068,23 @@ static inline int do_rename(const char *
-       if (newnd.last_type != LAST_NORM)
-               goto exit2;
+@@ -1888,7 +2070,7 @@ static inline int do_rename(const char *
  
-+      if (old_dir->d_inode->i_op->rename2) {
-+              lock_kernel();
-+              error = old_dir->d_inode->i_op->rename2(old_dir->d_inode,
-+                                                      new_dir->d_inode,
-+                                                      oldnd.last.name,
-+                                                      oldnd.last.len,
-+                                                      newnd.last.name,
-+                                                      newnd.last.len);
-+              unlock_kernel();
-+              /* the file system want to use normal vfs path now */
-+              if (error != -EOPNOTSUPP)
-+                      goto exit2;
-+      }
-+
        double_lock(new_dir, old_dir);
  
 -      old_dentry = lookup_hash(&oldnd.last, old_dir);
        error = PTR_ERR(old_dentry);
        if (IS_ERR(old_dentry))
                goto exit3;
-@@ -1904,14 +2100,14 @@ static inline int do_rename(const char *
+@@ -1904,16 +2086,37 @@ static inline int do_rename(const char *
                if (newnd.last.name[newnd.last.len])
                        goto exit4;
        }
        if (IS_ERR(new_dentry))
                goto exit4;
  
++      if (old_dir->d_inode->i_op->rename2) {
++              lock_kernel();
++              /* don't rename a mount point; mds will take care of the
++               * remaining sanity checks.  Drop the BKL before bailing. */
++              if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) {
++                      unlock_kernel();
++                      error = -EBUSY;
++                      goto exit5;
++              }
++              error = old_dir->d_inode->i_op->rename2(old_dir->d_inode,
++                                                      new_dir->d_inode,
++                                                      oldnd.last.name,
++                                                      oldnd.last.len,
++                                                      newnd.last.name,
++                                                      newnd.last.len);
++              unlock_kernel();
++              /* -EOPNOTSUPP: fs wants the normal vfs_rename() path below */
++              if (error != -EOPNOTSUPP)
++                      goto exit5;
++      }
++
        lock_kernel();
        error = vfs_rename(old_dir->d_inode, old_dentry,
 -                                 new_dir->d_inode, new_dentry);
 +                                 new_dir->d_inode, new_dentry, NULL);
        unlock_kernel();
+-
++exit5:
        dput(new_dentry);
+ exit4:
+       dput(old_dentry);
 @@ -1964,7 +2163,8 @@ out:
  }
  
        if (page) {
                kunmap(page);
                page_cache_release(page);
---- linux-2.4.19-hp2_pnnl2/fs/nfsd/vfs.c~vfs_intent_hp Sun Jan 19 19:04:47 2003
-+++ linux-2.4.19-hp2_pnnl2-root/fs/nfsd/vfs.c  Sun Jan 19 19:37:57 2003
+--- linux-2.4.19-hp2_pnnl4/fs/nfsd/vfs.c~vfs_intent_hp Sun Jan 19 19:04:47 2003
++++ linux-2.4.19-hp2_pnnl4-root/fs/nfsd/vfs.c  Sun Jan 19 19:37:57 2003
 @@ -1295,7 +1295,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru
                        err = nfserr_perm;
        } else
        if (!err && EX_ISSYNC(tfhp->fh_export)) {
                nfsd_sync_dir(tdentry);
                nfsd_sync_dir(fdentry);
---- linux-2.4.19-hp2_pnnl2/fs/open.c~vfs_intent_hp     Sun Jan 19 19:04:47 2003
-+++ linux-2.4.19-hp2_pnnl2-root/fs/open.c      Sun Jan 19 19:41:00 2003
-@@ -19,6 +19,9 @@
+--- linux-2.4.19-hp2_pnnl4/fs/open.c~vfs_intent_hp     Sun Jan 19 19:04:47 2003
++++ linux-2.4.19-hp2_pnnl4-root/fs/open.c      Sun Jan 19 19:41:00 2003
+@@ -19,6 +19,8 @@
  #include <asm/uaccess.h>
  
  #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
 +extern int path_walk_it(const char *name, struct nameidata *nd,
 +                      struct lookup_intent *it);
-+extern void intent_release(struct dentry *de, struct lookup_intent *it);
  
  int vfs_statfs(struct super_block *sb, struct statfs *buf)
  {
+@@ -95,9 +97,10 @@ void fd_install(unsigned int fd, struct 
+       write_unlock(&files->file_lock);
+ }
+-int do_truncate(struct dentry *dentry, loff_t length)
++int do_truncate(struct dentry *dentry, loff_t length, int called_from_open)
+ {
+       struct inode *inode = dentry->d_inode;
++      struct inode_operations *op = dentry->d_inode->i_op;
+       int error;
+       struct iattr newattrs;
+@@ -108,7 +111,14 @@ int do_truncate(struct dentry *dentry, l
+       down(&inode->i_sem);
+       newattrs.ia_size = length;
+       newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
+-      error = notify_change(dentry, &newattrs);
++      if (called_from_open)
++              newattrs.ia_valid |= ATTR_FROM_OPEN;
++      if (op->setattr_raw) {
++              newattrs.ia_valid |= ATTR_RAW;
++              newattrs.ia_ctime = CURRENT_TIME;
++              error = op->setattr_raw(inode, &newattrs);
++      } else 
++              error = notify_change(dentry, &newattrs);
+       up(&inode->i_sem);
+       return error;
+ }
 @@ -118,12 +121,13 @@ static inline long do_sys_truncate(const
        struct nameidata nd;
        struct inode * inode;
        int error;
-+      struct lookup_intent it = { .it_op = IT_TRUNC };
++      struct lookup_intent it = { .it_op = IT_GETATTR };
  
        error = -EINVAL;
        if (length < 0) /* sorry, but loff_t says... */
        if (error)
                goto out;
        inode = nd.dentry->d_inode;
-@@ -168,6 +172,7 @@ static inline long do_sys_truncate(const
+@@ -163,11 +167,13 @@ static inline long do_sys_truncate(const
+       error = locks_verify_truncate(inode, NULL, length);
+       if (!error) {
+               DQUOT_INIT(inode);
+-              error = do_truncate(nd.dentry, length);
++              intent_release(nd.dentry, &it);
++              error = do_truncate(nd.dentry, length, 0);
+       }
        put_write_access(inode);
  
  dput_and_out:
        path_release(&nd);
  out:
        return error;
-@@ -259,8 +264,9 @@ asmlinkage long sys_utime(char * filenam
-       struct nameidata nd;
+@@ -215,7 +228,7 @@ static inline long do_sys_ftruncate(unsi
+       error = locks_verify_truncate(inode, file, length);
+       if (!error)
+-              error = do_truncate(dentry, length);
++              error = do_truncate(dentry, length, 0);
+ out_putf:
+       fput(file);
+ out:
+@@ -260,11 +273,13 @@ asmlinkage long sys_utime(char * filenam
        struct inode * inode;
        struct iattr newattrs;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
  
 -      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
++      error = user_path_walk_it(filename, &nd, NULL);
        if (error)
                goto out;
        inode = nd.dentry->d_inode;
-@@ -286,6 +292,7 @@ asmlinkage long sys_utime(char * filenam
++      /* this is safe without a Lustre lock because it only depends
++         on the super block */
+       error = -EROFS;
+       if (IS_RDONLY(inode))
+               goto dput_and_out;
+@@ -279,11 +294,29 @@ asmlinkage long sys_utime(char * filenam
+                       goto dput_and_out;
+               newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET;
+-      } else {
++      }
++
++      if (inode->i_op->setattr_raw) {
++              struct inode_operations *op = nd.dentry->d_inode->i_op;
++
++              newattrs.ia_valid |= ATTR_RAW;
++              error = op->setattr_raw(inode, &newattrs);
++              /* the file system wants to use normal vfs path now */
++              if (error != -EOPNOTSUPP)
++                      goto dput_and_out;
++      }
++
++      error = -EROFS;
++      if (IS_RDONLY(inode))
++              goto dput_and_out;
++
++      error = -EPERM;
++      if (!times) {
+               if (current->fsuid != inode->i_uid &&
+                   (error = permission(inode,MAY_WRITE)) != 0)
+                       goto dput_and_out;
        }
++
        error = notify_change(nd.dentry, &newattrs);
  dput_and_out:
-+      intent_release(nd.dentry, &it);
        path_release(&nd);
- out:
-       return error;
-@@ -303,8 +310,9 @@ asmlinkage long sys_utimes(char * filena
-       struct nameidata nd;
+@@ -304,12 +337,14 @@ asmlinkage long sys_utimes(char * filena
        struct inode * inode;
        struct iattr newattrs;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
  
 -      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
++      error = user_path_walk_it(filename, &nd, NULL);
  
        if (error)
                goto out;
-@@ -331,6 +339,7 @@ asmlinkage long sys_utimes(char * filena
-       }
-       error = notify_change(nd.dentry, &newattrs);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
+       inode = nd.dentry->d_inode;
++      /* this is safe without a Lustre lock because it only depends
++         on the super block */
+       error = -EROFS;
+       if (IS_RDONLY(inode))
+               goto dput_and_out;
+@@ -324,7 +359,20 @@ asmlinkage long sys_utimes(char * filena
+               newattrs.ia_atime = times[0].tv_sec;
+               newattrs.ia_mtime = times[1].tv_sec;
+               newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET;
+-      } else {
++      }
++
++      if (inode->i_op->setattr_raw) {
++              struct inode_operations *op = nd.dentry->d_inode->i_op;
++
++              newattrs.ia_valid |= ATTR_RAW;
++              error = op->setattr_raw(inode, &newattrs);
++              /* the file system wants to use normal vfs path now */
++              if (error != -EOPNOTSUPP)
++                      goto dput_and_out;
++      }
++
++      error = -EPERM;
++      if (!utimes) {
+               if (current->fsuid != inode->i_uid &&
+                   (error = permission(inode,MAY_WRITE)) != 0)
+                       goto dput_and_out;
 @@ -347,6 +356,7 @@ asmlinkage long sys_access(const char * 
        int old_fsuid, old_fsgid;
        kernel_cap_t old_cap;
        path_release(&nd);
  out:
        return error;
-@@ -515,8 +530,9 @@ asmlinkage long sys_chmod(const char * f
-       struct inode * inode;
-       int error;
-       struct iattr newattrs;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
+@@ -508,6 +564,18 @@ asmlinkage long sys_chmod(const char * f
+       if (IS_RDONLY(inode))
+               goto dput_and_out;
  
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (error)
++      if (inode->i_op->setattr_raw) {
++              struct inode_operations *op = nd.dentry->d_inode->i_op;
++
++              newattrs.ia_mode = mode;
++              newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
++              newattrs.ia_valid |= ATTR_RAW;
++              error = op->setattr_raw(inode, &newattrs);
++              /* the file system wants to use normal vfs path now */
++              if (error != -EOPNOTSUPP)
++                      goto dput_and_out;
++      }
++
+       error = -EPERM;
+       if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+               goto dput_and_out;
+@@ -538,6 +606,20 @@ static int chown_common(struct dentry * 
+       error = -EROFS;
+       if (IS_RDONLY(inode))
+               goto out;
++
++      if (inode->i_op->setattr_raw) {
++              struct inode_operations *op = dentry->d_inode->i_op;
++
++              newattrs.ia_uid = user;
++              newattrs.ia_gid = group;
++              newattrs.ia_valid = ATTR_UID | ATTR_GID;
++              newattrs.ia_valid |= ATTR_RAW;
++              error = op->setattr_raw(inode, &newattrs);
++              /* the file system wants to use normal vfs path now */
++              if (error != -EOPNOTSUPP)
++                      return error;
++      }
++
+       error = -EPERM;
+       if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
                goto out;
-       inode = nd.dentry->d_inode;
-@@ -536,6 +552,7 @@ asmlinkage long sys_chmod(const char * f
-       error = notify_change(nd.dentry, &newattrs);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -605,10 +622,12 @@ asmlinkage long sys_chown(const char * f
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (!error) {
-               error = chown_common(nd.dentry, user, group);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -618,10 +637,12 @@ asmlinkage long sys_lchown(const char * 
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
--      error = user_path_walk_link(filename, &nd);
-+      error = user_path_walk_link_it(filename, &nd, &it);
-       if (!error) {
-               error = chown_common(nd.dentry, user, group);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
 @@ -655,10 +676,16 @@ asmlinkage long sys_fchown(unsigned int 
   * for the internal routines (ie open_namei()/follow_link() etc). 00 is
   * used by symlinks.
  /*
   * Find an empty file descriptor entry, and mark it busy.
   */
---- linux-2.4.19-hp2_pnnl2/fs/stat.c~vfs_intent_hp     Sun Jan 19 19:04:47 2003
-+++ linux-2.4.19-hp2_pnnl2-root/fs/stat.c      Sun Jan 19 19:44:51 2003
-@@ -13,6 +13,7 @@
- #include <asm/uaccess.h>
-+extern void intent_release(struct dentry *de, struct lookup_intent *it);
- /*
-  * Revalidate the inode. This is required for proper NFS attribute caching.
-  */
+--- linux-2.4.19-hp2_pnnl4/fs/stat.c~vfs_intent_hp     Sun Jan 19 19:04:47 2003
++++ linux-2.4.19-hp2_pnnl4-root/fs/stat.c      Sun Jan 19 19:44:51 2003
 @@ -135,13 +136,15 @@ static int cp_new_stat(struct inode * in
  asmlinkage long sys_stat(char * filename, struct __old_kernel_stat * statbuf)
  {
                path_release(&nd);
        }
        return error;
-@@ -247,11 +256,12 @@ asmlinkage long sys_readlink(const char 
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_READLINK };
-       if (bufsiz <= 0)
-               return -EINVAL;
--      error = user_path_walk_link(path, &nd);
-+      error = user_path_walk_link_it(path, &nd, &it);
-       if (!error) {
-               struct inode * inode = nd.dentry->d_inode;
-@@ -261,6 +271,7 @@ asmlinkage long sys_readlink(const char 
-                       UPDATE_ATIME(inode);
-                       error = inode->i_op->readlink(nd.dentry, buf, bufsiz);
-               }
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
 @@ -333,12 +344,14 @@ asmlinkage long sys_stat64(char * filena
  {
        struct nameidata nd;
                path_release(&nd);
        }
        return error;
---- linux-2.4.19-hp2_pnnl2/include/linux/dcache.h~vfs_intent_hp        Sun Jan 19 19:04:47 2003
-+++ linux-2.4.19-hp2_pnnl2-root/include/linux/dcache.h Sun Jan 19 19:04:48 2003
-@@ -6,6 +6,27 @@
+--- linux-2.4.19-hp2_pnnl4/fs/exec.c~vfs_intent_hp     Sun Feb  9 01:14:52 2003
++++ linux-2.4.19-hp2_pnnl4-root/fs/exec.c      Sun Feb  9 01:29:49 2003
+@@ -103,13 +104,18 @@ static inline void put_binfmt(struct lin
+  *
+  * Also note that we take the address to load from from the file itself.
+  */
++extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
++                          int flags, struct lookup_intent *it);
++int path_lookup_it(const char *path, unsigned flags, struct nameidata *nd,
++                 struct lookup_intent *it);
+ asmlinkage long sys_uselib(const char * library)
+ {
+       struct file * file;
+       struct nameidata nd;
+       int error;
++      struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = O_RDONLY };
+-      error = user_path_walk(library, &nd);
++      error = user_path_walk_it(library, &nd, &it);
+       if (error)
+               goto out;
+@@ -121,7 +127,8 @@ asmlinkage long sys_uselib(const char * 
+       if (error)
+               goto exit;
+-      file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++      file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it);
++      intent_release(nd.dentry, &it);
+       error = PTR_ERR(file);
+       if (IS_ERR(file))
+               goto out;
+@@ -350,9 +350,10 @@ struct file *open_exec(const char *name)
+       struct inode *inode;
+       struct file *file;
+       int err = 0;
++      struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = O_RDONLY };
+       if (path_init(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
+-              err = path_walk(name, &nd);
++              err = path_walk_it(name, &nd, &it);
+       file = ERR_PTR(err);
+       if (!err) {
+               inode = nd.dentry->d_inode;
+@@ -363,7 +369,8 @@ struct file *open_exec(const char *name)
+                               err = -EACCES;
+                       file = ERR_PTR(err);
+                       if (!err) {
+-                              file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++                              file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it);
++                              intent_release(nd.dentry, &it);
+                               if (!IS_ERR(file)) {
+                                       err = deny_write_access(file);
+                                       if (err) {
+@@ -976,7 +986,7 @@ int do_coredump(long signr, struct pt_re
+               goto close_fail;
+       if (!file->f_op->write)
+               goto close_fail;
+-      if (do_truncate(file->f_dentry, 0) != 0)
++      if (do_truncate(file->f_dentry, 0, 0) != 0)
+               goto close_fail;
+       retval = binfmt->core_dump(signr, regs, file);
+--- linux-2.4.19-hp2_pnnl4/include/linux/dcache.h~vfs_intent_hp        Sun Jan 19 19:04:47 2003
++++ linux-2.4.19-hp2_pnnl4-root/include/linux/dcache.h Sun Jan 19 19:04:48 2003
+@@ -6,6 +6,25 @@
  #include <asm/atomic.h>
  #include <linux/mount.h>
  
 +#define IT_CREAT    (1<<1)
 +#define IT_READDIR  (1<<2)
 +#define IT_GETATTR  (1<<3)
-+#define IT_SETATTR  (1<<4)
-+#define IT_TRUNC    (1<<5)
-+#define IT_READLINK (1<<6)
-+#define IT_LOOKUP   (1<<7)
++#define IT_LOOKUP   (1<<4)
++#define IT_UNLINK   (1<<5)
 +
 +struct lookup_intent {
 +      int it_op;
        unsigned long d_vfs_flags;
        void * d_fsdata;                /* fs-specific data */
        unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */
-@@ -90,6 +119,8 @@ struct dentry_operations {
+@@ -90,8 +119,15 @@ struct dentry_operations {
        int (*d_delete)(struct dentry *);
        void (*d_release)(struct dentry *);
        void (*d_iput)(struct dentry *, struct inode *);
 +      void (*d_intent_release)(struct dentry *, struct lookup_intent *);
  };
  
++/* defined in fs/namei.c */
++extern void intent_release(struct dentry *de, struct lookup_intent *it);
++/* defined in fs/dcache.c */
++extern void __d_rehash(struct dentry * entry, int lock);
++
  /* the dentry parameter passed to d_hash and d_compare is the parent
-@@ -124,6 +148,7 @@ d_iput:            no              no              yes
+  * directory of the entries to be compared. It is used in case these
+  * functions need any directory specific information for determining
+@@ -124,6 +149,7 @@ d_iput:            no              no              yes
                                         * s_nfsd_free_path semaphore will be down
                                         */
  #define DCACHE_REFERENCED     0x0008  /* Recently used, don't discard. */
  
  extern spinlock_t dcache_lock;
  
---- linux-2.4.19-hp2_pnnl2/include/linux/fs.h~vfs_intent_hp    Sun Jan 19 19:04:47 2003
-+++ linux-2.4.19-hp2_pnnl2-root/include/linux/fs.h     Sun Jan 19 19:04:48 2003
+--- linux-2.4.19-hp2_pnnl4/include/linux/fs.h~vfs_intent_hp    Sun Jan 19 19:04:47 2003
++++ linux-2.4.19-hp2_pnnl4-root/include/linux/fs.h     Sun Jan 19 19:04:48 2003
+@@ -338,6 +338,8 @@ extern void set_bh_page(struct buffer_he
+ #define ATTR_MTIME_SET        256
+ #define ATTR_FORCE    512     /* Not a change, but a change it */
+ #define ATTR_ATTR_FLAG        1024
++#define ATTR_RAW      2048    /* file system, not vfs will massage attrs */
++#define ATTR_FROM_OPEN        4096    /* called from open path, ie O_TRUNC */
+ /*
+  * This is the Inode Attributes structure, used for notify_change().  It
 @@ -575,6 +575,7 @@ struct file {
  
        /* needed for tty driver, and maybe others */
  
  /*
   * File types
-@@ -876,16 +879,28 @@ struct file_operations {
+@@ -876,20 +879,33 @@ struct file_operations {
  struct inode_operations {
        int (*create) (struct inode *,struct dentry *,int);
        struct dentry * (*lookup) (struct inode *,struct dentry *);
        void (*truncate) (struct inode *);
        int (*permission) (struct inode *, int);
        int (*revalidate) (struct dentry *);
+       int (*setattr) (struct dentry *, struct iattr *);
++      int (*setattr_raw) (struct inode *, struct iattr *);
+       int (*getattr) (struct dentry *, struct iattr *);
+       int (*setxattr) (struct dentry *, const char *, void *, size_t, int);
+       ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
+@@ -1112,7 +1130,7 @@ static inline int get_lease(struct inode
+ asmlinkage long sys_open(const char *, int, int);
+ asmlinkage long sys_close(unsigned int);      /* yes, it's really unsigned */
+-extern int do_truncate(struct dentry *, loff_t start);
++extern int do_truncate(struct dentry *, loff_t start, int called_from_open);
+ extern struct file *filp_open(const char *, int, int);
+ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
 @@ -1354,6 +1369,7 @@ typedef int (*read_actor_t)(read_descrip
  extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
  
  extern int page_readlink(struct dentry *, char *, int);
  extern int page_follow_link(struct dentry *, struct nameidata *);
  extern struct inode_operations page_symlink_inode_operations;
---- linux-2.4.19-hp2_pnnl2/kernel/ksyms.c~vfs_intent_hp        Sun Jan 19 19:04:47 2003
-+++ linux-2.4.19-hp2_pnnl2-root/kernel/ksyms.c Sun Jan 19 19:04:48 2003
+--- linux-2.4.19-hp2_pnnl4/kernel/ksyms.c~vfs_intent_hp        Sun Jan 19 19:04:47 2003
++++ linux-2.4.19-hp2_pnnl4-root/kernel/ksyms.c Sun Jan 19 19:04:48 2003
 @@ -293,6 +293,7 @@ EXPORT_SYMBOL(read_cache_page);
  EXPORT_SYMBOL(set_page_dirty);
  EXPORT_SYMBOL(vfs_readlink);
  EXPORT_SYMBOL(page_readlink);
  EXPORT_SYMBOL(page_follow_link);
  EXPORT_SYMBOL(page_symlink_inode_operations);
-
-_
diff --git a/lustre/kernel_patches/pc/lustre-2.5.pc b/lustre/kernel_patches/pc/lustre-2.5.pc
new file mode 100644 (file)
index 0000000..71434ea
--- /dev/null
@@ -0,0 +1,11 @@
+arch/um/kernel/mem.c
+fs/namei.c
+fs/nfsd/vfs.c
+fs/sysfs/inode.c
+include/linux/dcache.h
+include/linux/fs.h
+include/linux/namei.h
+include/linux/slab.h
+kernel/ksyms.c
+mm/slab.c
+net/unix/af_unix.c
index 881576c..dd2b1c8 100644 (file)
@@ -3,6 +3,7 @@ fs/namei.c
 fs/nfsd/vfs.c
 fs/open.c
 fs/stat.c
+fs/exec.c
 include/linux/dcache.h
 include/linux/fs.h
 kernel/ksyms.c
diff --git a/lustre/kernel_patches/series/vanilla-2.5 b/lustre/kernel_patches/series/vanilla-2.5
new file mode 100644 (file)
index 0000000..3269420
--- /dev/null
@@ -0,0 +1,2 @@
+lustre_version.patch
+lustre-2.5.patch
index 4a5e662..b7af3d9 100644 (file)
@@ -6,3 +6,5 @@ series/hp-pnnl              ** Note: functionally equivalent to 2.4.19
    linux-2.4.18-hp2_pnnl2
 series/vanilla-2.4.19       ** Not officially supported
    linux-2.4.19
+series/lin-2.5.44
+   uml-2.5.44
index a76ff4a..d0c4199 100644 (file)
@@ -4,12 +4,21 @@
 # See the file COPYING in this distribution
 
 DEFS= 
+
+LDLMSOURCES= l_lock.c ldlm_lock.c ldlm_resource.c  \
+ldlm_extent.c ldlm_request.c ldlm_lockd.c
+
+if LIBLUSTRE
+lib_LIBRARIES = libldlm.a
+libldlm_a_SOURCES = $(LDLMSOURCES)
+else
 MODULE = ldlm
 modulefs_DATA = ldlm.o
 EXTRA_PROGRAMS = ldlm
 
-ldlm_SOURCES = l_lock.c ldlm_lock.c ldlm_resource.c ldlm_lockd.c \
-ldlm_extent.c ldlm_request.c
+ldlm_SOURCES = $(LDLMSOURCES)
+endif
 
 include $(top_srcdir)/Rules
 
+
index e8ffd5b..c439eed 100644 (file)
@@ -20,6 +20,8 @@
  *
  */
 
+#define DEBUG_SUBSYSTEM S_LDLM
+#ifdef __KERNEL__
 #include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
+#else 
+#include <liblustre.h>
+#endif
 
-#define DEBUG_SUBSYSTEM S_LDLM
-
+#include <linux/lustre_dlm.h>
 #include <linux/obd_class.h>
 #include <linux/lustre_lib.h>
 
index 5a84909..9b10854 100644 (file)
@@ -1,19 +1,34 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  *
- * Copyright (C) 2002 Cluster File Systems, Inc.
+ *  Copyright (c) 2002 Cluster File Systems, Inc.
+ *   Author: Peter Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
  *
- * This code is issued under the GNU General Public License.
- * See the file COPYING in this distribution
+ *   This file is part of Lustre, http://www.lustre.org.
  *
- * by Cluster File Systems, Inc.
- * authors, Peter Braam <braam@clusterfs.com> & 
- *          Phil Schwan <phil@clusterfs.com>
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #define DEBUG_SUBSYSTEM S_LDLM
+#ifndef __KERNEL__
+# include <liblustre.h>
+#endif
 
 #include <linux/lustre_dlm.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_lib.h>
 
 /* This function will be called to judge if the granted queue of another child
  * (read: another extent) is conflicting and needs its granted queue walked to
@@ -47,17 +62,17 @@ static void policy_internal(struct list_head *queue, struct ldlm_extent *req_ex,
                 struct ldlm_lock *lock;
                 lock = list_entry(tmp, struct ldlm_lock, l_res_link);
 
-                if (lock->l_extent.end < req_ex->start)
+                if (lock->l_extent.end < req_ex->start) {
                         new_ex->start = MIN(lock->l_extent.end, new_ex->start);
-                else {
+                } else {
                         if (lock->l_extent.start < req_ex->start &&
                             !lockmode_compat(lock->l_req_mode, mode))
                                 /* Policy: minimize conflict overlap */
                                 new_ex->start = req_ex->start;
                 }
-                if (lock->l_extent.start > req_ex->end)
+                if (lock->l_extent.start > req_ex->end) {
                         new_ex->end = MAX(lock->l_extent.start, new_ex->end);
-                else {
+                } else {
                         if (lock->l_extent.end > req_ex->end &&
                             !lockmode_compat(lock->l_req_mode, mode))
                                 /* Policy: minimize conflict overlap */
index e08ddb4..81cc428 100644 (file)
 
 #define DEBUG_SUBSYSTEM S_LDLM
 
+#ifdef __KERNEL__
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/lustre_dlm.h>
 #include <linux/lustre_mds.h>
+#else
+#include <liblustre.h>
+#include <linux/kp30.h>
+#endif
+
 #include <linux/obd_class.h>
 
 //struct lustre_lock ldlm_everything_lock;
@@ -46,6 +52,7 @@ char *ldlm_typename[] = {
         [LDLM_EXTENT] "EXT",
 };
 
+#ifdef __KERNEL__
 char *ldlm_it2str(int it)
 {
         switch (it) {
@@ -59,10 +66,6 @@ char *ldlm_it2str(int it)
                 return "readdir";
         case IT_GETATTR:
                 return "getattr";
-        case IT_TRUNC:
-                return "truncate";
-        case IT_SETATTR:
-                return "setattr";
         case IT_LOOKUP:
                 return "lookup";
         case IT_UNLINK:
@@ -72,13 +75,14 @@ char *ldlm_it2str(int it)
                 return "UNKNOWN";
         }
 }
+#endif
 
 extern kmem_cache_t *ldlm_lock_slab;
 struct lustre_lock ldlm_handle_lock;
 
 static int ldlm_plain_compat(struct ldlm_lock *a, struct ldlm_lock *b);
 
-ldlm_res_compat ldlm_res_compat_table[] = {
+static ldlm_res_compat ldlm_res_compat_table[] = {
         [LDLM_PLAIN] ldlm_plain_compat,
         [LDLM_EXTENT] ldlm_extent_compat,
 };
@@ -97,7 +101,7 @@ static int ldlm_plain_policy(struct ldlm_namespace *ns, struct ldlm_lock **lock,
         return ELDLM_OK;
 }
 
-ldlm_res_policy ldlm_res_policy_table[] = {
+static ldlm_res_policy ldlm_res_policy_table[] = {
         [LDLM_PLAIN] ldlm_plain_policy,
         [LDLM_EXTENT] ldlm_extent_policy,
 };
@@ -136,7 +140,7 @@ void ldlm_lock_put(struct ldlm_lock *lock)
 
         if (atomic_dec_and_test(&lock->l_refc)) {
                 l_lock(&ns->ns_lock);
-                LDLM_DEBUG0(lock, "final lock_put on destroyed lock, freeing");
+                LDLM_DEBUG(lock, "final lock_put on destroyed lock, freeing");
                 LASSERT(lock->l_destroyed);
                 LASSERT(list_empty(&lock->l_res_link));
 
@@ -207,7 +211,7 @@ void ldlm_lock_destroy(struct ldlm_lock *lock)
 
         list_del_init(&lock->l_export_chain);
         ldlm_lock_remove_from_lru(lock);
-        portals_handle_unhash(&lock->l_handle);
+        class_handle_unhash(&lock->l_handle);
 
 #if 0
         /* Wake anyone waiting for this lock */
@@ -270,7 +274,7 @@ static struct ldlm_lock *ldlm_lock_new(struct ldlm_lock *parent,
         }
 
         INIT_LIST_HEAD(&lock->l_handle.h_link);
-        portals_handle_hash(&lock->l_handle, lock_handle_addref);
+        class_handle_hash(&lock->l_handle, lock_handle_addref);
 
         RETURN(lock);
 }
@@ -314,7 +318,7 @@ int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock,
 
 void ldlm_lock2handle(struct ldlm_lock *lock, struct lustre_handle *lockh)
 {
-        memset(&lockh->addr, 0x69, sizeof(lockh->addr));
+        POISON(&lockh->addr, 0x69, sizeof(lockh->addr));
         lockh->cookie = lock->l_handle.h_cookie;
 }
 
@@ -329,7 +333,7 @@ struct ldlm_lock *__ldlm_handle2lock(struct lustre_handle *handle, int flags)
 
         LASSERT(handle);
 
-        lock = portals_handle2object(handle->cookie);
+        lock = class_handle2object(handle->cookie);
         if (lock == NULL)
                 RETURN(NULL);
 
@@ -388,7 +392,7 @@ void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc)
 }
 
 static void ldlm_add_ast_work_item(struct ldlm_lock *lock,
-                                   struct ldlm_lock *new, 
+                                   struct ldlm_lock *new,
                                    void *data, int datalen)
 {
         struct ldlm_ast_work *w;
@@ -479,12 +483,18 @@ void ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode)
                         CERROR("FL_CBPENDING set on non-local lock--just a "
                                "warning\n");
 
-                LDLM_DEBUG0(lock, "final decref done on cbpending lock");
+                LDLM_DEBUG(lock, "final decref done on cbpending lock");
+
+                if (lock->l_blocking_ast == NULL) {
+                        /* The lock wasn't even fully formed; just destroy it */
+                        ldlm_lock_destroy(lock);
+                }
                 l_unlock(&ns->ns_lock);
 
                 /* FIXME: need a real 'desc' here */
-                lock->l_blocking_ast(lock, NULL, lock->l_data,
-                                     LDLM_CB_BLOCKING);
+                if (lock->l_blocking_ast != NULL)
+                        lock->l_blocking_ast(lock, NULL, lock->l_data,
+                                             LDLM_CB_BLOCKING);
         } else if (ns->ns_client && !lock->l_readers && !lock->l_writers) {
                 /* If this is a client-side namespace and this was the last
                  * reference, put it on the LRU. */
@@ -710,7 +720,7 @@ int ldlm_lock_match(struct ldlm_namespace *ns, int flags,
                         lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC, NULL);
         }
         if (rc)
-                LDLM_DEBUG0(lock, "matched");
+                LDLM_DEBUG(lock, "matched");
         else
                 LDLM_DEBUG_NOLOCK("not matched");
 
@@ -866,10 +876,10 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns,
         ldlm_grant_lock(lock, NULL, 0);
         EXIT;
       out:
-        l_unlock(&ns->ns_lock);
         /* Don't set 'completion_ast' until here so that if the lock is granted
          * immediately we don't do an unnecessary completion call. */
         lock->l_completion_ast = completion;
+        l_unlock(&ns->ns_lock);
         return ELDLM_OK;
 }
 
@@ -906,13 +916,27 @@ int ldlm_run_ast_work(struct list_head *rpc_list)
                 struct ldlm_ast_work *w =
                         list_entry(tmp, struct ldlm_ast_work, w_list);
 
-                if (w->w_blocking)
+                /* It's possible to receive a completion AST before we've set
+                 * the l_completion_ast pointer: either because the AST arrived
+                 * before the reply, or simply because there's a small race
+                 * window between receiving the reply and finishing the local
+                 * enqueue. (bug 842)
+                 *
+                 * This can't happen with the blocking_ast, however, because we
+                 * will never call the local blocking_ast until we drop our
+                 * reader/writer reference, which we won't do until we get the
+                 * reply and finish enqueueing. */
+                if (w->w_blocking) {
+                        LASSERT(w->w_lock->l_blocking_ast != NULL);
                         rc = w->w_lock->l_blocking_ast
                                 (w->w_lock, &w->w_desc, w->w_data,
                                  LDLM_CB_BLOCKING);
-                else
+                } else if (w->w_lock->l_completion_ast != NULL) {
                         rc = w->w_lock->l_completion_ast(w->w_lock, w->w_flags,
                                                          w->w_data);
+                } else {
+                        rc = 0;
+                }
                 if (rc == -ERESTART)
                         retval = rc;
                 else if (rc)
@@ -974,7 +998,7 @@ void ldlm_cancel_callback(struct ldlm_lock *lock)
                         lock->l_blocking_ast(lock, NULL, lock->l_data,
                                              LDLM_CB_CANCELING);
                 else
-                        LDLM_DEBUG0(lock, "no blocking ast");
+                        LDLM_DEBUG(lock, "no blocking ast");
         }
         l_unlock(&lock->l_resource->lr_namespace->ns_lock);
 }
@@ -994,7 +1018,7 @@ void ldlm_lock_cancel(struct ldlm_lock *lock)
         /* Please do not, no matter how tempting, remove this LBUG without
          * talking to me first. -phik */
         if (lock->l_readers || lock->l_writers) {
-                LDLM_DEBUG0(lock, "lock still has references");
+                LDLM_DEBUG(lock, "lock still has references");
                 ldlm_lock_dump(D_OTHER, lock);
                 LBUG();
         }
@@ -1113,8 +1137,9 @@ void ldlm_lock_dump(int level, struct ldlm_lock *lock)
         CDEBUG(level, "  -- Lock dump: %p (%s) (rc: %d)\n", lock, ver,
                atomic_read(&lock->l_refc));
         if (lock->l_export && lock->l_export->exp_connection)
-                CDEBUG(level, "  Node: NID %x (rhandle: "LPX64")\n",
+                CDEBUG(level, "  Node: NID "LPX64" on %s (rhandle: "LPX64")\n",
                        lock->l_export->exp_connection->c_peer.peer_nid,
+                       lock->l_export->exp_connection->c_peer.peer_ni->pni_name,
                        lock->l_remote_handle.cookie);
         else
                 CDEBUG(level, "  Node: local\n");
index 61bb91e..c1d3182 100644 (file)
 #define EXPORT_SYMTAB
 #define DEBUG_SUBSYSTEM S_LDLM
 
+#ifdef __KERNEL__
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <linux/lustre_dlm.h>
 #include <linux/init.h>
+#else 
+#include <liblustre.h>
+#endif
+
+#include <linux/lustre_dlm.h>
 #include <linux/obd_class.h>
 
+
 extern kmem_cache_t *ldlm_resource_slab;
 extern kmem_cache_t *ldlm_lock_slab;
 extern struct lustre_lock ldlm_handle_lock;
@@ -130,6 +136,12 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock)
         RETURN(1);
 }
 
+static inline void ldlm_failed_ast(struct ldlm_lock *lock)
+{
+        /* XXX diagnostic: pass this lock's export connection to recovd_conn_fail() */
+        recovd_conn_fail(lock->l_export->exp_connection); /* NOTE(review): l_export is not NULL-checked here */
+}
+
 int ldlm_server_blocking_ast(struct ldlm_lock *lock,
                              struct ldlm_lock_desc *desc,
                              void *data, int flag)
@@ -170,7 +182,7 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock,
                sizeof(body->lock_handle1));
         memcpy(&body->lock_desc, desc, sizeof(*desc));
 
-        LDLM_DEBUG0(lock, "server preparing blocking AST");
+        LDLM_DEBUG(lock, "server preparing blocking AST");
         req->rq_replen = lustre_msg_size(0, NULL);
 
         ldlm_add_waiting_lock(lock);
@@ -180,7 +192,7 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock,
         rc = ptlrpc_queue_wait(req);
         if (rc == -ETIMEDOUT || rc == -EINTR) {
                 ldlm_del_waiting_lock(lock);
-                ldlm_expired_completion_wait(lock);
+                ldlm_failed_ast(lock);
         } else if (rc) {
                 CERROR("client returned %d from blocking AST for lock %p\n",
                        req->rq_status, lock);
@@ -220,14 +232,14 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data)
         body->lock_flags = flags;
         ldlm_lock2desc(lock, &body->lock_desc);
 
-        LDLM_DEBUG0(lock, "server preparing completion AST");
+        LDLM_DEBUG(lock, "server preparing completion AST");
         req->rq_replen = lustre_msg_size(0, NULL);
 
         req->rq_level = LUSTRE_CONN_RECOVD;
         rc = ptlrpc_queue_wait(req);
         if (rc == -ETIMEDOUT || rc == -EINTR) {
                 ldlm_del_waiting_lock(lock);
-                ldlm_expired_completion_wait(lock);
+                ldlm_failed_ast(lock);
         } else if (rc) {
                 CERROR("client returned %d from completion AST for lock %p\n",
                        req->rq_status, lock);
@@ -291,7 +303,7 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req,
 
         memcpy(&lock->l_remote_handle, &dlm_req->lock_handle1,
                sizeof(lock->l_remote_handle));
-        LDLM_DEBUG0(lock, "server-side enqueue handler, new lock created");
+        LDLM_DEBUG(lock, "server-side enqueue handler, new lock created");
 
         LASSERT(req->rq_export);
         lock->l_export = req->rq_export;
@@ -358,7 +370,7 @@ int ldlm_handle_convert(struct ptlrpc_request *req)
         if (!lock) {
                 req->rq_status = EINVAL;
         } else {
-                LDLM_DEBUG0(lock, "server-side convert handler START");
+                LDLM_DEBUG(lock, "server-side convert handler START");
                 ldlm_lock_convert(lock, dlm_req->lock_desc.l_req_mode,
                                   &dlm_rep->lock_flags);
                 if (ldlm_del_waiting_lock(lock))
@@ -368,7 +380,7 @@ int ldlm_handle_convert(struct ptlrpc_request *req)
 
         if (lock) {
                 ldlm_reprocess_all(lock->l_resource);
-                LDLM_DEBUG0(lock, "server-side convert handler END");
+                LDLM_DEBUG(lock, "server-side convert handler END");
                 LDLM_LOCK_PUT(lock);
         } else
                 LDLM_DEBUG_NOLOCK("server-side convert handler END");
@@ -403,7 +415,7 @@ int ldlm_handle_cancel(struct ptlrpc_request *req)
                                   dlm_req->lock_handle1.cookie);
                 req->rq_status = ESTALE;
         } else {
-                LDLM_DEBUG0(lock, "server-side cancel handler START");
+                LDLM_DEBUG(lock, "server-side cancel handler START");
                 ldlm_lock_cancel(lock);
                 if (ldlm_del_waiting_lock(lock))
                         CDEBUG(D_DLMTRACE, "cancelled waiting lock %p\n", lock);
@@ -415,7 +427,7 @@ int ldlm_handle_cancel(struct ptlrpc_request *req)
 
         if (lock) {
                 ldlm_reprocess_all(lock->l_resource);
-                LDLM_DEBUG0(lock, "server-side cancel handler END");
+                LDLM_DEBUG(lock, "server-side cancel handler END");
                 LDLM_LOCK_PUT(lock);
         }
 
@@ -439,30 +451,30 @@ static int ldlm_handle_bl_callback(struct ptlrpc_request *req,
 
         lock = ldlm_handle2lock_ns(ns, &dlm_req->lock_handle1);
         if (!lock) {
-                CERROR("blocking callback on lock "LPX64" - lock disappeared\n",
-                       dlm_req->lock_handle1.cookie);
+                CDEBUG(D_INFO, "blocking callback on lock "LPX64
+                       " - lock disappeared\n", dlm_req->lock_handle1.cookie);
                 RETURN(-EINVAL);
         }
 
-        LDLM_DEBUG0(lock, "client blocking AST callback handler START");
+        LDLM_DEBUG(lock, "client blocking AST callback handler START");
 
-        l_lock(&lock->l_resource->lr_namespace->ns_lock);
+        l_lock(&ns->ns_lock);
         lock->l_flags |= LDLM_FL_CBPENDING;
         do_ast = (!lock->l_readers && !lock->l_writers);
-        l_unlock(&lock->l_resource->lr_namespace->ns_lock);
+        l_unlock(&ns->ns_lock);
 
         if (do_ast) {
                 LDLM_DEBUG(lock, "already unused, calling "
                            "callback (%p)", lock->l_blocking_ast);
-                if (lock->l_blocking_ast != NULL) {
+                if (lock->l_blocking_ast != NULL)
                         lock->l_blocking_ast(lock, &dlm_req->lock_desc,
                                              lock->l_data, LDLM_CB_BLOCKING);
-                }
-        } else
-                LDLM_DEBUG0(lock, "Lock still has references, will be"
-                            " cancelled later");
+        } else {
+                LDLM_DEBUG(lock, "Lock still has references, will be"
+                           " cancelled later");
+        }
 
-        LDLM_DEBUG0(lock, "client blocking callback handler END");
+        LDLM_DEBUG(lock, "client blocking callback handler END");
         LDLM_LOCK_PUT(lock);
         RETURN(0);
 }
@@ -486,7 +498,7 @@ static int ldlm_handle_cp_callback(struct ptlrpc_request *req,
                 RETURN(-EINVAL);
         }
 
-        LDLM_DEBUG0(lock, "client completion callback handler START");
+        LDLM_DEBUG(lock, "client completion callback handler START");
 
         l_lock(&ns->ns_lock);
 
@@ -494,7 +506,7 @@ static int ldlm_handle_cp_callback(struct ptlrpc_request *req,
          * then we might need to switch lock modes, resources, or extents. */
         if (dlm_req->lock_desc.l_granted_mode != lock->l_req_mode) {
                 lock->l_req_mode = dlm_req->lock_desc.l_granted_mode;
-                LDLM_DEBUG0(lock, "completion AST, new lock mode");
+                LDLM_DEBUG(lock, "completion AST, new lock mode");
         }
         if (lock->l_resource->lr_type == LDLM_EXTENT)
                 memcpy(&lock->l_extent, &dlm_req->lock_desc.l_extent,
@@ -505,13 +517,13 @@ static int ldlm_handle_cp_callback(struct ptlrpc_request *req,
                    sizeof(lock->l_resource->lr_name)) != 0) {
                 ldlm_lock_change_resource(ns, lock,
                                          dlm_req->lock_desc.l_resource.lr_name);
-                LDLM_DEBUG0(lock, "completion AST, new resource");
+                LDLM_DEBUG(lock, "completion AST, new resource");
         }
         lock->l_resource->lr_tmp = &ast_list;
         ldlm_grant_lock(lock, req, sizeof(*req));
         lock->l_resource->lr_tmp = NULL;
         l_unlock(&ns->ns_lock);
-        LDLM_DEBUG0(lock, "callback handler finished, about to run_ast_work");
+        LDLM_DEBUG(lock, "callback handler finished, about to run_ast_work");
         LDLM_LOCK_PUT(lock);
 
         ldlm_run_ast_work(&ast_list);
@@ -544,7 +556,8 @@ static int ldlm_callback_handler(struct ptlrpc_request *req)
                 dlm_req = lustre_msg_buf(req->rq_reqmsg, 0);
                 CERROR("--> lock addr: "LPX64", cookie: "LPX64"\n",
                        dlm_req->lock_handle1.addr,dlm_req->lock_handle1.cookie);
-                RETURN(-ENOTCONN);
+                rc = -ENOTCONN;
+                goto out;
         }
 
         LASSERT(req->rq_export != NULL);
@@ -567,17 +580,12 @@ static int ldlm_callback_handler(struct ptlrpc_request *req)
                 CERROR("invalid opcode %d\n", req->rq_reqmsg->opc);
                 RETURN(-EINVAL);
         }
-
+ out:
         req->rq_status = rc;
-        if (rc) {
-                ptlrpc_error(req->rq_svc, req);
-        } else {
-                rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen,
-                                     &req->rq_repmsg);
-                if (rc)
-                        RETURN(rc);
-                ptlrpc_reply(req->rq_svc, req);
-        }
+        rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen, &req->rq_repmsg);
+        if (rc)
+                RETURN(rc);
+        ptlrpc_reply(req->rq_svc, req);
 
         RETURN(0);
 }
@@ -672,7 +680,6 @@ static int ldlm_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len,
 static int ldlm_setup(struct obd_device *obddev, obd_count len, void *buf)
 {
         struct ldlm_obd *ldlm = &obddev->u.ldlm;
-        struct obd_uuid uuid = {"self"};
         int rc, i;
         ENTRY;
 
@@ -683,10 +690,11 @@ static int ldlm_setup(struct obd_device *obddev, obd_count len, void *buf)
         if (rc != 0)
                 RETURN(rc);
 
+#ifdef __KERNEL__
         ldlm->ldlm_cb_service =
                 ptlrpc_init_svc(LDLM_NEVENTS, LDLM_NBUFS, LDLM_BUFSIZE,
                                 LDLM_MAXREQSIZE, LDLM_CB_REQUEST_PORTAL,
-                                LDLM_CB_REPLY_PORTAL, &uuid,
+                                LDLM_CB_REPLY_PORTAL,
                                 ldlm_callback_handler, "ldlm_cbd");
 
         if (!ldlm->ldlm_cb_service) {
@@ -697,7 +705,7 @@ static int ldlm_setup(struct obd_device *obddev, obd_count len, void *buf)
         ldlm->ldlm_cancel_service =
                 ptlrpc_init_svc(LDLM_NEVENTS, LDLM_NBUFS, LDLM_BUFSIZE,
                                 LDLM_MAXREQSIZE, LDLM_CANCEL_REQUEST_PORTAL,
-                                LDLM_CANCEL_REPLY_PORTAL, &uuid,
+                                LDLM_CANCEL_REPLY_PORTAL,
                                 ldlm_cancel_handler, "ldlm_canceld");
 
         if (!ldlm->ldlm_cancel_service) {
@@ -728,6 +736,7 @@ static int ldlm_setup(struct obd_device *obddev, obd_count len, void *buf)
                 }
         }
 
+#endif
         INIT_LIST_HEAD(&waiting_locks_list);
         spin_lock_init(&waiting_locks_spinlock);
         waiting_locks_timer.function = waiting_locks_callback;
@@ -739,11 +748,12 @@ static int ldlm_setup(struct obd_device *obddev, obd_count len, void *buf)
         RETURN(0);
 
  out_thread:
+#ifdef __KERNEL__
         ptlrpc_stop_all_threads(ldlm->ldlm_cancel_service);
         ptlrpc_unregister_service(ldlm->ldlm_cancel_service);
         ptlrpc_stop_all_threads(ldlm->ldlm_cb_service);
         ptlrpc_unregister_service(ldlm->ldlm_cb_service);
-
+#endif
  out_proc:
         ldlm_proc_cleanup(obddev);
 
@@ -760,12 +770,13 @@ static int ldlm_cleanup(struct obd_device *obddev)
                 RETURN(-EBUSY);
         }
 
+#ifdef __KERNEL__
         ptlrpc_stop_all_threads(ldlm->ldlm_cb_service);
         ptlrpc_unregister_service(ldlm->ldlm_cb_service);
         ptlrpc_stop_all_threads(ldlm->ldlm_cancel_service);
         ptlrpc_unregister_service(ldlm->ldlm_cancel_service);
         ldlm_proc_cleanup(obddev);
-
+#endif
         ldlm_already_setup = 0;
         RETURN(0);
 }
@@ -786,7 +797,7 @@ struct obd_ops ldlm_obd_ops = {
         o_disconnect:  class_disconnect
 };
 
-static int __init ldlm_init(void)
+int __init ldlm_init(void)
 {
         int rc = class_register_type(&ldlm_obd_ops, 0, OBD_LDLM_DEVICENAME);
         if (rc != 0)
@@ -880,9 +891,11 @@ EXPORT_SYMBOL(ldlm_namespace_dump);
 EXPORT_SYMBOL(l_lock);
 EXPORT_SYMBOL(l_unlock);
 
+#ifdef __KERNEL__
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
 MODULE_DESCRIPTION("Lustre Lock Management Module v0.1");
 MODULE_LICENSE("GPL");
 
 module_init(ldlm_init);
 module_exit(ldlm_exit);
+#endif
index a4d91e9..d64a402 100644 (file)
  */
 
 #define DEBUG_SUBSYSTEM S_LDLM
+#ifndef __KERNEL__
+#include <signal.h>
+#include <liblustre.h>
+#endif
 
 #include <linux/lustre_dlm.h>
 #include <linux/obd_class.h>
@@ -45,7 +49,7 @@ int ldlm_expired_completion_wait(void *data)
         else if (!(conn = obd->u.cli.cl_import.imp_connection))
                 CERROR("lock %p has NULL connection\n", lock);
         else {
-                LDLM_DEBUG0(lock, "timed out waiting for completion");
+                LDLM_DEBUG(lock, "timed out waiting for completion");
                 CERROR("lock %p timed out from %s\n", lock,
                        conn->c_remote_uuid.uuid);
                 ldlm_lock_dump(D_ERROR, lock);
@@ -74,8 +78,8 @@ int ldlm_completion_ast(struct ldlm_lock *lock, int flags, void *data)
                        LDLM_FL_BLOCK_CONV)))
                 RETURN(0);
 
-        LDLM_DEBUG0(lock, "client-side enqueue returned a blocked lock, "
-                    "sleeping");
+        LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, "
+                   "sleeping");
         ldlm_lock_dump(D_OTHER, lock);
         ldlm_reprocess_all(lock->l_resource);
 
@@ -86,7 +90,7 @@ int ldlm_completion_ast(struct ldlm_lock *lock, int flags, void *data)
                            lock->l_destroyed), &lwi);
 
         if (lock->l_destroyed) {
-                LDLM_DEBUG0(lock, "client-side enqueue waking up: destroyed");
+                LDLM_DEBUG(lock, "client-side enqueue waking up: destroyed");
                 RETURN(-EIO);
         }
 
@@ -96,7 +100,7 @@ int ldlm_completion_ast(struct ldlm_lock *lock, int flags, void *data)
                 RETURN(rc);
         }
 
-        LDLM_DEBUG0(lock, "client-side enqueue waking up: granted");
+        LDLM_DEBUG(lock, "client-side enqueue waking up: granted");
         RETURN(0);
 }
 
@@ -126,8 +130,7 @@ static int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
                                 data, cp_data);
         if (!lock)
                 GOTO(out_nolock, err = -ENOMEM);
-        LDLM_DEBUG0(lock,
-                    "client-side local enqueue handler, new lock created");
+        LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created");
 
         ldlm_lock_addref_internal(lock, mode);
         ldlm_lock2handle(lock, lockh);
@@ -149,7 +152,7 @@ static int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
         if (lock->l_completion_ast)
                 lock->l_completion_ast(lock, *flags, NULL);
 
-        LDLM_DEBUG0(lock, "client-side local enqueue END");
+        LDLM_DEBUG(lock, "client-side local enqueue END");
         EXIT;
  out:
         LDLM_LOCK_PUT(lock);
@@ -193,7 +196,7 @@ int ldlm_cli_enqueue(struct lustre_handle *connh,
          * If we're creating a new lock, get everything all setup nice. */
         if (is_replay) {
                 lock = ldlm_handle2lock(lockh);
-                LDLM_DEBUG0(lock, "client-side enqueue START");
+                LDLM_DEBUG(lock, "client-side enqueue START");
                 LASSERT(connh == lock->l_connh);
         } else {
                 lock = ldlm_lock_create(ns, parent_lock_handle, res_id, type,
@@ -205,7 +208,7 @@ int ldlm_cli_enqueue(struct lustre_handle *connh,
                  * (in just this one case) to run the completion_cb even if it
                  * arrives before the reply. */
                 lock->l_completion_ast = completion;
-                LDLM_DEBUG0(lock, "client-side enqueue START");
+                LDLM_DEBUG(lock, "client-side enqueue START");
                 /* for the local lock, add the reference */
                 ldlm_lock_addref_internal(lock, mode);
                 ldlm_lock2handle(lock, lockh);
@@ -241,7 +244,7 @@ int ldlm_cli_enqueue(struct lustre_handle *connh,
         lock->l_connh = connh;
         lock->l_export = NULL;
 
-        LDLM_DEBUG0(lock, "sending request");
+        LDLM_DEBUG(lock, "sending request");
         rc = ptlrpc_queue_wait(req);
 
         if (rc != ELDLM_OK) {
@@ -253,8 +256,7 @@ int ldlm_cli_enqueue(struct lustre_handle *connh,
                 lock->l_flags |= LDLM_FL_CANCELING;
                 l_unlock(&ns->ns_lock);
 
-                ldlm_lock_decref(lockh, mode);
-                ldlm_lock_destroy(lock);
+                ldlm_lock_decref_and_cancel(lockh, mode);
                 GOTO(out_req, rc);
         }
 
@@ -300,7 +302,7 @@ int ldlm_cli_enqueue(struct lustre_handle *connh,
                                 LBUG();
                                 GOTO(out_req, rc = -ENOMEM);
                         }
-                        LDLM_DEBUG0(lock, "client-side enqueue, new resource");
+                        LDLM_DEBUG(lock, "client-side enqueue, new resource");
                 }
         }
 
@@ -314,7 +316,7 @@ int ldlm_cli_enqueue(struct lustre_handle *connh,
                         lock->l_completion_ast(lock, *flags, NULL);
         }
 
-        LDLM_DEBUG0(lock, "client-side enqueue END");
+        LDLM_DEBUG(lock, "client-side enqueue END");
         EXIT;
  out_req:
         if (!req_passed_in)
@@ -382,12 +384,12 @@ static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode,
                 CERROR("Trying to cancel local lock\n");
                 LBUG();
         }
-        LDLM_DEBUG0(lock, "client-side local convert");
+        LDLM_DEBUG(lock, "client-side local convert");
 
         ldlm_lock_convert(lock, new_mode, flags);
         ldlm_reprocess_all(lock->l_resource);
 
-        LDLM_DEBUG0(lock, "client-side local convert handler END");
+        LDLM_DEBUG(lock, "client-side local convert handler END");
         LDLM_LOCK_PUT(lock);
         RETURN(0);
 }
@@ -416,7 +418,7 @@ int ldlm_cli_convert(struct lustre_handle *lockh, int new_mode, int *flags)
         if (!connh)
                 RETURN(ldlm_cli_convert_local(lock, new_mode, flags));
 
-        LDLM_DEBUG0(lock, "client-side convert");
+        LDLM_DEBUG(lock, "client-side convert");
 
         req = ptlrpc_prep_req(class_conn2cliimp(connh), LDLM_CONVERT, 1, &size,
                               NULL);
@@ -468,7 +470,7 @@ int ldlm_cli_cancel(struct lustre_handle *lockh)
         if (lock->l_connh) {
                 int local_only;
 
-                LDLM_DEBUG0(lock, "client-side cancel");
+                LDLM_DEBUG(lock, "client-side cancel");
                 /* Set this flag to prevent others from getting new references*/
                 l_lock(&lock->l_resource->lr_namespace->ns_lock);
                 lock->l_flags |= LDLM_FL_CBPENDING;
@@ -509,14 +511,14 @@ int ldlm_cli_cancel(struct lustre_handle *lockh)
         local_cancel:
                 ldlm_lock_cancel(lock);
         } else {
-                LDLM_DEBUG0(lock, "client-side local cancel");
+                LDLM_DEBUG(lock, "client-side local cancel");
                 if (lock->l_resource->lr_namespace->ns_client) {
                         CERROR("Trying to cancel local lock\n");
                         LBUG();
                 }
                 ldlm_lock_cancel(lock);
                 ldlm_reprocess_all(lock->l_resource);
-                LDLM_DEBUG0(lock, "client-side local cancel handler END");
+                LDLM_DEBUG(lock, "client-side local cancel handler END");
         }
 
         EXIT;
@@ -613,6 +615,12 @@ int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
                 LASSERT(w);
 
                 w->w_lock = LDLM_LOCK_GET(lock);
+
+                /* Prevent the cancel callback from being called by setting
+                 * LDLM_FL_CANCEL in the lock.  Very sneaky. -p */
+                if (flags & LDLM_FL_NO_CALLBACK)
+                        w->w_lock->l_flags |= LDLM_FL_CANCEL;
+
                 list_add(&w->w_list, &list);
         }
         l_unlock(&ns->ns_lock);
@@ -622,11 +630,6 @@ int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
                 int rc;
                 w = list_entry(tmp, struct ldlm_ast_work, w_list);
 
-                /* Prevent the cancel callback from being called by setting
-                 * LDLM_FL_CANCEL in the lock.  Very sneaky. -p */
-                if (flags & LDLM_FL_NO_CALLBACK)
-                        w->w_lock->l_flags |= LDLM_FL_CANCEL;
-
                 if (flags & LDLM_FL_LOCAL_ONLY) {
                         ldlm_lock_cancel(w->w_lock);
                 } else {
@@ -645,11 +648,12 @@ int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
         RETURN(0);
 }
 
-/* Cancel all locks on a namespace (or a specific resource, if given) that have
- * 0 readers/writers.
+/* Cancel all locks on a namespace (or a specific resource, if given)
+ * that have 0 readers/writers.
  *
- * If 'local_only' is true, throw the locks away without trying to notify the
- * server. */
+ * If flags & LDLM_FL_LOCAL_ONLY, throw the locks away without trying
+ * to notify the server.
+ * If flags & LDLM_FL_NO_CALLBACK, don't run the cancel callback. */
 int ldlm_cli_cancel_unused(struct ldlm_namespace *ns,
                            struct ldlm_res_id *res_id, int flags)
 {
@@ -831,7 +835,7 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
         size = sizeof(*reply);
         req->rq_replen = lustre_msg_size(1, &size);
 
-        LDLM_DEBUG0(lock, "replaying lock:");
+        LDLM_DEBUG(lock, "replaying lock:");
         rc = ptlrpc_queue_wait(req);
         if (rc != ELDLM_OK)
                 GOTO(out, rc);
@@ -839,7 +843,7 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
         reply = lustre_msg_buf(req->rq_repmsg, 0);
         memcpy(&lock->l_remote_handle, &reply->lock_handle,
                sizeof(lock->l_remote_handle));
-        LDLM_DEBUG0(lock, "replayed lock:");
+        LDLM_DEBUG(lock, "replayed lock:");
  out:
         ptlrpc_req_finished(req);
         RETURN(rc);
index f927307..0f9f4e2 100644 (file)
  */
 
 #define DEBUG_SUBSYSTEM S_LDLM
-
+#ifdef __KERNEL__
 #include <linux/lustre_dlm.h>
+#else
+#include <liblustre.h>
+#endif
+
 #include <linux/obd_class.h>
 
 kmem_cache_t *ldlm_resource_slab, *ldlm_lock_slab;
@@ -37,6 +41,7 @@ int ldlm_proc_setup(struct obd_device *obd)
         int rc;
         ENTRY;
         LASSERT(ldlm_ns_proc_dir == NULL);
+        LASSERT(obd != NULL);
         rc = lprocfs_obd_attach(obd, 0);
         if (rc) {
                 CERROR("LProcFS failed in ldlm-init\n");
@@ -54,6 +59,7 @@ void ldlm_proc_cleanup(struct obd_device *obd)
         }
 }
 
+#ifdef __KERNEL__
 static int lprocfs_uint_rd(char *page, char **start, off_t off,
                            int count, int *eof, void *data)
 {
@@ -61,12 +67,16 @@ static int lprocfs_uint_rd(char *page, char **start, off_t off,
         return snprintf(page, count, "%u\n", *temp);
 }
 
+
 #define MAX_STRING_SIZE 128
 void ldlm_proc_namespace(struct ldlm_namespace *ns)
 {
         struct lprocfs_vars lock_vars[2];
         char lock_name[MAX_STRING_SIZE + 1];
 
+        LASSERT(ns != NULL);
+        LASSERT(ns->ns_name != NULL);
+
         lock_name[MAX_STRING_SIZE] = '\0';
 
         memset(lock_vars, 0, sizeof(lock_vars));
@@ -80,6 +90,7 @@ void ldlm_proc_namespace(struct ldlm_namespace *ns)
         lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
 
         snprintf(lock_name, MAX_STRING_SIZE, "%s/lock_count", ns->ns_name);
+
         lock_vars[0].data = &ns->ns_locks;
         lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
 
@@ -89,6 +100,7 @@ void ldlm_proc_namespace(struct ldlm_namespace *ns)
         lock_vars[0].read_fptr = lprocfs_uint_rd;
         lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
 }
+#endif
 #undef MAX_STRING_SIZE
 
 #define LDLM_MAX_UNUSED 20
@@ -133,11 +145,13 @@ struct ldlm_namespace *ldlm_namespace_new(char *name, __u32 client)
         spin_lock(&ldlm_namespace_lock);
         list_add(&ns->ns_list_chain, &ldlm_namespace_list);
         spin_unlock(&ldlm_namespace_lock);
+#ifdef __KERNEL__
         ldlm_proc_namespace(ns);
+#endif
         RETURN(ns);
 
 out_hash:
-        memset(ns->ns_hash, 0x5a, sizeof(*ns->ns_hash) * RES_HASH_SIZE);
+        POISON(ns->ns_hash, 0x5a, sizeof(*ns->ns_hash) * RES_HASH_SIZE);
         vfree(ns->ns_hash);
         atomic_sub(sizeof(*ns->ns_hash) * RES_HASH_SIZE, &obd_memory);
 out_ns:
@@ -193,8 +207,8 @@ static void cleanup_resource(struct ldlm_resource *res, struct list_head *q,
                         if (local_only || rc != ELDLM_OK)
                                 ldlm_lock_cancel(lock);
                 } else {
-                        LDLM_DEBUG0(lock, "Freeing a lock still held by a "
-                                    "client node");
+                        LDLM_DEBUG(lock, "Freeing a lock still held by a "
+                                   "client node");
 
                         ldlm_resource_unlink_lock(lock);
                         ldlm_lock_destroy(lock);
@@ -257,7 +271,7 @@ int ldlm_namespace_free(struct ldlm_namespace *ns)
 
         ldlm_namespace_cleanup(ns, 0);
 
-        memset(ns->ns_hash, 0x5a, sizeof(*ns->ns_hash) * RES_HASH_SIZE);
+        POISON(ns->ns_hash, 0x5a, sizeof(*ns->ns_hash) * RES_HASH_SIZE);
         vfree(ns->ns_hash /* , sizeof(*ns->ns_hash) * RES_HASH_SIZE */);
         atomic_sub(sizeof(*ns->ns_hash) * RES_HASH_SIZE, &obd_memory);
         OBD_FREE(ns->ns_name, strlen(ns->ns_name) + 1);
@@ -447,7 +461,7 @@ int ldlm_resource_putref(struct ldlm_resource *res)
                 list_del_init(&res->lr_hash);
                 list_del_init(&res->lr_childof);
 
-                memset(res, 0x5a, sizeof(*res));
+                POISON(res, 0x5a, sizeof(*res));
                 kmem_cache_free(ldlm_resource_slab, res);
                 l_unlock(&ns->ns_lock);
 
index 122142b..ae490d9 100644 (file)
 #define EXPORT_SYMTAB
 #define DEBUG_SUBSYSTEM S_OST /* XXX WRONG */
 
+#ifdef __KERNEL__
 #include <linux/module.h>
+#else 
+#include <liblustre.h>
+#endif
+
+#include <linux/obd.h>
 #include <linux/obd_ost.h>
 #include <linux/lustre_net.h>
 #include <linux/lustre_dlm.h>
@@ -119,9 +125,137 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf)
         imp->imp_obd = obddev;
 
         cli->cl_max_mds_easize = sizeof(struct lov_mds_md);
+#if !defined(__KERNEL__) || (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        cli->cl_sandev = 0;
+#else
+        cli->cl_sandev.value = 0;
+#endif
+
+        RETURN(0);
+}
+
+#ifdef __KERNEL__
+/* convert a pathname into a kdev_t; the result stays 0 unless the path resolves to a block-device inode */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) /* pre-2.5 variant: i_rdev is returned directly as a kdev_t */
+static kdev_t path2dev(char *path)
+{
+        struct dentry *dentry;
+        struct nameidata nd;
+        kdev_t dev = 0; /* 0 is the "no usable device" result */
+
+        if (!path_init(path, LOOKUP_FOLLOW, &nd))
+                return 0;
+
+        if (path_walk(path, &nd))
+                return 0;
+
+        dentry = nd.dentry;
+        if (dentry->d_inode && !is_bad_inode(dentry->d_inode) &&
+            S_ISBLK(dentry->d_inode->i_mode))
+                dev = dentry->d_inode->i_rdev; /* only block-device inodes supply a device number */
+        path_release(&nd);
+
+        return dev;
+}
+#else /* 2.5+ variant: same lookup, but the device number is returned as a plain int */
+static int path2dev(char *path)
+{
+        struct dentry *dentry;
+        struct nameidata nd;
+        int dev = 0; /* 0 is the "no usable device" result */
+
+        if (!path_init(path, LOOKUP_FOLLOW, &nd))
+                return 0;
+
+        if (path_walk(path, &nd))
+                return 0;
+
+        dentry = nd.dentry;
+        if (dentry->d_inode && !is_bad_inode(dentry->d_inode) &&
+            S_ISBLK(dentry->d_inode->i_mode))
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) /* NOTE(review): always false here -- this is the #else of the same test */
+                dev = dentry->d_inode->i_rdev;
+#else
+                dev = dentry->d_inode->i_rdev.value;
+#endif
+        path_release(&nd);
+
+        return dev;
+}
+#endif
+
+int client_sanobd_setup(struct obd_device *obddev, obd_count len, void *buf)
+{ /* SAN client setup; buf is an obd_ioctl_data: inlbuf1 = target UUID, inlbuf2 = server UUID, inlbuf3 = SAN device path; len is unused */
+        struct obd_ioctl_data* data = buf;
+        struct client_obd *cli = &obddev->u.cli;
+        struct obd_import *imp = &cli->cl_import;
+        struct obd_uuid server_uuid; /* NOTE(review): not zeroed; only the bytes copied from inlbuf2 below are set */
+        ENTRY;
+
+        if (data->ioc_inllen1 < 1) {
+                CERROR("requires a TARGET UUID\n");
+                RETURN(-EINVAL);
+        }
+
+        if (data->ioc_inllen1 > 37) { /* NOTE(review): inllen1 is the TARGET UUID, yet the message below says "client" */
+                CERROR("client UUID must be less than 38 characters\n");
+                RETURN(-EINVAL);
+        }
+
+        if (data->ioc_inllen2 < 1) {
+                CERROR("setup requires a SERVER UUID\n");
+                RETURN(-EINVAL);
+        }
+
+        if (data->ioc_inllen2 > 37) { /* NOTE(review): inllen2 is the SERVER UUID, yet the message below says "target" */
+                CERROR("target UUID must be less than 38 characters\n");
+                RETURN(-EINVAL);
+        }
+
+        if (data->ioc_inllen3 < 1) {
+                CERROR("setup requires a SAN device pathname\n");
+                RETURN(-EINVAL);
+        }
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) /* pre-2.5: cl_sandev holds the device itself; 2.5+ stores it in cl_sandev.value */
+        cli->cl_sandev = path2dev(data->ioc_inlbuf3);
+        if (!cli->cl_sandev) { /* path2dev() yields 0 when the path is not a usable block device */
+                CERROR("%s seems not a valid SAN device\n", data->ioc_inlbuf3);
+                RETURN(-EINVAL);
+        }
+#else
+        cli->cl_sandev.value = path2dev(data->ioc_inlbuf3);
+        if (!cli->cl_sandev.value) { /* path2dev() yields 0 when the path is not a usable block device */
+                CERROR("%s seems not a valid SAN device\n", data->ioc_inlbuf3);
+                RETURN(-EINVAL);
+        }
+#endif
+
+        sema_init(&cli->cl_sem, 1);
+        cli->cl_conn_count = 0;
+        memcpy(cli->cl_target_uuid.uuid, data->ioc_inlbuf1, data->ioc_inllen1);
+        memcpy(server_uuid.uuid, data->ioc_inlbuf2, MIN(data->ioc_inllen2,
+                                                   sizeof(server_uuid))); /* copy length is clamped to sizeof(server_uuid) */
+
+        imp->imp_connection = ptlrpc_uuid_to_connection(&server_uuid);
+        if (!imp->imp_connection)
+                RETURN(-ENOENT); /* ptlrpc_uuid_to_connection() returned no connection for the server UUID */
+        
+        INIT_LIST_HEAD(&imp->imp_replay_list);
+        INIT_LIST_HEAD(&imp->imp_sending_list);
+        INIT_LIST_HEAD(&imp->imp_delayed_list);
+        spin_lock_init(&imp->imp_lock);
+
+        ptlrpc_init_client(OST_REQUEST_PORTAL, OSC_REPLY_PORTAL,
+                           "sanosc", &obddev->obd_ldlm_client);
+        imp->imp_client = &obddev->obd_ldlm_client; /* the import uses the client initialised just above */
+        imp->imp_obd = obddev;
+
+        cli->cl_max_mds_easize = sizeof(struct lov_mds_md);
+
+        RETURN(0);
+}
+#endif
 
 int client_obd_cleanup(struct obd_device * obddev)
 {
@@ -165,7 +299,6 @@ int client_obd_connect(struct lustre_handle *conn, struct obd_device *obd,
                 GOTO(out_disco, rc = -ENOMEM);
 
         INIT_LIST_HEAD(&imp->imp_chain);
-        imp->imp_last_xid = 0;
         imp->imp_max_transno = 0;
         imp->imp_peer_committed_transno = 0;
 
@@ -203,20 +336,9 @@ out_req:
 out_ldlm:
                 ldlm_namespace_free(obd->obd_namespace);
                 obd->obd_namespace = NULL;
-                if (rq_opc == MDS_CONNECT) {
-                        /* Don't class_disconnect OSCs, because the LOV
-                         * cares about them even if they can't connect to the
-                         * OST.
-                         *
-                         * This is leak-bait, but without either a way to
-                         * operate on the osc without an export or separate
-                         * methods for connect-to-osc and connect-osc-to-ost
-                         * it's not clear what else to do.
-                         */
 out_disco:
-                        cli->cl_conn_count--;
-                        class_disconnect(conn);
-                }
+                cli->cl_conn_count--;
+                class_disconnect(conn);
         }
 out_sem:
         up(&cli->cl_sem);
index 4d7f37a..aa666ad 100644 (file)
@@ -28,6 +28,7 @@
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
+#include <linux/version.h>
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 #include <linux/locks.h>   // for wait_on_buffer
 #else
@@ -58,17 +59,28 @@ void mds_pack_inode2fid(struct ll_fid *fid, struct inode *inode)
         fid->f_type = HTON__u32(S_IFMT & inode->i_mode);
 }
 
-
 void mds_pack_inode2body(struct mds_body *b, struct inode *inode)
 {
         b->valid = OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME |
                 OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
                 OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLTYPE | OBD_MD_FLMODE |
                 OBD_MD_FLNLINK | OBD_MD_FLGENER;
+
+        /* The MDS file size isn't authoritative for regular files, so don't
+         * even pretend. */
+        if (S_ISREG(inode->i_mode))
+                b->valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
+
         b->ino = HTON__u32(inode->i_ino);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         b->atime = HTON__u32(inode->i_atime);
         b->mtime = HTON__u32(inode->i_mtime);
         b->ctime = HTON__u32(inode->i_ctime);
+#else
+        b->atime = HTON__u32(inode->i_atime.tv_sec);
+        b->mtime = HTON__u32(inode->i_mtime.tv_sec);
+        b->ctime = HTON__u32(inode->i_ctime.tv_sec);
+#endif
         b->mode = HTON__u32(inode->i_mode);
         b->size = HTON__u64(inode->i_size);
         b->blocks = HTON__u64(inode->i_blocks);
@@ -192,6 +204,10 @@ void mds_create_pack(struct ptlrpc_request *req, int offset, struct inode *dir,
         rec->cr_uid = HTON__u32(uid);
         rec->cr_gid = HTON__u32(gid);
         rec->cr_time = HTON__u64(time);
+        if (in_group_p(dir->i_gid))
+                rec->cr_suppgid = HTON__u32(dir->i_gid);
+        else
+                rec->cr_suppgid = HTON__u32(-1);
 
         tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1);
         LOGL0(name, namelen, tmp);
@@ -257,9 +273,15 @@ void mds_setattr_pack(struct ptlrpc_request *req,
                 rec->sa_uid = HTON__u32(iattr->ia_uid);
                 rec->sa_gid = HTON__u32(iattr->ia_gid);
                 rec->sa_size = HTON__u64(iattr->ia_size);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
                 rec->sa_atime = HTON__u64(iattr->ia_atime);
                 rec->sa_mtime = HTON__u64(iattr->ia_mtime);
                 rec->sa_ctime = HTON__u64(iattr->ia_ctime);
+#else
+                rec->sa_atime = HTON__u64(iattr->ia_atime.tv_sec);
+                rec->sa_mtime = HTON__u64(iattr->ia_mtime.tv_sec);
+                rec->sa_ctime = HTON__u64(iattr->ia_ctime.tv_sec);
+#endif
                 rec->sa_attr_flags = HTON__u32(iattr->ia_attr_flags);
 
                 if ((iattr->ia_valid & ATTR_GID) && in_group_p(iattr->ia_gid))
@@ -339,6 +361,14 @@ void mds_rename_pack(struct ptlrpc_request *req, int offset,
         rec->rn_fsuid = HTON__u32(current->fsuid);
         rec->rn_fsgid = HTON__u32(current->fsgid);
         rec->rn_cap = HTON__u32(current->cap_effective);
+        if (in_group_p(srcdir->i_gid))
+                rec->rn_suppgid1 = HTON__u32(srcdir->i_gid);
+        else
+                rec->rn_suppgid1 = HTON__u32(-1);
+        if (in_group_p(tgtdir->i_gid))
+                rec->rn_suppgid2 = HTON__u32(tgtdir->i_gid);
+        else
+                rec->rn_suppgid2 = HTON__u32(-1);
         ll_inode2fid(&rec->rn_fid1, srcdir);
         ll_inode2fid(&rec->rn_fid2, tgtdir);
 
@@ -400,16 +430,23 @@ static int mds_setattr_unpack(struct ptlrpc_request *req, int offset,
         r->ur_fsuid = NTOH__u32(rec->sa_fsuid);
         r->ur_fsgid = NTOH__u32(rec->sa_fsgid);
         r->ur_cap = NTOH__u32(rec->sa_cap);
-        r->ur_suppgid = NTOH__u32(rec->sa_suppgid);
+        r->ur_suppgid1 = NTOH__u32(rec->sa_suppgid);
+        r->ur_suppgid2 = NTOH__u32(-1);
         r->ur_fid1 = &rec->sa_fid;
         attr->ia_valid = NTOH__u32(rec->sa_valid);
         attr->ia_mode = NTOH__u32(rec->sa_mode);
         attr->ia_uid = NTOH__u32(rec->sa_uid);
         attr->ia_gid = NTOH__u32(rec->sa_gid);
         attr->ia_size = NTOH__u64(rec->sa_size);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         attr->ia_atime = NTOH__u64(rec->sa_atime);
         attr->ia_mtime = NTOH__u64(rec->sa_mtime);
         attr->ia_ctime = NTOH__u64(rec->sa_ctime);
+#else
+        attr->ia_atime.tv_sec = NTOH__u64(rec->sa_atime);
+        attr->ia_mtime.tv_sec = NTOH__u64(rec->sa_mtime);
+        attr->ia_ctime.tv_sec = NTOH__u64(rec->sa_ctime);
+#endif
         attr->ia_attr_flags = NTOH__u32(rec->sa_attr_flags);
 
         if (req->rq_reqmsg->bufcount == offset + 2) {
@@ -443,7 +480,8 @@ static int mds_create_unpack(struct ptlrpc_request *req, int offset,
         r->ur_gid = NTOH__u32(rec->cr_gid);
         r->ur_time = NTOH__u64(rec->cr_time);
         r->ur_flags = NTOH__u32(rec->cr_flags);
-        r->ur_suppgid = NTOH__u32(rec->cr_suppgid);
+        r->ur_suppgid1 = NTOH__u32(rec->cr_suppgid);
+        r->ur_suppgid2 = NTOH__u32(-1);
 
         r->ur_name = lustre_msg_buf(req->rq_reqmsg, offset + 1);
         r->ur_namelen = req->rq_reqmsg->buflens[offset + 1];
@@ -471,7 +509,8 @@ static int mds_link_unpack(struct ptlrpc_request *req, int offset,
         r->ur_fsuid = NTOH__u32(rec->lk_fsuid);
         r->ur_fsgid = NTOH__u32(rec->lk_fsgid);
         r->ur_cap = NTOH__u32(rec->lk_cap);
-        r->ur_suppgid = NTOH__u32(rec->lk_suppgid);
+        r->ur_suppgid1 = NTOH__u32(rec->lk_suppgid);
+        r->ur_suppgid2 = NTOH__u32(-1);
         r->ur_fid1 = &rec->lk_fid1;
         r->ur_fid2 = &rec->lk_fid2;
 
@@ -494,7 +533,8 @@ static int mds_unlink_unpack(struct ptlrpc_request *req, int offset,
         r->ur_fsgid = NTOH__u32(rec->ul_fsgid);
         r->ur_cap = NTOH__u32(rec->ul_cap);
         r->ur_mode = NTOH__u32(rec->ul_mode);
-        r->ur_suppgid = NTOH__u32(rec->ul_suppgid);
+        r->ur_suppgid1 = NTOH__u32(rec->ul_suppgid);
+        r->ur_suppgid2 = NTOH__u32(-1);
         r->ur_fid1 = &rec->ul_fid1;
         r->ur_fid2 = &rec->ul_fid2;
 
@@ -516,6 +556,8 @@ static int mds_rename_unpack(struct ptlrpc_request *req, int offset,
         r->ur_fsuid = NTOH__u32(rec->rn_fsuid);
         r->ur_fsgid = NTOH__u32(rec->rn_fsgid);
         r->ur_cap = NTOH__u32(rec->rn_cap);
+        r->ur_suppgid1 = NTOH__u32(rec->rn_suppgid1);
+        r->ur_suppgid2 = NTOH__u32(rec->rn_suppgid2);
         r->ur_fid1 = &rec->rn_fid1;
         r->ur_fid2 = &rec->rn_fid2;
 
index a03d2bb..c77b5b8 100644 (file)
@@ -23,6 +23,9 @@
  */
 
 #define DEBUG_SUBSYSTEM S_OST
+#ifndef __KERNEL__
+#include <liblustre.h>
+#endif
 
 #include <linux/obd_ost.h>
 #include <linux/lustre_net.h>
index f5627ba..c0d4f31 100644 (file)
@@ -55,10 +55,14 @@ void push_ctxt(struct obd_run_ctxt *save, struct obd_run_ctxt *new_ctx,
         OBD_SET_CTXT_MAGIC(save);
 
         /*
-        CDEBUG(D_INFO, "== push %p->%p == cur fs %p pwd %p (%*s), pwdmnt %p\n",
+        CDEBUG(D_INFO,
+               "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%*s), pwdmnt %p:%d\n",
                save, current, current->fs, current->fs->pwd,
+               atomic_read(&current->fs->pwd->d_count),
+               atomic_read(&current->fs->pwd->d_inode->i_count),
                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
-               current->fs->pwdmnt);
+               current->fs->pwdmnt,
+               atomic_read(&current->fs->pwdmnt->mnt_count));
         */
 
         save->fs = get_fs();
@@ -80,17 +84,23 @@ void push_ctxt(struct obd_run_ctxt *save, struct obd_run_ctxt *new_ctx,
                 current->fsuid = uc->ouc_fsuid;
                 current->fsgid = uc->ouc_fsgid;
                 current->cap_effective = uc->ouc_cap;
-                if (uc->ouc_suppgid != -1)
-                        current->groups[current->ngroups++] = uc->ouc_suppgid;
+                if (uc->ouc_suppgid1 != -1)
+                        current->groups[current->ngroups++] = uc->ouc_suppgid1;
+                if (uc->ouc_suppgid2 != -1)
+                        current->groups[current->ngroups++] = uc->ouc_suppgid2;
         }
         set_fs(new_ctx->fs);
         set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
 
         /*
-        CDEBUG(D_INFO, "== push %p==%p == cur fs %p pwd %p (%*s), pwdmnt %p\n",
+        CDEBUG(D_INFO,
+               "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%*s), pwdmnt %p:%d\n",
                new_ctx, current, current->fs, current->fs->pwd,
+               atomic_read(&current->fs->pwd->d_count),
+               atomic_read(&current->fs->pwd->d_inode->i_count),
                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
-               current->fs->pwdmnt);
+               current->fs->pwdmnt,
+               atomic_read(&current->fs->pwdmnt->mnt_count));
         */
 }
 
@@ -103,39 +113,44 @@ void pop_ctxt(struct obd_run_ctxt *saved, struct obd_run_ctxt *new_ctx,
         ASSERT_KERNEL_CTXT("popping non-kernel context!\n");
 
         /*
-        CDEBUG(D_INFO, " == pop  %p==%p == cur %p pwd %p (%*s), pwdmnt %p\n",
+        CDEBUG(D_INFO,
+               " = pop  %p==%p = cur %p pwd %p:d%d:i%d (%*s), pwdmnt %p:%d\n",
                new_ctx, current, current->fs, current->fs->pwd,
+               atomic_read(&current->fs->pwd->d_count),
+               atomic_read(&current->fs->pwd->d_inode->i_count),
                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
-               current->fs->pwdmnt);
+               current->fs->pwdmnt,
+               atomic_read(&current->fs->pwdmnt->mnt_count));
         */
 
         LASSERT(current->fs->pwd == new_ctx->pwd);
         LASSERT(current->fs->pwdmnt == new_ctx->pwdmnt);
 
-        //printk("pc2");
         set_fs(saved->fs);
-        //printk("pc3\n");
         set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
-        //printk("pc4");
 
         dput(saved->pwd);
-        //printk("pc5");
         mntput(saved->pwdmnt);
-        //printk("pc6\n");
         if (uc) {
                 current->fsuid = saved->fsuid;
                 current->fsgid = saved->fsgid;
                 current->cap_effective = saved->cap;
 
-                if (uc->ouc_suppgid != -1)
+                if (uc->ouc_suppgid1 != -1)
+                        current->ngroups--;
+                if (uc->ouc_suppgid2 != -1)
                         current->ngroups--;
         }
 
         /*
-        CDEBUG(D_INFO, "== pop  %p->%p == cur fs %p pwd %p (%*s), pwdmnt %p\n",
+        CDEBUG(D_INFO,
+               "= pop  %p->%p = cur fs %p pwd %p:d%d:i%d (%*s), pwdmnt %p:%d\n",
                saved, current, current->fs, current->fs->pwd,
+               atomic_read(&current->fs->pwd->d_count),
+               atomic_read(&current->fs->pwd->d_inode->i_count),
                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
-               current->fs->pwdmnt);
+               current->fs->pwdmnt,
+               atomic_read(&current->fs->pwdmnt->mnt_count));
         */
 }
 
index 81638f1..590ae4b 100644 (file)
@@ -69,7 +69,8 @@ int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp,
         RETURN(0);
 }
 
-int target_handle_connect(struct ptlrpc_request *req)
+
+int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
 {
         struct obd_device *target;
         struct obd_export *export = NULL;
@@ -103,6 +104,11 @@ int target_handle_connect(struct ptlrpc_request *req)
         if (!target)
                 GOTO(out, rc = -ENODEV);
 
+        spin_lock_bh(&target->obd_processing_task_lock);
+        if (target->obd_flags & OBD_ABORT_RECOVERY)
+                target_abort_recovery(target);
+        spin_unlock_bh(&target->obd_processing_task_lock);
+
         conn.addr = req->rq_reqmsg->addr;
         conn.cookie = req->rq_reqmsg->cookie;
 
@@ -132,8 +138,11 @@ int target_handle_connect(struct ptlrpc_request *req)
                 spin_unlock(&target->obd_dev_lock);
 
         /* Tell the client if we're in recovery. */
-        if (target->obd_flags & OBD_RECOVERING)
+        /* If this is the first client, start the recovery timer */
+        if (target->obd_flags & OBD_RECOVERING) {
                 lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING);
+                target_start_recovery_timer(target, handler);
+        }
 
         /* Tell the client if we support replayable requests */
         if (target->obd_flags & OBD_REPLAYABLE)
@@ -151,16 +160,11 @@ int target_handle_connect(struct ptlrpc_request *req)
                 }
         }
 
-        if (rc == EALREADY) {
-                /* We indicate the reconnection in a flag, not an error code. */
-                lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT);
-                rc = 0;
-        } else if (rc) {
-                GOTO(out, rc);
-        }
-
         /* If all else goes well, this is our RPC return code. */
-        req->rq_status = rc;
+        req->rq_status = 0;
+
+        if (rc && rc != EALREADY)
+                GOTO(out, rc);
 
         req->rq_repmsg->addr = conn.addr;
         req->rq_repmsg->cookie = conn.cookie;
@@ -174,6 +178,12 @@ int target_handle_connect(struct ptlrpc_request *req)
                 ptlrpc_put_connection(req->rq_connection);
         req->rq_connection = ptlrpc_connection_addref(export->exp_connection);
 
+        if (rc == EALREADY) {
+                /* We indicate the reconnection in a flag, not an error code. */
+                lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT);
+                GOTO(out, rc = 0);
+        }
+
         spin_lock(&export->exp_connection->c_lock);
         list_add(&export->exp_conn_chain, &export->exp_connection->c_exports);
         spin_unlock(&export->exp_connection->c_lock);
@@ -224,6 +234,8 @@ static int target_disconnect_client(struct ptlrpc_connection *conn)
         list_for_each_safe(expiter, n, &conn->c_exports) {
                 exp = list_entry(expiter, struct obd_export, exp_conn_chain);
 
+                CDEBUG(D_HA, "disconnecting export %p/%s\n",
+                       exp, exp->exp_client_uuid.uuid);
                 hdl.addr = (__u64)(unsigned long)exp;
                 hdl.cookie = exp->exp_cookie;
                 rc = obd_disconnect(&hdl);
@@ -264,3 +276,248 @@ int target_revoke_connection(struct recovd_data *rd, int phase)
         LBUG();
         RETURN(-ENOSYS);
 }
+
+/*
+ * Recovery functions 
+ */
+
+/*
+ * Fail every request parked on obd->obd_delayed_reply_queue: each one is
+ * marked as an -ENOTCONN error reply, sent, unlinked from the queue and
+ * freed.
+ */
+static void abort_delayed_replies(struct obd_device *obd)
+{
+        struct list_head *pos, *next;
+
+        list_for_each_safe(pos, next, &obd->obd_delayed_reply_queue) {
+                struct ptlrpc_request *req =
+                        list_entry(pos, struct ptlrpc_request, rq_list);
+
+                DEBUG_REQ(D_ERROR, req, "aborted:");
+                req->rq_type = PTL_RPC_MSG_ERR;
+                req->rq_status = -ENOTCONN;
+                ptlrpc_reply(req->rq_svc, req);
+                list_del(&req->rq_list);
+                OBD_FREE(req, sizeof *req);
+        }
+}
+
+/*
+ * Give up on recovery for the obd_device passed in @data.
+ *
+ * Clears the recoverable-client count and the OBD_RECOVERING and
+ * OBD_ABORT_RECOVERY flags, fails all delayed replies, then disconnects
+ * every client.
+ *
+ * Locking: must be entered with obd->obd_processing_task_lock held (bh
+ * variant).  The lock is dropped around class_disconnect_all() and taken
+ * again before returning, so the caller still owns it on return.
+ */
+void target_abort_recovery(void *data)
+{
+        struct obd_device *obd = data;
+        CERROR("disconnecting clients and aborting recovery\n");
+        obd->obd_recoverable_clients = 0;
+        obd->obd_flags &= ~(OBD_RECOVERING | OBD_ABORT_RECOVERY);
+        abort_delayed_replies(obd);
+        /* class_disconnect_all() runs without the processing-task lock */
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+        class_disconnect_all(obd);
+        spin_lock_bh(&obd->obd_processing_task_lock);
+}
+
+/*
+ * Expiry callback installed on obd_recovery_timer by
+ * target_start_recovery_timer(); @castmeharder is the obd_device pointer
+ * stored in the timer's data field.
+ *
+ * It only flags the abort (OBD_ABORT_RECOVERY) and wakes waiters on
+ * obd_next_transno_waitq; the abort itself is carried out later by code
+ * that sees the flag and calls target_abort_recovery().
+ */
+static void target_recovery_expired(unsigned long castmeharder)
+{
+        struct obd_device *obd = (struct obd_device *)castmeharder;
+        CERROR("recovery timed out, aborting\n");
+        spin_lock_bh(&obd->obd_processing_task_lock);
+        obd->obd_flags |= OBD_ABORT_RECOVERY;
+        wake_up(&obd->obd_next_transno_waitq);
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+}
+
+/*
+ * (Re)arm obd_recovery_timer so that it fires OBD_RECOVERY_TIMEOUT jiffies
+ * from now.
+ */
+static void reset_recovery_timer(struct obd_device *obd)
+{
+        unsigned long expires;
+
+        CDEBUG(D_ERROR, "timer will expire in %ld seconds\n",
+               OBD_RECOVERY_TIMEOUT / HZ);
+
+        expires = jiffies + OBD_RECOVERY_TIMEOUT;
+        mod_timer(&obd->obd_recovery_timer, expires);
+}
+
+
+/*
+ * Start the recovery timer for @obd and record @handler as the routine
+ * process_recovery_queue() uses to replay queued requests.
+ *
+ * Only the first call has any effect: once obd_recovery_handler is set,
+ * later calls return without touching the timer.
+ */
+void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler)
+{
+        spin_lock_bh(&obd->obd_processing_task_lock);
+        if (obd->obd_recovery_handler) {
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+                return;
+        }
+        CERROR("%s: starting recovery timer\n", obd->obd_name);
+        obd->obd_recovery_handler = handler;
+        /* Initialise the timer first, then fill in its fields, so the
+         * callback and its argument cannot be affected by whatever
+         * init_timer() resets. */
+        init_timer(&obd->obd_recovery_timer);
+        obd->obd_recovery_timer.function = target_recovery_expired;
+        obd->obd_recovery_timer.data = (unsigned long)obd;
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+
+        reset_recovery_timer(obd);
+}
+
+/* Stop the pending recovery timer of @obd, if any. */
+static void cancel_recovery_timer(struct obd_device *obd)
+{
+        del_timer(&obd->obd_recovery_timer);
+}
+
+/*
+ * Wake-up condition for waiters on obd_next_transno_waitq.
+ *
+ * Returns non-zero when the request at the head of obd_recovery_queue
+ * carries the next expected transno, or when OBD_RECOVERING is no longer
+ * set in obd_flags.  Asserts that the head request is not older than the
+ * next expected transno.
+ */
+static int check_for_next_transno(struct obd_device *obd)
+{
+        struct list_head *first = obd->obd_recovery_queue.next;
+        struct ptlrpc_request *head =
+                list_entry(first, struct ptlrpc_request, rq_list);
+
+        LASSERT(head->rq_reqmsg->transno >= obd->obd_next_recovery_transno);
+
+        if (head->rq_reqmsg->transno == obd->obd_next_recovery_transno)
+                return 1;
+
+        return !(obd->obd_flags & OBD_RECOVERING);
+}
+
+/*
+ * Replay the requests on obd_recovery_queue in transno order.
+ *
+ * Must run in the task recorded in obd->obd_processing_task (asserted on
+ * every iteration).  Each pass looks at the head of the queue:
+ *  - if its transno is not the next expected one, sleep on
+ *    obd_next_transno_waitq until check_for_next_transno() is true, then
+ *    either abort (OBD_ABORT_RECOVERY set) or look at the queue again;
+ *  - otherwise unlink it, hand it to obd_recovery_handler, re-arm the
+ *    recovery timer, free the queued copy and advance
+ *    obd_next_recovery_transno.
+ * The loop ends when the queue is empty, at which point
+ * obd_processing_task is reset to 0.
+ *
+ * NOTE(review): the abort path returns without EXIT and without clearing
+ * obd_processing_task -- confirm this is intended.
+ */
+static void process_recovery_queue(struct obd_device *obd)
+{
+        struct ptlrpc_request *req;
+        int aborted = 0;
+        ENTRY;
+
+        for (;;) {
+                spin_lock_bh(&obd->obd_processing_task_lock);
+                LASSERT(obd->obd_processing_task == current->pid);
+                /* No emptiness check here: the queue is expected to hold
+                 * at least one request whenever we get to this point. */
+                req = list_entry(obd->obd_recovery_queue.next,
+                                 struct ptlrpc_request, rq_list);
+
+                if (req->rq_reqmsg->transno != obd->obd_next_recovery_transno) {
+                        /* Drop the lock before sleeping. */
+                        spin_unlock_bh(&obd->obd_processing_task_lock);
+                        CDEBUG(D_HA, "Waiting for transno "LPD64" (1st is "
+                               LPD64")\n",
+                               obd->obd_next_recovery_transno,
+                               req->rq_reqmsg->transno);
+                        wait_event(obd->obd_next_transno_waitq,
+                                   check_for_next_transno(obd));
+                        /* target_abort_recovery() expects the lock held. */
+                        spin_lock_bh(&obd->obd_processing_task_lock);
+                        if (obd->obd_flags & OBD_ABORT_RECOVERY) {
+                                target_abort_recovery(obd);
+                                aborted = 1;
+                        }
+                        spin_unlock_bh(&obd->obd_processing_task_lock);
+                        if (aborted)
+                                return;
+                        continue;
+                }
+                list_del_init(&req->rq_list);
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+
+                DEBUG_REQ(D_ERROR, req, "processing: ");
+                /* The handler's return value is deliberately ignored. */
+                (void)obd->obd_recovery_handler(req);
+                reset_recovery_timer(obd);
+#warning FIXME: mds_fsync_super(mds->mds_sb);
+                /* Queue entries are copies made by
+                 * target_queue_recovery_request(); release this one. */
+                OBD_FREE(req, sizeof *req);
+                spin_lock_bh(&obd->obd_processing_task_lock);
+                obd->obd_next_recovery_transno++;
+                if (list_empty(&obd->obd_recovery_queue)) {
+                        obd->obd_processing_task = 0;
+                        spin_unlock_bh(&obd->obd_processing_task_lock);
+                        break;
+                }
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+        }
+        EXIT;
+}
+
+/*
+ * Queue a copy of @req on obd->obd_recovery_queue, kept sorted by
+ * ascending transno, and make sure some task is draining the queue.
+ *
+ * Returns 1 without queueing when the request has no transno, or when the
+ * calling task is the one currently processing the queue (i.e. this is the
+ * replay of an already-queued request).  Returns 0 when a copy was queued;
+ * in that case the queue is processed either by the task already recorded
+ * in obd_processing_task or, if there is none, by the caller itself before
+ * this function returns.
+ *
+ * NOTE(review): presumably a return of 1 tells the caller to handle the
+ * request immediately -- confirm against the callers.
+ */
+int target_queue_recovery_request(struct ptlrpc_request *req,
+                                  struct obd_device *obd)
+{
+        struct list_head *tmp;
+        int inserted = 0;
+        __u64 transno = req->rq_reqmsg->transno;
+        struct ptlrpc_request *saved_req;
+
+        if (!transno) {
+                INIT_LIST_HEAD(&req->rq_list);
+                DEBUG_REQ(D_HA, req, "not queueing");
+                return 1;
+        }
+
+        spin_lock_bh(&obd->obd_processing_task_lock);
+
+        if (obd->obd_processing_task == current->pid) {
+                /* Processing the queue right now, don't re-add. */
+                LASSERT(list_empty(&req->rq_list));
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+                return 1;
+        }
+
+        /* The queue holds a private copy of the request structure; from
+         * here on `req' refers to that copy. */
+        OBD_ALLOC(saved_req, sizeof *saved_req);
+        if (!saved_req)
+                LBUG();
+        memcpy(saved_req, req, sizeof *req);
+        req = saved_req;
+        INIT_LIST_HEAD(&req->rq_list);
+
+        /* XXX O(n^2) */
+        /* Insert in front of the first queued request with a larger
+         * transno. */
+        list_for_each(tmp, &obd->obd_recovery_queue) {
+                struct ptlrpc_request *reqiter =
+                        list_entry(tmp, struct ptlrpc_request, rq_list);
+
+                if (reqiter->rq_reqmsg->transno > transno) {
+                        list_add_tail(&req->rq_list, &reqiter->rq_list);
+                        inserted = 1;
+                        break;
+                }
+        }
+
+        /* Nothing larger found: the new request goes to the end. */
+        if (!inserted) {
+                list_add_tail(&req->rq_list, &obd->obd_recovery_queue);
+        }
+
+        if (obd->obd_processing_task != 0) {
+                /* Someone else is processing this queue, we'll leave it to
+                 * them.
+                 */
+                if (transno == obd->obd_next_recovery_transno)
+                        wake_up(&obd->obd_next_transno_waitq);
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+                return 0;
+        }
+
+        /* Nobody is processing, and we know there's (at least) one to process
+         * now, so we'll do the honours.
+         */
+        obd->obd_processing_task = current->pid;
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+
+        process_recovery_queue(obd);
+        return 0;
+}
+
+/* Return the obd_device that owns the export @req arrived on. */
+struct obd_device * target_req2obd(struct ptlrpc_request *req)
+{
+        struct obd_export *exp = req->rq_export;
+
+        return exp->exp_obd;
+}
+
+/*
+ * Park the final recovery reply for @req until every recoverable client
+ * has finished recovery.
+ *
+ * A copy of @req is added to obd_delayed_reply_queue.  If @rc is non-zero
+ * the reply is first turned into an error reply (packed empty, type
+ * PTL_RPC_MSG_ERR).  When the last recoverable client checks in
+ * (obd_recoverable_clients drops to 0) the namespace locks are
+ * reprocessed, OBD_RECOVERING is cleared, every delayed reply is sent and
+ * freed, and the recovery timer is cancelled.
+ *
+ * Always returns 1.
+ */
+int target_queue_final_reply(struct ptlrpc_request *req, int rc)
+{
+        struct obd_device *obd = target_req2obd(req);
+        struct ptlrpc_request *saved_req;
+
+        spin_lock_bh(&obd->obd_processing_task_lock);
+        if (rc) {
+                /* Just like ptlrpc_error, but without the sending. */
+                lustre_pack_msg(0, NULL, NULL, &req->rq_replen,
+                                &req->rq_repmsg);
+                req->rq_type = PTL_RPC_MSG_ERR;
+        }
+
+        LASSERT(list_empty(&req->rq_list));
+        OBD_ALLOC(saved_req, sizeof *saved_req);
+        /* Same policy as target_queue_recovery_request(): an allocation
+         * failure is fatal rather than a NULL dereference in memcpy(). */
+        if (!saved_req)
+                LBUG();
+        memcpy(saved_req, req, sizeof *saved_req);
+        req = saved_req;
+        list_add(&req->rq_list, &obd->obd_delayed_reply_queue);
+        if (--obd->obd_recoverable_clients == 0) {
+                struct list_head *tmp, *n;
+                ldlm_reprocess_all_ns(req->rq_export->exp_obd->obd_namespace);
+                CDEBUG(D_ERROR,
+                       "all clients recovered, sending delayed replies\n");
+                obd->obd_flags &= ~OBD_RECOVERING;
+                list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) {
+                        req = list_entry(tmp, struct ptlrpc_request, rq_list);
+                        DEBUG_REQ(D_ERROR, req, "delayed:");
+                        ptlrpc_reply(req->rq_svc, req);
+                        list_del(&req->rq_list);
+                        OBD_FREE(req, sizeof *req);
+                }
+                cancel_recovery_timer(obd);
+        } else {
+                CERROR("%d recoverable clients remain\n",
+                       obd->obd_recoverable_clients);
+        }
+
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+        return 1;
+}
diff --git a/lustre/liblustre/.cvsignore b/lustre/liblustre/.cvsignore
new file mode 100644 (file)
index 0000000..fb1a186
--- /dev/null
@@ -0,0 +1,9 @@
+.Xrefs
+config.log
+config.status
+configure
+Makefile
+Makefile.in
+.deps
+TAGS
+libtest
diff --git a/lustre/liblustre/Makefile.am b/lustre/liblustre/Makefile.am
new file mode 100644 (file)
index 0000000..c761a22
--- /dev/null
@@ -0,0 +1,18 @@
+# Makefile for the liblustre test program (libtest)
+DEFS=
+
+CFLAGS:=-g -O2 -I$(top_srcdir)/utils -I$(PORTALS)/include  -I$(srcdir)/../include -Wall -L$(PORTALSLIB)
+
+KFLAGS:=
+CPPFLAGS = $(HAVE_LIBREADLINE)
+LIBS=
+LLIBS= ../lov/liblov.a ../obdecho/libobdecho.a ../osc/libosc.a ../ldlm/libldlm.a  ../ptlrpc/libptlrpc.a ../obdclass/liblustreclass.a
+
+libtest_LDADD := $(LIBREADLINE)  $(LLIBS) \
+                 $(PORTALS)/user/procbridge/libprocbridge.a  $(PORTALS)/user/tcpnal/libtcpnal.a \
+                $(PORTALS)/user/util/libtcpnalutil.a $(PORTALS)/user/$(PORTALS)/api/libptlapi.a \
+                 $(PORTALS)/lib/libptllib.a -lptlctl -lpthread -lefence
+bin_PROGRAMS = libtest
+libtest_SOURCES = libtest.c
+
+include $(top_srcdir)/Rules
diff --git a/lustre/liblustre/libtest.c b/lustre/liblustre/libtest.c
new file mode 100644 (file)
index 0000000..2941398
--- /dev/null
@@ -0,0 +1,114 @@
+#include <stdio.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+
+#include <portals/api-support.h> /* needed for ptlctl.h */
+#include <portals/ptlctl.h>    /* needed for parse_dump */
+
+
+#include <liblustre.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <../user/procbridge/procbridge.h>
+
+ptl_handle_ni_t         tcpnal_ni;
+
+/*
+ * Start-up arguments of the test program.  Within this file only mynid is
+ * used: main() fills it from argv[1] and init_lib_portals() copies it to
+ * tcpnal_mynid.  The remaining fields are not referenced here.
+ */
+struct pingcli_args {
+        ptl_nid_t mynid;        /* local NID, taken from argv[1] */
+        ptl_nid_t nid;
+       ptl_pid_t port;
+        int count;
+        int size;
+};
+
+struct task_struct *current;
+
+struct obd_class_user_state ocus;
+
+/* portals interfaces */
+/*
+ * Return the network interface handle to use.  The @nal argument is
+ * ignored: the single global tcpnal_ni is returned for every value.
+ */
+inline const ptl_handle_ni_t *
+kportal_get_ni (int nal)
+{
+        return &tcpnal_ni;
+}
+/* Counterpart of kportal_get_ni(); does nothing in this program. */
+inline void
+kportal_put_ni (int nal)
+{
+        return;
+}
+
+/*
+ * Allocate the global `current' task stand-in and fill in its command
+ * name (from argv[0]) and process id.  Exits the program if the
+ * allocation fails.  @argc is unused.
+ */
+void init_current(int argc, char **argv)
+{
+        current = malloc(sizeof(*current));
+        if (!current) {
+                printf("Malloc error\n");
+                exit(1);
+        }
+
+        /* strncpy() leaves the buffer unterminated when the source is at
+         * least as long as the destination, so copy one byte less and
+         * terminate explicitly. */
+        strncpy(current->comm, argv[0], sizeof(current->comm) - 1);
+        current->comm[sizeof(current->comm) - 1] = '\0';
+        current->pid = getpid();
+}
+
+ptl_nid_t tcpnal_mynid;
+
+/*
+ * Bring up the portals library and the network interface used by this
+ * program.  Records args->mynid in tcpnal_mynid, then initialises
+ * tcpnal_ni through procbridge_interface.
+ *
+ * Returns 0 on success (after enabling all NI debug bits); on failure the
+ * portals library is shut down again and the PtlNIInit() error code is
+ * returned.
+ */
+int init_lib_portals(struct pingcli_args *args)
+{
+        int rc;
+
+        PtlInit();
+        tcpnal_mynid = args->mynid;
+
+        rc = PtlNIInit(procbridge_interface, 0, 0, 0, &tcpnal_ni);
+        if (rc == 0) {
+                PtlNIDebug(tcpnal_ni, ~0);
+                return rc;
+        }
+
+        CERROR("ksocknal: PtlNIInit failed: error %d\n", rc);
+        PtlFini();
+        RETURN (rc);
+}
+
+extern int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, unsigned long arg);
+
+
+/*
+ * Callback handed to parse_dump(): replays one recorded ioctl.
+ *
+ * Only ioctls addressed to OBD_DEV_ID are acted upon; they are passed to
+ * class_handle_ioctl() and then logged to stdout.  The result of
+ * class_handle_ioctl() is not examined and 0 is returned in every case.
+ */
+int lib_ioctl(int dev_id, int opc, void * ptr)
+{
+        struct obd_ioctl_data *ioc;
+
+        if (dev_id != OBD_DEV_ID)
+                return 0;
+
+        class_handle_ioctl(&ocus, opc, (unsigned long)ptr);
+
+        /* you _may_ need to call obd_ioctl_unpack or some
+           other verification function if you want to use ioc
+           directly here */
+        ioc = ptr;
+        printf ("processing ioctl cmd: %x buf len: %d\n",
+                opc,  ioc->ioc_len);
+
+        return 0;
+}
+
+/*
+ * Test driver: initialises the userspace Lustre pieces and replays the
+ * ioctls recorded in /tmp/DUMP_FILE through lib_ioctl().
+ *
+ * argv[1] is the local address (dotted-quad IP) used as this node's NID.
+ * Exits with status 1 on a missing argument, allocation failure or
+ * portals initialisation failure.
+ */
+int main(int argc, char **argv)
+{
+        struct pingcli_args *args;
+        int rc;
+
+        /* argv[1] is dereferenced below, so it has to be present. */
+        if (argc < 2) {
+                printf("usage: %s <local-ip-address>\n",
+                       argc > 0 ? argv[0] : "libtest");
+                exit(1);
+        }
+
+        args = malloc(sizeof(*args));
+        if (!args) {
+                printf("Malloc error\n");
+                exit(1);
+        }
+
+        args->mynid = ntohl(inet_addr(argv[1]));
+        INIT_LIST_HEAD(&ocus.ocus_conns);
+
+        init_current(argc, argv);
+        init_obdclass();
+
+        /* On failure init_lib_portals() has already shut portals down,
+         * so nothing that follows could work. */
+        rc = init_lib_portals(args);
+        if (rc != 0) {
+                printf("portals initialization failed: %d\n", rc);
+                exit(1);
+        }
+
+        ptlrpc_init();
+        ldlm_init();
+        osc_init();
+        echo_client_init();
+        /* XXX  need mdc_getlovinfo before lov_init can work.. */
+        //        lov_init();
+
+        parse_dump("/tmp/DUMP_FILE", lib_ioctl);
+
+        printf("Hello\n");
+        return 0;
+}
+
index a62716b..0e17c1a 100644 (file)
@@ -72,7 +72,11 @@ static int ll_commitcbd_main(void *arg)
         unlock_kernel();
 
         /* Record that the  thread is running */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         sbi->ll_commitcbd_waketime = CURRENT_TIME;
+#else
+        sbi->ll_commitcbd_waketime = CURRENT_TIME.tv_sec;
+#endif
         sbi->ll_commitcbd_timeout = 10 * HZ;
         sbi->ll_commitcbd_thread = current;
         sbi->ll_commitcbd_flags =  LL_COMMITCBD_RUNNING;
index 0286cc6..41c68d9 100644 (file)
@@ -62,14 +62,10 @@ void ll_intent_release(struct dentry *de, struct lookup_intent *it)
         ENTRY;
 
         LASSERT(ll_d2d(de) != NULL);
-        mdc_put_rpc_lock(&mdc_rpc_lock, it);
 
         if (it->it_lock_mode) {
                 handle = (struct lustre_handle *)it->it_lock_handle;
-                if (it->it_op == IT_SETATTR)
-                        ldlm_lock_decref_and_cancel(handle, it->it_lock_mode);
-                else
-                        ldlm_lock_decref(handle, it->it_lock_mode);
+                ldlm_lock_decref(handle, it->it_lock_mode);
 
                 /* intent_release may be called multiple times, from
                    this thread and we don't want to double-decref this
@@ -159,9 +155,6 @@ int ll_revalidate2(struct dentry *de, int flags, struct lookup_intent *it)
                 RETURN(0);
         }
 
-        if (it && it->it_op == IT_TRUNC)
-                it->it_op = IT_SETATTR;
-
         if (it == NULL || it->it_op == IT_GETATTR) {
                 /* We could just return 1 immediately, but since we should only
                  * be called in revalidate2 if we already have a lock, let's
@@ -209,9 +202,7 @@ int ll_revalidate2(struct dentry *de, int flags, struct lookup_intent *it)
         }
 
         rc = ll_intent_lock(de->d_parent->d_inode, &de, it, revalidate2_finish);
-        if (rc == -ESTALE)
-                RETURN(0);
-        if (rc < 0 && it->it_status) {
+        if (rc < 0) {
                 CERROR("ll_intent_lock: rc %d : it->it_status %d\n", rc,
                        it->it_status);
                 RETURN(0);
index 072eeea..21192aa 100644 (file)
@@ -55,8 +55,10 @@ typedef struct ext2_dir_entry_2 ext2_dirent;
 #define SetPageChecked(page)     set_bit(PG_checked, &(page)->flags)
 
 
-static int ll_dir_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to)
+static int ll_dir_prepare_write(struct file *file, struct page *page,
+                                unsigned from, unsigned to)
 {
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         return 0;
 }
 
@@ -75,6 +77,7 @@ static int ll_dir_readpage(struct file *file, struct page *page)
 
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         if ((inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT <= page->index){
                 /* XXX why do we need this exactly, and why do we think that
                  *     an all-zero directory page is useful?
@@ -124,11 +127,10 @@ static int ll_dir_readpage(struct file *file, struct page *page)
 
         unlock_page(page);
         ll_unlock(LCK_PR, &lockh);
-        mdc_put_rpc_lock(&mdc_rpc_lock, &it);
         if (rc != ELDLM_OK)
                 CERROR("ll_unlock: err: %d\n", rc);
         return rc;
-} /* ll_dir_readpage */
+}
 
 struct address_space_operations ll_dir_aops = {
         readpage: ll_dir_readpage,
@@ -185,7 +187,9 @@ static int ext2_commit_chunk(struct page *page, unsigned from, unsigned to)
         loff_t new_size = (page->index << PAGE_CACHE_SHIFT) + to;
         int err = 0;
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         dir->i_version = ++event;
+#endif
         if (new_size > dir->i_size)
                 dir->i_size = new_size;
         SetPageUptodate(page);
@@ -394,6 +398,7 @@ int ll_readdir(struct file * filp, void * dirent, filldir_t filldir)
         int need_revalidate = (filp->f_version != inode->i_version);
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
                 GOTO(done, 0);
 
@@ -759,6 +764,7 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file,
         struct ll_sb_info *sbi = ll_i2sbi(inode);
         struct obd_ioctl_data *data;
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
 
         switch(cmd) {
         case IOC_MDC_LOOKUP: {
index 1e26110..ff5d1d6 100644 (file)
@@ -91,8 +91,8 @@ static int ll_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode,
         } else {
                 /* No transno means that we can just drop our ref. */
                 spin_unlock_irqrestore(&imp->imp_lock, flags);
-                ptlrpc_req_finished(fd->fd_req);
         }
+        ptlrpc_req_finished(fd->fd_req);
 
         /* Do this after the fd_req->rq_transno check, because we don't want
          * to bounce off zero references. */
@@ -129,29 +129,20 @@ static int ll_file_release(struct inode *inode, struct file *file)
                 oa.o_id = lsm->lsm_object_id;
                 oa.o_mode = S_IFREG;
                 oa.o_valid = OBD_MD_FLTYPE | OBD_MD_FLID;
-                obd_handle2oa(&oa, &fd->fd_osthandle);
+
+                memcpy(&oa.o_inline, fd->fd_ostdata, FD_OSTDATA_SIZE);
+                oa.o_valid |= OBD_MD_FLHANDLE;
+
                 rc = obd_close(&sbi->ll_osc_conn, &oa, lsm, NULL);
                 if (rc)
                         CERROR("inode %lu object close failed: rc = %d\n",
                                inode->i_ino, rc);
         }
 
-        mdc_get_rpc_lock(&mdc_rpc_lock, NULL);
         rc2 = ll_mdc_close(&sbi->ll_mdc_conn, inode, file);
-        mdc_put_rpc_lock(&mdc_rpc_lock, NULL);
         if (rc2 && !rc)
                 rc = rc2;
 
-        if (atomic_dec_and_test(&lli->lli_open_count)) {
-                CDEBUG(D_INFO, "last close, cancelling unused locks\n");
-                rc2 = obd_cancel_unused(&sbi->ll_osc_conn, lsm, 0);
-                if (rc2 && !rc) {
-                        rc = rc2;
-                        CERROR("obd_cancel_unused: %d\n", rc);
-                }
-        } else
-                CDEBUG(D_INFO, "not last close, not cancelling unused locks\n");
-
         RETURN(rc);
 }
 
@@ -191,22 +182,23 @@ static int ll_osc_open(struct lustre_handle *conn, struct inode *inode,
                 RETURN(-ENOMEM);
         oa->o_id = lsm->lsm_object_id;
         oa->o_mode = S_IFREG;
-        oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
-                OBD_MD_FLBLOCKS | OBD_MD_FLMTIME | OBD_MD_FLCTIME;
+        oa->o_valid = (OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
+                       OBD_MD_FLBLOCKS | OBD_MD_FLMTIME | OBD_MD_FLCTIME);
         rc = obd_open(conn, oa, lsm, NULL);
         if (rc)
                 GOTO(out, rc);
 
         file->f_flags &= ~O_LOV_DELAY_CREATE;
-        obdo_to_inode(inode, oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
-                                OBD_MD_FLMTIME | OBD_MD_FLCTIME);
+        obdo_to_inode(inode, oa, (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
+                                  OBD_MD_FLMTIME | OBD_MD_FLCTIME));
 
-        obd_oa2handle(&fd->fd_osthandle, oa);
+        if (oa->o_valid & OBD_MD_FLHANDLE) /* test the bit, don't set it */
+                memcpy(fd->fd_ostdata, obdo_handle(oa), FD_OSTDATA_SIZE);
 
-        atomic_inc(&ll_i2info(inode)->lli_open_count);
+        EXIT;
 out:
         obdo_free(oa);
-        RETURN(rc);
+        return rc;
 }
 
 /* Caller must hold lli_open_sem to protect lli->lli_smd from changing and
@@ -220,9 +212,9 @@ static int ll_create_obj(struct lustre_handle *conn, struct inode *inode,
         struct ptlrpc_request *req = NULL;
         struct ll_inode_info *lli = ll_i2info(inode);
         struct lov_mds_md *lmm = NULL;
-        int lmm_size = 0;
         struct obdo *oa;
-        int rc, err;
+        struct iattr iattr;
+        int rc, err, lmm_size = 0; /* lmm_size goes to mdc_setattr with lmm */
         ENTRY;
 
         oa = obdo_alloc();
@@ -259,7 +251,9 @@ static int ll_create_obj(struct lustre_handle *conn, struct inode *inode,
         lmm_size = rc;
 
         /* Save the stripe MD with this file on the MDS */
-        rc = mdc_setattr(&ll_i2sbi(inode)->ll_mdc_conn, inode, NULL,
+        memset(&iattr, 0, sizeof(iattr));
+        iattr.ia_valid = ATTR_FROM_OPEN;
+        rc = mdc_setattr(&ll_i2sbi(inode)->ll_mdc_conn, inode, &iattr,
                          lmm, lmm_size, &req);
         ptlrpc_req_finished(req);
 
@@ -298,8 +292,8 @@ out_destroy:
  * lli_open_sem to ensure no other process will create objects, send the
  * stripe MD to the MDS, or try to destroy the objects if that fails.
  *
- * If we already have the stripe MD locally, we don't request it in
- * mdc_open() by passing a lmm_size = 0.
+ * If we already have the stripe MD locally then we don't request it in
+ * mdc_open(), by passing a lmm_size = 0.
  *
  * It is up to the application to ensure no other processes open this file
  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
@@ -319,6 +313,7 @@ static int ll_file_open(struct inode *inode, struct file *file)
         int rc = 0;
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         LL_GET_INTENT(file->f_dentry, it);
         rc = ll_it_open_error(IT_OPEN_OPEN, it);
         if (rc)
@@ -411,8 +406,7 @@ int ll_size_unlock(struct inode *inode, struct lov_stripe_md *lsm, int mode,
  *     keeps an atomic flag in the inode which indicates whether the size
  *     has been updated (see bug 280).
  */
-int ll_file_size(struct inode *inode, struct lov_stripe_md *lsm,
-                 struct lustre_handle *handle)
+int ll_file_size(struct inode *inode, struct lov_stripe_md *lsm, char *ostdata)
 {
         struct ll_sb_info *sbi = ll_i2sbi(inode);
         struct obdo oa;
@@ -427,12 +421,17 @@ int ll_file_size(struct inode *inode, struct lov_stripe_md *lsm,
         oa.o_mode = S_IFREG;
         oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
                 OBD_MD_FLBLOCKS | OBD_MD_FLMTIME | OBD_MD_FLCTIME;
-        obd_handle2oa(&oa, handle);
+
+        if (ostdata != NULL) {
+                memcpy(&oa.o_inline, ostdata, FD_OSTDATA_SIZE);
+                oa.o_valid |= OBD_MD_FLHANDLE;
+        }
+
         rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm);
         if (!rc) {
                 obdo_to_inode(inode, &oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
                                         OBD_MD_FLMTIME | OBD_MD_FLCTIME);
-                CDEBUG(D_INODE, "objid "LPX64" size %Lu/%Lu\n",
+                CDEBUG(D_INODE, "objid "LPX64" size %Lu/%Lx\n",
                        lsm->lsm_object_id, inode->i_size, inode->i_size);
         }
 
@@ -459,7 +458,11 @@ static void ll_update_atime(struct inode *inode)
 #ifdef USE_ATIME
         struct iattr attr;
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         attr.ia_atime = CURRENT_TIME;
+#else
+        attr.ia_atime = CURRENT_TIME.tv_sec;
+#endif
         attr.ia_valid = ATTR_ATIME;
 
         if (inode->i_atime == attr.ia_atime) return;
@@ -481,6 +484,7 @@ int ll_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
         struct lustre_handle lockh = { 0, 0 };
         int rc;
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
 
         if (inode == NULL)
                 LBUG();
@@ -519,11 +523,12 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
         ssize_t retval;
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         if (!(fd->fd_flags & LL_FILE_IGNORE_LOCK) &&
             !(sbi->ll_flags & LL_SBI_NOLCK)) {
                 struct ldlm_extent extent;
                 extent.start = *ppos;
-                extent.end = *ppos + count;
+                extent.end = *ppos + count - 1;
                 CDEBUG(D_INFO, "Locking inode %lu, start "LPU64" end "LPU64"\n",
                        inode->i_ino, extent.start, extent.end);
 
@@ -538,8 +543,8 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
         }
 
         /* If we don't refresh the file size, generic_file_read may not even
-         * call us */
-        retval = ll_file_size(inode, lsm, &fd->fd_osthandle);
+         * call ll_readpage */
+        retval = ll_file_size(inode, lsm, fd->fd_ostdata);
         if (retval < 0) {
                 CERROR("ll_file_size: "LPSZ"\n", retval);
                 RETURN(retval);
@@ -580,13 +585,18 @@ ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
         ssize_t retval;
         ENTRY;
 
+        /* POSIX, but surprised the VFS doesn't check this already */
+        if (count == 0)
+                return 0;
+
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND) {
                 err = ll_size_lock(inode, lsm, 0, LCK_PW, &eof_lockh);
                 if (err)
                         RETURN(err);
 
                 /* Get size here so we know extent to enqueue write lock on. */
-                retval = ll_file_size(inode, lsm, &fd->fd_osthandle);
+                retval = ll_file_size(inode, lsm, fd->fd_ostdata);
                 if (retval)
                         GOTO(out_eof, retval);
 
@@ -597,7 +607,7 @@ ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
             !(sbi->ll_flags & LL_SBI_NOLCK)) {
                 struct ldlm_extent extent;
                 extent.start = *ppos;
-                extent.end = *ppos + count;
+                extent.end = *ppos + count - 1;
                 CDEBUG(D_INFO, "Locking inode %lu, start "LPU64" end "LPU64"\n",
                        inode->i_ino, extent.start, extent.end);
 
@@ -692,9 +702,12 @@ int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
         struct lustre_handle *conn;
         int flags;
 
-        switch(cmd) {
-        case TCGETS:
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
+
+        if ((cmd & 0xffffff00) == ((int)'T') << 8) /* tty ioctls */
                 return -ENOTTY;
+
+        switch(cmd) {
         case LL_IOC_GETFLAGS:
                 /* Get the current value of the file flags */
                 return put_user(fd->fd_flags, (int *)arg);
@@ -739,12 +752,13 @@ loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
         long long retval;
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         switch (origin) {
         case 2: {
                 struct ll_inode_info *lli = ll_i2info(inode);
                 struct ll_file_data *fd = file->private_data;
 
-                retval = ll_file_size(inode, lli->lli_smd, &fd->fd_osthandle);
+                retval = ll_file_size(inode, lli->lli_smd, fd->fd_ostdata);
                 if (retval)
                         RETURN(retval);
 
@@ -760,8 +774,8 @@ loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
                         file->f_pos = offset;
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
                         file->f_reada = 0;
-#endif
                         file->f_version = ++event;
+#endif
                 }
                 retval = offset;
         }
@@ -781,6 +795,7 @@ int ll_inode_revalidate(struct dentry *dentry)
         struct lov_stripe_md *lsm;
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         if (!inode) {
                 CERROR("REPORT THIS LINE TO PETER\n");
                 RETURN(0);
@@ -814,6 +829,13 @@ int ll_inode_revalidate(struct dentry *dentry)
                 }
 
                 body = lustre_msg_buf(req->rq_repmsg, 0);
+
+                if (S_ISREG(inode->i_mode) &&
+                    body->valid & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS)) {
+                        CERROR("MDS sent back size for regular file\n");
+                        body->valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
+                }
+
                 if (body->valid & OBD_MD_FLEASIZE)
                         ll_update_inode(inode, body,
                                         lustre_msg_buf(req->rq_repmsg, 1));
@@ -836,7 +858,26 @@ int ll_inode_revalidate(struct dentry *dentry)
 static int ll_getattr(struct vfsmount *mnt, struct dentry *de,
                       struct kstat *stat)
 {
-        return ll_inode_revalidate(de);
+        int res = 0;
+        struct inode *inode = de->d_inode;
+
+        res = ll_inode_revalidate(de);
+        if (res)
+                return res;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        stat->dev = inode->i_dev;
+#endif
+        stat->ino = inode->i_ino;
+        stat->mode = inode->i_mode;
+        stat->nlink = inode->i_nlink;
+        stat->uid = inode->i_uid;
+        stat->gid = inode->i_gid;
+        stat->rdev = kdev_t_to_nr(inode->i_rdev);
+        stat->atime = inode->i_atime;
+        stat->mtime = inode->i_mtime;
+        stat->ctime = inode->i_ctime;
+        stat->size = inode->i_size;
+        return 0;
 }
 #endif
 
@@ -852,6 +893,7 @@ struct file_operations ll_file_operations = {
 };
 
 struct inode_operations ll_file_inode_operations = {
+        setattr_raw:    ll_setattr_raw,
         setattr:    ll_setattr,
         truncate:   ll_truncate,
 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
@@ -862,6 +904,7 @@ struct inode_operations ll_file_inode_operations = {
 };
 
 struct inode_operations ll_special_inode_operations = {
+        setattr_raw:    ll_setattr_raw,
         setattr:    ll_setattr,
 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
         getattr:    ll_getattr,
index 8989a82..b5e6620 100644 (file)
  */
 #define DEBUG_SUBSYSTEM S_LLITE
 
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <asm/statfs.h>
+#endif
 #include <linux/lustre_lite.h>
 #include <linux/lprocfs_status.h>
 
@@ -34,20 +38,27 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
 }
 #else
 
-long long mnt_instance;
-
-static inline int lprocfs_llite_statfs(void *data, struct statfs *sfs)
-{
-        struct super_block *sb = (struct super_block*)data;
-        return (sb->s_op->statfs)(sb, sfs);
+#define LPROC_LLITE_STAT_FCT(fct_name, get_statfs_fct)                    \
+int fct_name(char *page, char **start, off_t off,                         \
+             int count, int *eof, void *data)                             \
+{                                                                         \
+        struct statfs sfs;                                                \
+        int rc;                                                           \
+        LASSERT(data != NULL);        /* data is used as a super_block */ \
+        rc = get_statfs_fct((struct super_block*)data, &sfs);             \
+        return (rc==0                 /* on success use lprocfs_<name> */ \
+                ? lprocfs_##fct_name (page, start, off, count, eof, &sfs) \
+                : rc);                /* otherwise pass the error back */ \
 }
 
-DEFINE_LPROCFS_STATFS_FCT(rd_blksize,     lprocfs_llite_statfs);
-DEFINE_LPROCFS_STATFS_FCT(rd_kbytestotal, lprocfs_llite_statfs);
-DEFINE_LPROCFS_STATFS_FCT(rd_kbytesfree,  lprocfs_llite_statfs);
-DEFINE_LPROCFS_STATFS_FCT(rd_filestotal,  lprocfs_llite_statfs);
-DEFINE_LPROCFS_STATFS_FCT(rd_filesfree,   lprocfs_llite_statfs);
-DEFINE_LPROCFS_STATFS_FCT(rd_filegroups,  lprocfs_llite_statfs);
+long long mnt_instance;
+
+LPROC_LLITE_STAT_FCT(rd_blksize,     vfs_statfs);
+LPROC_LLITE_STAT_FCT(rd_kbytestotal, vfs_statfs);
+LPROC_LLITE_STAT_FCT(rd_kbytesfree,  vfs_statfs);
+LPROC_LLITE_STAT_FCT(rd_filestotal,  vfs_statfs);
+LPROC_LLITE_STAT_FCT(rd_filesfree,   vfs_statfs);
+LPROC_LLITE_STAT_FCT(rd_filegroups,  vfs_statfs);
 
 int rd_path(char *page, char **start, off_t off, int count, int *eof,
             void *data)
@@ -60,6 +71,7 @@ int rd_fstype(char *page, char **start, off_t off, int count, int *eof,
 {
         struct super_block *sb = (struct super_block*)data;
 
+        LASSERT(sb != NULL);
         *eof = 1;
         return snprintf(page, count, "%s\n", sb->s_type->name);
 }
@@ -69,6 +81,7 @@ int rd_sb_uuid(char *page, char **start, off_t off, int count, int *eof,
 {
         struct super_block *sb = (struct super_block *)data;
 
+        LASSERT(sb != NULL);
         *eof = 1;
         return snprintf(page, count, "%s\n", ll_s2sbi(sb)->ll_sb_uuid.uuid);
 }
@@ -103,14 +116,20 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
         name[MAX_STRING_SIZE] = '\0';
         lvars[0].name = name;
 
+        LASSERT(sbi != NULL);
+        LASSERT(mdc != NULL);
+        LASSERT(osc != NULL);
+
         /* Mount info */
         snprintf(name, MAX_STRING_SIZE, "fs%llu", mnt_instance);
 
         mnt_instance++;
         sbi->ll_proc_root = lprocfs_register(name, parent, NULL, NULL);
-        if (IS_ERR(sbi->ll_proc_root))
-                RETURN(err = PTR_ERR(sbi->ll_proc_root));
-
+        if (IS_ERR(sbi->ll_proc_root)) {
+                err = PTR_ERR(sbi->ll_proc_root);
+                sbi->ll_proc_root = NULL;
+                RETURN(err);
+        }
         /* Static configuration info */
         err = lprocfs_add_vars(sbi->ll_proc_root, lprocfs_obd_vars, sb);
         if (err)
@@ -119,6 +138,11 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
         /* MDC info */
         strncpy(uuid.uuid, mdc, sizeof(uuid.uuid));
         obd = class_uuid2obd(&uuid);
+
+        LASSERT(obd != NULL);
+        LASSERT(obd->obd_type != NULL);
+        LASSERT(obd->obd_type->typ_name != NULL);
+
         snprintf(name, MAX_STRING_SIZE, "%s/common_name",
                  obd->obd_type->typ_name);
         lvars[0].read_fptr = lprocfs_rd_name;
@@ -136,6 +160,10 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
         strncpy(uuid.uuid, osc, sizeof(uuid.uuid));
         obd = class_uuid2obd(&uuid);
 
+        LASSERT(obd != NULL);
+        LASSERT(obd->obd_type != NULL);
+        LASSERT(obd->obd_type->typ_name != NULL);
+
         snprintf(name, MAX_STRING_SIZE, "%s/common_name",
                  obd->obd_type->typ_name);
         lvars[0].read_fptr = lprocfs_rd_name;
index efd5e0d..449cac7 100644 (file)
@@ -151,7 +151,7 @@ struct inode *ll_iget(struct super_block *sb, ino_t hash,
 static int ll_intent_to_lock_mode(struct lookup_intent *it)
 {
         /* CREAT needs to be tested before open (both could be set) */
-        if (it->it_op & (IT_CREAT | IT_SETATTR))
+        if (it->it_op & IT_CREAT)
                 return LCK_PW;
         else if (it->it_op & (IT_READDIR | IT_GETATTR | IT_OPEN | IT_LOOKUP))
                 return LCK_PR;
@@ -201,6 +201,10 @@ int ll_intent_lock(struct inode *parent, struct dentry **de,
         obd_id ino = 0;
         ENTRY;
 
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+        if (it && it->it_op == 0)
+                *it = lookup_it;
+#endif
         if (it == NULL)
                 it = &lookup_it;
 
@@ -294,6 +298,8 @@ int ll_intent_lock(struct inode *parent, struct dentry **de,
 
                         if (it->it_disposition & IT_OPEN_CREATE)
                                 ptlrpc_request_addref(request);
+                        if (it->it_disposition & IT_OPEN_OPEN)
+                                ptlrpc_request_addref(request);
 
                         if (it->it_disposition & IT_OPEN_NEG)
                                 flag = LL_LOOKUP_NEGATIVE;
@@ -313,7 +319,7 @@ int ll_intent_lock(struct inode *parent, struct dentry **de,
                                 flag = LL_LOOKUP_NEGATIVE;
                         else
                                 flag = LL_LOOKUP_POSITIVE;
-                } else if (it->it_op & (IT_GETATTR | IT_SETATTR | IT_LOOKUP)) {
+                } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) {
                         /* For check ops, we want the lookup to succeed */
                         it->it_data = NULL;
                         if (it->it_status)
@@ -420,8 +426,8 @@ struct dentry *ll_find_alias(struct inode *inode, struct dentry *de)
                         list_del_init(&dentry->d_lru);
 
                 list_del_init(&dentry->d_hash);
+                __d_rehash(dentry, 0); /* avoid taking dcache_lock inside */
                 spin_unlock(&dcache_lock);
-                d_rehash(dentry);
                 atomic_inc(&dentry->d_count);
                 iput(inode);
                 dentry->d_flags &= ~DCACHE_LUSTRE_INVALID;
@@ -491,9 +497,7 @@ static struct dentry *ll_lookup2(struct inode *parent, struct dentry *dentry,
         int rc;
         ENTRY;
 
-        if (it && it->it_op == IT_TRUNC)
-                it->it_op = IT_SETATTR;
-
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         rc = ll_intent_lock(parent, &dentry, it, lookup2_finish);
         if (rc < 0) {
                 CDEBUG(D_INFO, "ll_intent_lock: %d\n", rc);
@@ -515,7 +519,11 @@ static struct inode *ll_create_node(struct inode *dir, const char *name,
         struct inode *inode;
         struct ptlrpc_request *request = NULL;
         struct mds_body *body;
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+        time_t time = CURRENT_TIME.tv_sec;
+#else
         time_t time = CURRENT_TIME;
+#endif
         struct ll_sb_info *sbi = ll_i2sbi(dir);
         struct ll_read_inode2_cookie lic = { .lic_lmm = NULL, };
         ENTRY;
@@ -600,7 +608,6 @@ static int ll_mdc_unlink(struct inode *dir, struct inode *child, __u32 mode,
 
         err = mdc_enqueue(&sbi->ll_mdc_conn, LDLM_PLAIN, &it, LCK_EX, dir,
                          NULL, &lockh, NULL, 0, &data, sizeof(data));
-        mdc_put_rpc_lock(&mdc_rpc_lock, &it); 
         request = (struct ptlrpc_request *)it.it_data;
         if (err < 0)
                 GOTO(out, err);
@@ -663,6 +670,7 @@ static int ll_create(struct inode *dir, struct dentry *dentry, int mode)
         int rc = 0;
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         it = dentry->d_it;
 
         rc = ll_it_open_error(IT_OPEN_CREATE, it);
@@ -694,11 +702,16 @@ static int ll_mknod2(struct inode *dir, const char *name, int len, int mode,
                      int rdev)
 {
         struct ptlrpc_request *request = NULL;
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+        time_t time = CURRENT_TIME.tv_sec;
+#else
         time_t time = CURRENT_TIME;
+#endif
         struct ll_sb_info *sbi = ll_i2sbi(dir);
         int err = -EMLINK;
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         if (dir->i_nlink >= EXT2_LINK_MAX)
                 RETURN(err);
 
@@ -730,6 +743,7 @@ static int ll_mknod(struct inode *dir, struct dentry *dentry, int mode,
         struct inode *inode;
         int rc = 0;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         LL_GET_INTENT(dentry, it);
 
         if ((mode & S_IFMT) == 0)
@@ -753,11 +767,16 @@ static int ll_symlink2(struct inode *dir, const char *name, int len,
                        const char *tgt)
 {
         struct ptlrpc_request *request = NULL;
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+        time_t time = CURRENT_TIME.tv_sec;
+#else
         time_t time = CURRENT_TIME;
+#endif
         struct ll_sb_info *sbi = ll_i2sbi(dir);
         int err = -EMLINK;
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         if (dir->i_nlink >= EXT2_LINK_MAX)
                 RETURN(err);
 
@@ -778,6 +797,7 @@ static int ll_symlink(struct inode *dir, struct dentry *dentry,
         int err = 0;
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         LL_GET_INTENT(dentry, it);
 
         inode = ll_create_node(dir, dentry->d_name.name, dentry->d_name.len,
@@ -815,6 +835,7 @@ static int ll_link2(struct inode *src, struct inode *dir,
 
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         err = mdc_link(&sbi->ll_mdc_conn, src, dir, name, len, &request);
         ptlrpc_req_finished(request);
 
@@ -828,12 +849,17 @@ static int ll_link(struct dentry *old_dentry, struct inode * dir,
         struct inode *inode = old_dentry->d_inode;
         int rc;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         LL_GET_INTENT(dentry, it);
 
         if (it && it->it_disposition) {
                 if (it->it_status)
                         RETURN(it->it_status);
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+                inode->i_ctime.tv_sec = CURRENT_TIME.tv_sec;
+#else
                 inode->i_ctime = CURRENT_TIME;
+#endif
                 ext2_inc_count(inode);
                 atomic_inc(&inode->i_count);
                 d_instantiate(dentry, inode);
@@ -852,7 +878,11 @@ static int ll_link(struct dentry *old_dentry, struct inode * dir,
         if (rc)
                 RETURN(rc);
 
-        inode->i_ctime = CURRENT_TIME;
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+        inode->i_ctime.tv_sec = CURRENT_TIME.tv_sec; /* seconds only */
+#else
+        inode->i_ctime = CURRENT_TIME;
+#endif
         ext2_inc_count(inode);
         atomic_inc(&inode->i_count);
 
@@ -862,11 +892,16 @@ static int ll_link(struct dentry *old_dentry, struct inode * dir,
 static int ll_mkdir2(struct inode *dir, const char *name, int len, int mode)
 {
         struct ptlrpc_request *request = NULL;
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+        time_t time = CURRENT_TIME.tv_sec;
+#else
         time_t time = CURRENT_TIME;
+#endif
         struct ll_sb_info *sbi = ll_i2sbi(dir);
         int err = -EMLINK;
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         if (dir->i_nlink >= EXT2_LINK_MAX)
                 RETURN(err);
 
@@ -886,6 +921,7 @@ static int ll_mkdir(struct inode *dir, struct dentry *dentry, int mode)
         int err = -EMLINK;
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         LL_GET_INTENT(dentry, it);
 
         if (dir->i_nlink >= EXT2_LINK_MAX)
@@ -932,6 +968,7 @@ static int ll_rmdir2(struct inode *dir, const char *name, int len)
         int rc;
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         rc = ll_mdc_unlink(dir, NULL, S_IFDIR, name, len);
         RETURN(rc);
 }
@@ -941,6 +978,7 @@ static int ll_unlink2(struct inode *dir, const char *name, int len)
         int rc;
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         rc = ll_mdc_unlink(dir, NULL, S_IFREG, name, len);
         RETURN(rc);
 }
@@ -992,6 +1030,7 @@ static int ll_unlink(struct inode *dir, struct dentry *dentry)
         struct lookup_intent * it;
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         LL_GET_INTENT(dentry, it);
 
         RETURN(ll_common_unlink(dir, dentry, it, S_IFREG));
@@ -1003,7 +1042,8 @@ static int ll_rmdir(struct inode *dir, struct dentry *dentry)
         struct lookup_intent *it;
         int rc;
         ENTRY;
-
+        
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         LL_GET_INTENT(dentry, it);
 
         if ((!it || !it->it_disposition) && !ext2_empty_dir(inode))
@@ -1027,7 +1067,8 @@ static int ll_rename2(struct inode *src, struct inode *tgt,
         struct ll_sb_info *sbi = ll_i2sbi(src);
         int err;
         ENTRY;
-
+        
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         err = mdc_rename(&sbi->ll_mdc_conn, src, tgt,
                          oldname, oldlen, newname, newlen, &request);
         ptlrpc_req_finished(request);
@@ -1049,6 +1090,7 @@ static int ll_rename(struct inode * old_dir, struct dentry * old_dentry,
         struct page * old_page;
         int err;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         LL_GET_INTENT(new_dentry, it);
 
         if (it && it->it_disposition) {
@@ -1152,5 +1194,8 @@ struct inode_operations ll_dir_inode_operations = {
         rename:          ll_rename,
         rename2:         ll_rename2,
         setattr:         ll_setattr,
+        setattr_raw:     ll_setattr_raw,
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         revalidate:      ll_inode_revalidate,
+#endif
 };
index ab3ff86..6818ace 100644 (file)
@@ -26,7 +26,6 @@
 #include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/stat.h>
-#include <linux/iobuf.h>
 #include <linux/errno.h>
 #include <linux/smp_lock.h>
 #include <linux/unistd.h>
 #include <asm/uaccess.h>
 
 #include <linux/fs.h>
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+#include <linux/buffer_head.h>
+#else
+#include <linux/iobuf.h>
+#endif
 #include <linux/stat.h>
 #include <asm/uaccess.h>
 #include <asm/segment.h>
@@ -146,16 +150,31 @@ static int ll_readpage(struct file *file, struct page *page)
                 LBUG();
 
         if (inode->i_size <= offset) {
+                CERROR("reading beyond EOF\n");
                 memset(kmap(page), 0, PAGE_SIZE);
                 kunmap(page);
                 GOTO(readpage_out, rc);
         }
 
+        /* XXX Workaround for BA OSTs returning short reads at EOF.  The linux
+         *     OST will return the full page, zero-filled at the end, which
+         *     will just overwrite the data we set here.
+         *     Bug 593 relates to fixing this properly.
+         */
+        if (inode->i_size < offset + PAGE_SIZE) {
+                int count = inode->i_size - offset;
+                void *addr = kmap(page);
+                //POISON(addr, 0x7c, count);
+                memset(addr + count, 0, PAGE_SIZE - count);
+                kunmap(page);
+        }
+
         if (PageUptodate(page)) {
                 CERROR("Explain this please?\n");
                 GOTO(readpage_out, rc);
         }
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         rc = ll_brw(OBD_BRW_READ, inode, page, 0);
         EXIT;
 
@@ -184,6 +203,7 @@ void ll_truncate(struct inode *inode)
         oa.o_mode = inode->i_mode;
         oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE | OBD_MD_FLTYPE;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         CDEBUG(D_INFO, "calling punch for "LPX64" (all bytes after %Lu)\n",
                oa.o_id, inode->i_size);
 
@@ -209,7 +229,7 @@ void ll_truncate(struct inode *inode)
         return;
 } /* ll_truncate */
 
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+//#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 
 static int ll_prepare_write(struct file *file, struct page *page, unsigned from,
                             unsigned to)
@@ -221,16 +241,18 @@ static int ll_prepare_write(struct file *file, struct page *page, unsigned from,
         ENTRY;
 
         addr = kmap(page);
-        if (!PageLocked(page))
-                LBUG();
+        LASSERT(PageLocked(page));
+
+        if (PageUptodate(page))
+                RETURN(0);
 
-        if (Page_Uptodate(page))
-                GOTO(prepare_done, rc);
+        //POISON(addr + from, 0xca, to - from);
 
         /* We're completely overwriting an existing page, so _don't_ set it up
          * to date until commit_write */
         if (from == 0 && to == PAGE_SIZE)
                 RETURN(0);
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
 
         /* If we are writing to a new page, no need to read old data.  If we
          * haven't already gotten the file size in ll_file_write() since
@@ -242,7 +264,7 @@ static int ll_prepare_write(struct file *file, struct page *page, unsigned from,
                         struct ll_file_data *fd = file->private_data;
                         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
 
-                        rc = ll_file_size(inode, lsm, &fd->fd_osthandle);
+                        rc = ll_file_size(inode, lsm, fd->fd_ostdata);
                         if (rc)
                                 GOTO(prepare_done, rc);
                 }
@@ -260,7 +282,7 @@ static int ll_prepare_write(struct file *file, struct page *page, unsigned from,
                 SetPageUptodate(page);
         else
                 kunmap (page);
-        
+
         return rc;
 }
 
@@ -274,11 +296,14 @@ static int ll_prepare_write(struct file *file, struct page *page, unsigned from,
  * Returns the page unlocked, but with a reference.
  */
 static int ll_writepage(struct page *page) {
-        struct inode *inode = page->mapping->host; int err; ENTRY;
+        struct inode *inode = page->mapping->host;
+        int err;
+        ENTRY;
 
         LASSERT(PageLocked(page));
 
         /* XXX need to make sure we have LDLM lock on this page */
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         err = ll_brw(OBD_BRW_WRITE, inode, page, 1);
         if (err)
                 CERROR("ll_brw failure %d\n", err);
@@ -319,6 +344,7 @@ static int ll_commit_write(struct file *file, struct page *page,
         if (!PageLocked(page))
                 LBUG();
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         CDEBUG(D_INODE, "commit_page writing (off "LPD64"), count %d\n",
                pg.off, pg.count);
 
@@ -342,21 +368,23 @@ static int ll_commit_write(struct file *file, struct page *page,
         RETURN(rc);
 } /* ll_commit_write */
 
-
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 static int ll_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf,
                         unsigned long blocknr, int blocksize)
 {
-        obd_count bufs_per_obdo = iobuf->nr_pages;
         struct ll_inode_info *lli = ll_i2info(inode);
         struct lov_stripe_md *lsm = lli->lli_smd;
         struct brw_page *pga;
         struct obd_brw_set *set;
-        int i, rc = 0;
+        loff_t offset;
+        int length, i, flags, rc = 0;
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         if (!lsm || !lsm->lsm_object_id)
                 RETURN(-ENOMEM);
 
+        /* XXX Keep here until we find ia64 problem, it crashes otherwise */
         if (blocksize != PAGE_SIZE) {
                 CERROR("direct_IO blocksize != PAGE_SIZE\n");
                 RETURN(-EINVAL);
@@ -366,39 +394,56 @@ static int ll_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf,
         if (set == NULL)
                 RETURN(-ENOMEM);
 
-        OBD_ALLOC(pga, sizeof(*pga) * bufs_per_obdo);
+        OBD_ALLOC(pga, sizeof(*pga) * iobuf->nr_pages);
         if (!pga) {
                 obd_brw_set_free(set);
                 RETURN(-ENOMEM);
         }
 
-        /* NB: we can't use iobuf->maplist[i]->index for the offset
-         * instead of "blocknr" because ->index contains garbage.
-         */
-        for (i = 0; i < bufs_per_obdo; i++, blocknr++) {
+        CDEBUG(D_PAGE, "blocksize %u, blocknr %lu, iobuf %p: nr_pages %u, "
+                       "array_len %u, offset %u, length %u\n",
+               blocksize, blocknr, iobuf, iobuf->nr_pages,
+               iobuf->array_len, iobuf->offset, iobuf->length);
+
+        flags = (rw == WRITE ? OBD_BRW_CREATE : 0) /* | OBD_BRW_DIRECTIO */;
+        offset = (blocknr << inode->i_blkbits) /* + iobuf->offset? */;
+        length = iobuf->length;
+
+        for (i = 0, length = iobuf->length; length > 0;
+             length -= pga[i].count, offset += pga[i].count, i++) { /*i last!*/
                 pga[i].pg = iobuf->maplist[i];
-                pga[i].count = PAGE_SIZE;
-                pga[i].off = (obd_off)blocknr << PAGE_SHIFT;
-                pga[i].flag = OBD_BRW_CREATE;
+                pga[i].off = offset;
+                /* To the end of the page, or the length, whatever is less */
+                pga[i].count = min_t(int, PAGE_SIZE - (offset & ~PAGE_MASK),
+                                     length);
+                pga[i].flag = flags;
+                CDEBUG(D_PAGE, "page %d (%p), offset "LPU64", count %u\n",
+                       i, pga[i].pg, pga[i].off, pga[i].count);
+                if (rw == READ) {
+                        //POISON(kmap(iobuf->maplist[i]), 0xc5, PAGE_SIZE);
+                        //kunmap(iobuf->maplist[i]);
+                }
         }
 
         set->brw_callback = ll_brw_sync_wait;
         rc = obd_brw(rw == WRITE ? OBD_BRW_WRITE : OBD_BRW_READ,
-                     ll_i2obdconn(inode), lsm, bufs_per_obdo, pga, set, NULL);
-        if (rc)
-                CERROR("error from obd_brw: rc = %d\n", rc);
-        else {
+                     ll_i2obdconn(inode), lsm, iobuf->nr_pages, pga, set, NULL);
+        if (rc) {
+                CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR,
+                       "error from obd_brw: rc = %d\n", rc);
+        } else {
                 rc = ll_brw_sync_wait(set, CB_PHASE_START);
                 if (rc)
                         CERROR("error from callback: rc = %d\n", rc);
         }
         obd_brw_set_free(set);
         if (rc == 0)
-                rc = bufs_per_obdo * PAGE_SIZE;
+                rc = iobuf->length;
 
-        OBD_FREE(pga, sizeof(*pga) * bufs_per_obdo);
+        OBD_FREE(pga, sizeof(*pga) * iobuf->nr_pages);
         RETURN(rc);
 }
+#endif
 
 int ll_flush_inode_pages(struct inode * inode)
 {
@@ -410,9 +455,11 @@ int ll_flush_inode_pages(struct inode * inode)
 
         ENTRY;
 
+#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
         spin_lock(&pagecache_lock);
 
         spin_unlock(&pagecache_lock);
+#endif
 
 
         OBD_ALLOC(count, sizeof(*count) * bufs_per_obdo);
@@ -441,17 +488,18 @@ int ll_flush_inode_pages(struct inode * inode)
         RETURN(err);
 }
 
-#endif
+//#endif
 
 
 struct address_space_operations ll_aops = {
         readpage: ll_readpage,
 #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
         direct_IO: ll_direct_IO,
+#endif
         writepage: ll_writepage,
         sync_page: block_sync_page,
         prepare_write: ll_prepare_write,
         commit_write: ll_commit_write,
         bmap: NULL
-#endif
+//#endif
 };
index 8df74f1..613c42f 100644 (file)
@@ -40,7 +40,7 @@ extern struct address_space_operations ll_dir_aops;
 struct super_operations ll_super_operations;
 
 /* /proc/lustre/llite root that tracks llite mount points */
-struct proc_dir_entry *proc_lustre_fs_root;
+struct proc_dir_entry *proc_lustre_fs_root = NULL;
 /* lproc_llite.c */
 extern int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
                                        struct super_block *sb,
@@ -131,6 +131,7 @@ static struct super_block *ll_read_super(struct super_block *sb,
 
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         OBD_ALLOC(sbi, sizeof(*sbi));
         if (!sbi)
                 RETURN(NULL);
@@ -271,6 +272,7 @@ static void ll_put_super(struct super_block *sb)
         struct ll_fid rootfid;
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         list_del(&sbi->ll_conn_chain);
         ll_commitcbd_cleanup(sbi);
         obd_disconnect(&sbi->ll_osc_conn);
@@ -309,6 +311,7 @@ static void ll_clear_inode(struct inode *inode)
         int rc;
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         rc = mdc_cancel_unused(&sbi->ll_mdc_conn, inode, LDLM_FL_NO_CALLBACK);
         if (rc < 0) {
                 CERROR("mdc_cancel_unused: %d\n", rc);
@@ -343,6 +346,7 @@ static void ll_clear_inode(struct inode *inode)
 static void ll_delete_inode(struct inode *inode)
 {
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         if (S_ISREG(inode->i_mode)) {
                 int err;
                 struct obdo *oa;
@@ -454,10 +458,59 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc)
         RETURN(err);
 }
 
+/* Set attrs: size via vmtruncate(), rest via MDS, explicit mtime also on OST. */
+int ll_setattr_raw(struct inode *inode, struct iattr *attr)
+{
+        struct ptlrpc_request *request = NULL;
+        struct ll_sb_info *sbi = ll_i2sbi(inode);
+        int err = 0;
+        ENTRY;
+
+        if ((attr->ia_valid & ATTR_SIZE)) {
+                err = vmtruncate(inode, attr->ia_size);
+                if (err)
+                        RETURN(err);
+        }
+
+        /* Don't send size changes to MDS to avoid "fast EA" problems, and
+         * also avoid a pointless RPC (we get file size from OST anyways).
+         */
+        attr->ia_valid &= ~ATTR_SIZE;
+        if (!attr->ia_valid)
+                RETURN(0);
+
+        err = mdc_setattr(&sbi->ll_mdc_conn, inode, attr, NULL, 0, &request);
+        if (err)
+                CERROR("mdc_setattr fails: err = %d\n", err);
+
+        ptlrpc_req_finished(request);
+
+        if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_MTIME_SET) {
+                struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
+                struct obdo oa;
+                int err2;       /* OST result; err keeps the MDS result */
+
+                CDEBUG(D_INODE, "set mtime on OST inode %lu to %lu\n",
+                       inode->i_ino, attr->ia_mtime);
+                oa.o_id = lsm->lsm_object_id;
+                oa.o_mode = S_IFREG;
+                oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMTIME;
+                oa.o_mtime = attr->ia_mtime;
+                err2 = obd_setattr(&sbi->ll_osc_conn, &oa, lsm, NULL);
+                if (err2) {
+                        CERROR("obd_setattr fails: rc=%d\n", err2);
+                        if (!err)
+                                err = err2;
+                }
+        }
+        RETURN(err);
+}
+
 int ll_setattr(struct dentry *de, struct iattr *attr)
 {
         int rc = inode_change_ok(de->d_inode, attr);
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         if (rc)
                 return rc;
 
@@ -471,6 +524,7 @@ static int ll_statfs(struct super_block *sb, struct statfs *sfs)
         int rc;
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         memset(sfs, 0, sizeof(*sfs));
         rc = obd_statfs(&sbi->ll_mdc_conn, &osfs);
         statfs_unpack(sfs, &osfs);
@@ -557,8 +611,8 @@ static void ll_read_inode2(struct inode *inode, void *opaque)
         struct ll_inode_info *lli = ll_i2info(inode);
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         sema_init(&lli->lli_open_sem, 1);
-        atomic_set(&lli->lli_open_count, 0);
 
         LASSERT(!lli->lli_smd);
 
@@ -620,6 +674,7 @@ void ll_umount_begin(struct super_block *sb)
         struct list_head *ctmp;
 
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
 
         list_for_each(ctmp, &sbi->ll_conn_chain) {
                 struct ptlrpc_connection *conn;
index fad4a4d..f77fdea 100644 (file)
 #include <linux/lprocfs_status.h>
 
 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+#include <asm/statfs.h>
 kmem_cache_t *ll_file_data_slab;
 extern struct address_space_operations ll_aops;
 extern struct address_space_operations ll_dir_aops;
 struct super_operations ll_super_operations;
 
 /* /proc/lustre/llite root that tracks llite mount points */
-struct proc_dir_entry *proc_lustre_fs_root;
+struct proc_dir_entry *proc_lustre_fs_root = NULL;
 /* lproc_llite.c */
 extern int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
                                        struct super_block *sb,
@@ -53,7 +54,7 @@ extern int ll_commitcbd_setup(struct ll_sb_info *);
 extern int ll_commitcbd_cleanup(struct ll_sb_info *);
 int ll_read_inode2(struct inode *inode, void *opaque);
 
-extern int ll_proc_namespace(struct super_block* sb, char* osc, char* mdc)
+extern int ll_proc_namespace(struct super_block* sb, char* osc, char* mdc);
 
 static char *ll_read_opt(const char *opt, char *data)
 {
@@ -131,6 +132,7 @@ static int ll_fill_super(struct super_block *sb, void *data, int silent)
         struct ptlrpc_connection *mdc_conn;
         struct ll_read_inode2_cookie lic;
         class_uuid_t uuid;
+        struct obd_uuid param_uuid;
 
         ENTRY;
 
@@ -139,8 +141,9 @@ static int ll_fill_super(struct super_block *sb, void *data, int silent)
                 RETURN(-ENOMEM);
 
         INIT_LIST_HEAD(&sbi->ll_conn_chain);
+        INIT_LIST_HEAD(&sbi->ll_orphan_dentry_list);
         generate_random_uuid(uuid);
-        class_uuid_unparse(uuid, sbi->ll_sb_uuid);
+        class_uuid_unparse(uuid, &sbi->ll_sb_uuid);
 
         sb->s_fs_info = sbi;
 
@@ -156,13 +159,14 @@ static int ll_fill_super(struct super_block *sb, void *data, int silent)
                 GOTO(out_free, sb = NULL);
         }
 
-        obd = class_uuid2obd(mdc);
+        strncpy(param_uuid.uuid, mdc, sizeof(param_uuid.uuid));
+        obd = class_uuid2obd(&param_uuid);
         if (!obd) {
                 CERROR("MDC %s: not setup or attached\n", mdc);
                 GOTO(out_free, sb = NULL);
         }
 
-        err = obd_connect(&sbi->ll_mdc_conn, obd, sbi->ll_sb_uuid,
+        err = obd_connect(&sbi->ll_mdc_conn, obd, &sbi->ll_sb_uuid,
                           ptlrpc_recovd, ll_recover);
         if (err) {
                 CERROR("cannot connect to %s: rc = %d\n", mdc, err);
@@ -178,7 +182,7 @@ static int ll_fill_super(struct super_block *sb, void *data, int silent)
                 GOTO(out_mdc, sb = NULL);
         }
 
-        err = obd_connect(&sbi->ll_osc_conn, obd, sbi->ll_sb_uuid,
+        err = obd_connect(&sbi->ll_osc_conn, obd, &sbi->ll_sb_uuid,
                           ptlrpc_recovd, ll_recover);
         if (err) {
                 CERROR("cannot connect to %s: rc = %d\n", osc, err);
@@ -194,7 +198,7 @@ static int ll_fill_super(struct super_block *sb, void *data, int silent)
         sbi->ll_rootino = rootfid.id;
 
         memset(&osfs, 0, sizeof(osfs));
-        err = mdc_statfs(&sbi->ll_mdc_conn, &osfs);
+        err = obd_statfs(&sbi->ll_mdc_conn, &osfs);
         sb->s_blocksize = osfs.os_bsize;
         sb->s_blocksize_bits = log2(osfs.os_bsize);
         sb->s_magic = LL_SUPER_MAGIC;
@@ -228,6 +232,7 @@ static int ll_fill_super(struct super_block *sb, void *data, int silent)
 
         if (root) {
                 sb->s_root = d_alloc_root(root);
+                root->i_state &= ~(I_LOCK | I_NEW);
         } else {
                 CERROR("lustre_lite: bad iget4 for root\n");
                 GOTO(out_cdb, sb = NULL);
@@ -273,6 +278,7 @@ struct super_block * ll_get_sb(struct file_system_type *fs_type,
 static void ll_put_super(struct super_block *sb)
 {
         struct ll_sb_info *sbi = ll_s2sbi(sb);
+        struct list_head *tmp, *next;
         struct ll_fid rootfid;
         ENTRY;
 
@@ -294,6 +300,14 @@ static void ll_put_super(struct super_block *sb)
         }
 
         obd_disconnect(&sbi->ll_mdc_conn);
+
+        spin_lock(&dcache_lock);
+        list_for_each_safe(tmp, next, &sbi->ll_orphan_dentry_list){
+                struct dentry *dentry = list_entry(tmp, struct dentry, d_hash);
+                shrink_dcache_parent(dentry);
+        }
+        spin_unlock(&dcache_lock);
+
         OBD_FREE(sbi, sizeof(*sbi));
 
         EXIT;
@@ -338,6 +352,7 @@ static void ll_clear_inode(struct inode *inode)
         EXIT;
 }
 
+#if 0
 static void ll_delete_inode(struct inode *inode)
 {
         ENTRY;
@@ -374,6 +389,7 @@ out:
         clear_inode(inode);
         EXIT;
 }
+#endif
 
 /* like inode_setattr, but doesn't mark the inode dirty */
 static int ll_attr2inode(struct inode * inode, struct iattr * attr, int trunc)
@@ -423,7 +439,8 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc)
          */
         attr->ia_valid &= ~ATTR_SIZE;
         if (attr->ia_valid) {
-                err = mdc_setattr(&sbi->ll_mdc_conn, inode, attr, &request);
+                err = mdc_setattr(&sbi->ll_mdc_conn, inode, attr, NULL, 0,
+                                  &request);
                 if (err)
                         CERROR("mdc_setattr fails: err = %d\n", err);
 
@@ -431,18 +448,18 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc)
                 if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_MTIME_SET) {
                         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
                         struct obdo oa;
-                        int err;
+                        int err2;
 
                         CDEBUG(D_ERROR, "setting mtime on OST\n");
                         oa.o_id = lsm->lsm_object_id;
                         oa.o_mode = S_IFREG;
                         oa.o_valid = OBD_MD_FLID |OBD_MD_FLTYPE |OBD_MD_FLMTIME;
-                        oa.o_mtime = attr->ia_mtime;
-                        err = obd_setattr(&sbi->ll_osc_conn, &oa, lsm);
-                        if (err) {
+                        oa.o_mtime = attr->ia_mtime.tv_sec;
+                        err2 = obd_setattr(&sbi->ll_osc_conn, &oa, lsm, NULL);
+                        if (err2) {
                                 CERROR("obd_setattr fails: rc=%d\n", err);
-                                if (!rc)
-                                        rc = err;
+                                if (!err)
+                                        err = err2;
                         }
                 }
         }
@@ -519,11 +536,11 @@ void ll_update_inode(struct inode *inode, struct mds_body *body,
         if (body->valid & OBD_MD_FLID)
                 inode->i_ino = body->ino;
         if (body->valid & OBD_MD_FLATIME)
-                inode->i_atime = body->atime;
+                inode->i_atime.tv_sec = body->atime;
         if (body->valid & OBD_MD_FLMTIME)
-                inode->i_mtime = body->mtime;
+                inode->i_mtime.tv_sec = body->mtime;
         if (body->valid & OBD_MD_FLCTIME)
-                inode->i_ctime = body->ctime;
+                inode->i_ctime.tv_sec = body->ctime;
         if (body->valid & OBD_MD_FLMODE)
                 inode->i_mode = (inode->i_mode & S_IFMT)|(body->mode & ~S_IFMT);
         if (body->valid & OBD_MD_FLTYPE)
@@ -624,7 +641,7 @@ void ll_umount_begin(struct super_block *sb)
 
                 spin_lock(&conn->c_lock);
                 conn->c_flags |= CONN_INVALID;
-                invalidate_request_list(&conn->c_sending_head);
+                /*invalidate_request_list(&conn->c_sending_head);*/
                 invalidate_request_list(&conn->c_delayed_head);
                 spin_unlock(&conn->c_lock);
         }
@@ -687,17 +704,18 @@ struct super_operations ll_super_operations =
         alloc_inode: ll_alloc_inode,
         destroy_inode: ll_destroy_inode,
         clear_inode: ll_clear_inode,
-        delete_inode: ll_delete_inode,
+//        delete_inode: ll_delete_inode,
         put_super: ll_put_super,
         statfs: ll_statfs,
         umount_begin: ll_umount_begin
 };
 
+
 struct file_system_type lustre_lite_fs_type = {
         .owner  = THIS_MODULE,
         .name =   "lustre_lite",
         .get_sb = ll_get_sb,
-        .kill_sb = kill_litter_super,
+        .kill_sb = kill_anon_super,
 };
 
 static int __init init_lustre_lite(void)
index 3c9d646..6ebe7de 100644 (file)
 #include <linux/mm.h>
 #include <linux/stat.h>
 #include <linux/smp_lock.h>
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <asm/statfs.h>
+#endif
 #define DEBUG_SUBSYSTEM S_LLITE
 
 #include <linux/lustre_lite.h>
@@ -69,6 +73,7 @@ static int ll_readlink(struct dentry *dentry, char *buffer, int buflen)
         int rc;
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         /* on symlinks lli_open_sem protects lli_symlink_name allocation/data */
         down(&lli->lli_open_sem);
         rc = ll_readlink_internal(inode, &request, &symname);
@@ -83,6 +88,7 @@ static int ll_readlink(struct dentry *dentry, char *buffer, int buflen)
         RETURN(rc);
 }
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 static int ll_follow_link(struct dentry *dentry, struct nameidata *nd,
                           struct lookup_intent *it)
 {
@@ -93,6 +99,7 @@ static int ll_follow_link(struct dentry *dentry, struct nameidata *nd,
         char *symname;
         ENTRY;
 
+        CDEBUG(D_VFSTRACE, "VFS Op\n");
         if (it != NULL) {
                 op = it->it_op;
                 mode = it->it_mode;
@@ -117,12 +124,47 @@ static int ll_follow_link(struct dentry *dentry, struct nameidata *nd,
 
         RETURN(rc);
 }
+#else
+/* 2.5 follow_link: read the link target and walk it, restoring nd's intent. */
+static int ll_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+        struct inode *inode = dentry->d_inode;
+        struct ll_inode_info *lli = ll_i2info(inode);
+        struct ptlrpc_request *request = NULL;
+        int op = nd->it.it_op, mode = nd->it.it_mode, rc;
+        char *symname;
+        ENTRY;
+
+        /* it_op/it_mode were saved above; they are put back after release */
+        ll_intent_release(dentry, &nd->it);
+
+        /* on symlinks lli_open_sem protects lli_symlink_name */
+        down(&lli->lli_open_sem);
+
+        rc = ll_readlink_internal(inode, &request, &symname);
+        if (rc)
+                GOTO(out, rc);
+
+        nd->it.it_op = op;
+        nd->it.it_mode = mode;
+
+        rc = vfs_follow_link(nd, symname);
+ out:
+        up(&lli->lli_open_sem);
+        ptlrpc_req_finished(request);
+
+        RETURN(rc);
+}
+#endif
 
 extern int ll_inode_revalidate(struct dentry *dentry);
 extern int ll_setattr(struct dentry *de, struct iattr *attr);
 struct inode_operations ll_fast_symlink_inode_operations = {
         readlink:       ll_readlink,
         setattr:        ll_setattr,
+        setattr_raw:    ll_setattr_raw,
         follow_link2:   ll_follow_link,
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         revalidate:     ll_inode_revalidate
+#endif
 };
index ee4ac75..b626046 100644 (file)
@@ -9,7 +9,10 @@
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/sysctl.h>
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 #include <linux/swapctl.h>
+#endif
 #include <linux/proc_fs.h>
 #include <linux/slab.h>
 #include <linux/stat.h>
index 2070b01..6b647b4 100644 (file)
@@ -5,12 +5,18 @@
 
 DEFS=
 
+if LIBLUSTRE
+lib_LIBRARIES = liblov.a
+LINX=client.c
+liblov_a_SOURCES = lov_obd.c lov_pack.c $(LINX)
+else
 MODULE = lov
 modulefs_DATA = lov.o
 EXTRA_PROGRAMS = lov
 LINX=client.c
-
 lov_SOURCES = lov_obd.c lov_pack.c lproc_lov.c $(LINX)
+endif
+
 
 client.c: 
        test -e client.c || ln -sf $(top_srcdir)/lib/client.c
index 3e6b2d2..0e7ad82 100644 (file)
 
 #define EXPORT_SYMTAB
 #define DEBUG_SUBSYSTEM S_LOV
-
+#ifdef __KERNEL__
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/init.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <asm/div64.h>
+#else
+#include <liblustre.h>
+#endif
+
 #include <linux/obd_support.h>
 #include <linux/lustre_lib.h>
 #include <linux/lustre_net.h>
 #include <linux/lustre_mds.h>
 #include <linux/obd_class.h>
 #include <linux/obd_lov.h>
-#include <linux/init.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-#include <asm/div64.h>
 #include <linux/lprocfs_status.h>
 
-
 static kmem_cache_t *lov_file_cache;
 
 struct lov_file_handles {
         struct list_head lfh_list;
         __u64 lfh_cookie;
         int lfh_count;
-        struct lustre_handle *lfh_handles;
+        char *lfh_data; /* an array of opaque data saved on behalf of
+                        * each osc, FD_OSTDATA_SIZE bytes for each */
 };
 
 struct lov_lock_handles {
@@ -87,10 +91,10 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
         struct lov_obd *lov = &obd->u.lov;
         struct client_obd *mdc = &lov->mdcobd->u.cli;
         struct lov_desc *desc = &lov->desc;
+        struct lov_tgt_desc *tgts;
         struct obd_export *exp;
         struct lustre_handle mdc_conn;
         struct obd_uuid lov_mds_uuid = {"LOV_MDS_UUID"};
-        struct obd_uuid uuid;
         char *tmp;
         int rc, rc2, i;
         ENTRY;
@@ -138,7 +142,8 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
         memcpy(desc, lustre_msg_buf(req->rq_repmsg, 0), sizeof(*desc));
         lov_unpackdesc(desc);
 
-        if (req->rq_repmsg->buflens[1] < sizeof(uuid.uuid)*desc->ld_tgt_count){
+        if (req->rq_repmsg->buflens[1] <
+            sizeof(desc->ld_uuid.uuid) * desc->ld_tgt_count){
                 CERROR("LOV desc: invalid uuid array returned\n");
                 GOTO(out_conn, rc = -EINVAL);
         }
@@ -178,44 +183,44 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
         }
 
         tmp = lustre_msg_buf(req->rq_repmsg, 1);
-        for (i = 0; i < desc->ld_tgt_count; i++) {
-                struct obd_device *tgt;
+        for (i = 0, tgts = lov->tgts; i < desc->ld_tgt_count; i++, tgts++) {
+                struct obd_uuid *uuid = &tgts->uuid;
+                struct obd_device *tgt_obd;
                 struct obd_uuid lov_osc_uuid = { "LOV_OSC_UUID" };
 
-                strncpy(uuid.uuid, tmp, sizeof(uuid.uuid));
-                memcpy(&lov->tgts[i].uuid, &uuid, sizeof(uuid));
-                tgt = client_tgtuuid2obd(&uuid);
-                tmp += sizeof(uuid.uuid);
+                obd_str2uuid(uuid, tmp);
+                tgt_obd = client_tgtuuid2obd(uuid);
+                tmp += sizeof(uuid->uuid);
 
-                if (!tgt) {
-                        CERROR("Target %s not attached\n", uuid.uuid);
+                if (!tgt_obd) {
+                        CERROR("Target %s not attached\n", uuid->uuid);
                         GOTO(out_disc, rc = -EINVAL);
                 }
 
-                if (!(tgt->obd_flags & OBD_SET_UP)) {
-                        CERROR("Target %s not set up\n", uuid.uuid);
+                if (!(tgt_obd->obd_flags & OBD_SET_UP)) {
+                        CERROR("Target %s not set up\n", uuid->uuid);
                         GOTO(out_disc, rc = -EINVAL);
                 }
 
-                rc = obd_connect(&lov->tgts[i].conn, tgt, &lov_osc_uuid, recovd,
+                rc = obd_connect(&tgts->conn, tgt_obd, &lov_osc_uuid, recovd,
                                  recover);
 
                 if (rc) {
-                        CERROR("Target %s connect error %d\n", uuid.uuid,
-                               rc);
+                        CERROR("Target %s connect error %d\n", uuid->uuid, rc);
                         GOTO(out_disc, rc);
                 }
 
-                rc = obd_iocontrol(IOC_OSC_REGISTER_LOV, &lov->tgts[i].conn,
-                                    sizeof(struct obd_device *), obd, NULL);
+                rc = obd_iocontrol(IOC_OSC_REGISTER_LOV, &tgts->conn,
+                                   sizeof(struct obd_device *), obd, NULL);
                 if (rc) {
                         CERROR("Target %s REGISTER_LOV error %d\n",
-                               uuid.uuid, rc);
+                               uuid->uuid, rc);
+                        obd_disconnect(&tgts->conn);
                         GOTO(out_disc, rc);
                 }
 
                 desc->ld_active_tgt_count++;
-                lov->tgts[i].active = 1;
+                tgts->active = 1;
         }
 
         mdc->cl_max_mds_easize = obd_size_wiremd(conn, NULL);
@@ -225,12 +230,13 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
         RETURN(rc);
 
  out_disc:
-        i--; /* skip failed-connect OSC */
         while (i-- > 0) {
-                desc->ld_active_tgt_count--;
-                lov->tgts[i].active = 0;
-                memcpy(&uuid, &lov->tgts[i].uuid, sizeof(uuid));
-                rc2 = obd_disconnect(&lov->tgts[i].conn);
+                struct obd_uuid uuid;
+                --tgts;
+                --desc->ld_active_tgt_count;
+                tgts->active = 0;
+                obd_str2uuid(&uuid, tgts->uuid.uuid);
+                rc2 = obd_disconnect(&tgts->conn);
                 if (rc2)
                         CERROR("error: LOV target %s disconnect on OST idx %d: "
                                "rc = %d\n", uuid.uuid, i, rc2);
@@ -284,9 +290,8 @@ static int lov_disconnect(struct lustre_handle *conn)
                 CERROR("discarding open LOV handle %p:"LPX64"\n",
                        lfh, lfh->lfh_cookie);
                 list_del(&lfh->lfh_list);
-                OBD_FREE(lfh->lfh_handles,
-                         lfh->lfh_count * sizeof(*lfh->lfh_handles));
-                kmem_cache_free(lov_file_cache, lfh);
+                OBD_FREE(lfh->lfh_data, lfh->lfh_count * FD_OSTDATA_SIZE);
+                PORTAL_SLAB_FREE(lfh, lov_file_cache, sizeof(*lfh));
         }
         spin_unlock(&exp->exp_lov_data.led_lock);
 
@@ -538,6 +543,8 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa,
                 if (!lsm_new)
                         GOTO(out_cleanup, rc = -ENOMEM);
                 memcpy(lsm_new, lsm, size);
+                lsm_new->lsm_stripe_count = obj_alloc;
+
                 /* XXX LOV STACKING call into osc for sizes */
                 OBD_FREE(lsm, lov_stripe_md_size(lsm->lsm_stripe_count));
                 lsm = lsm_new;
@@ -609,8 +616,9 @@ static int lov_destroy(struct lustre_handle *conn, struct obdo *oa,
                 memcpy(&tmp, oa, sizeof(tmp));
                 tmp.o_id = loi->loi_id;
                 if (lfh)
-                        memcpy(obdo_handle(&tmp), &lfh->lfh_handles[i],
-                               sizeof(lfh->lfh_handles[i]));
+                        memcpy(obdo_handle(&tmp),
+                               lfh->lfh_data + i * FD_OSTDATA_SIZE,
+                               FD_OSTDATA_SIZE);
                 else
                         tmp.o_valid &= ~OBD_MD_FLHANDLE;
                 err = obd_destroy(&lov->tgts[loi->loi_ost_idx].conn, &tmp,
@@ -722,8 +730,9 @@ static int lov_getattr(struct lustre_handle *conn, struct obdo *oa,
                 memcpy(&tmp, oa, sizeof(tmp));
                 tmp.o_id = loi->loi_id;
                 if (lfh)
-                        memcpy(obdo_handle(&tmp), &lfh->lfh_handles[i],
-                               sizeof(lfh->lfh_handles[i]));
+                        memcpy(obdo_handle(&tmp),
+                               lfh->lfh_data + i * FD_OSTDATA_SIZE,
+                               FD_OSTDATA_SIZE);
                 else
                         tmp.o_valid &= ~OBD_MD_FLHANDLE;
 
@@ -794,8 +803,9 @@ static int lov_setattr(struct lustre_handle *conn, struct obdo *oa,
                 obdo_cpy_md(tmp, oa, oa->o_valid);
 
                 if (lfh)
-                        memcpy(obdo_handle(tmp), &lfh->lfh_handles[i],
-                               sizeof(lfh->lfh_handles[i]));
+                        memcpy(obdo_handle(tmp),
+                               lfh->lfh_data + i * FD_OSTDATA_SIZE,
+                               FD_OSTDATA_SIZE);
                 else
                         tmp->o_valid &= ~OBD_MD_FLHANDLE;
 
@@ -852,12 +862,11 @@ static int lov_open(struct lustre_handle *conn, struct obdo *oa,
         if (!tmp)
                 RETURN(-ENOMEM);
 
-        lfh = kmem_cache_alloc(lov_file_cache, GFP_KERNEL);
+        PORTAL_SLAB_ALLOC(lfh, lov_file_cache, sizeof(*lfh));
         if (!lfh)
                 GOTO(out_tmp, rc = -ENOMEM);
-        OBD_ALLOC(lfh->lfh_handles,
-                  lsm->lsm_stripe_count * sizeof(*lfh->lfh_handles));
-        if (!lfh->lfh_handles)
+        OBD_ALLOC(lfh->lfh_data, lsm->lsm_stripe_count * FD_OSTDATA_SIZE);
+        if (!lfh->lfh_data)
                 GOTO(out_lfh, rc = -ENOMEM);
 
         lov = &export->exp_obd->u.lov;
@@ -876,21 +885,20 @@ static int lov_open(struct lustre_handle *conn, struct obdo *oa,
                 rc = obd_open(&lov->tgts[loi->loi_ost_idx].conn, tmp,
                               NULL, NULL);
                 if (rc) {
-                        if (lov->tgts[loi->loi_ost_idx].active) {
-                                CERROR("error: open objid "LPX64" subobj "LPX64
-                                       " on OST idx %d: rc = %d\n",
-                                       oa->o_id, lsm->lsm_oinfo[i].loi_id,
-                                       loi->loi_ost_idx, rc);
-                                goto out_handles;
-                        }
-                        continue;
+                        if (!lov->tgts[loi->loi_ost_idx].active)
+                                continue;
+                        CERROR("error: open objid "LPX64" subobj "LPX64
+                               " on OST idx %d: rc = %d\n",
+                               oa->o_id, lsm->lsm_oinfo[i].loi_id,
+                               loi->loi_ost_idx, rc);
+                        goto out_handles;
                 }
 
                 lov_merge_attrs(oa, tmp, tmp->o_valid, lsm, i, &set);
 
                 if (tmp->o_valid & OBD_MD_FLHANDLE)
-                        memcpy(&lfh->lfh_handles[i], obdo_handle(tmp),
-                               sizeof(lfh->lfh_handles[i]));
+                        memcpy(lfh->lfh_data + i * FD_OSTDATA_SIZE,
+                               obdo_handle(tmp), FD_OSTDATA_SIZE);
         }
 
         handle = obdo_handle(oa);
@@ -920,8 +928,8 @@ out_handles:
 
                 memcpy(tmp, oa, sizeof(*tmp));
                 tmp->o_id = loi->loi_id;
-                memcpy(obdo_handle(tmp), &lfh->lfh_handles[i],
-                       sizeof(lfh->lfh_handles[i]));
+                memcpy(obdo_handle(tmp), lfh->lfh_data + i * FD_OSTDATA_SIZE,
+                       FD_OSTDATA_SIZE);
 
                 err = obd_close(&lov->tgts[loi->loi_ost_idx].conn, tmp,
                                 NULL, NULL);
@@ -932,11 +940,9 @@ out_handles:
                 }
         }
 
-        OBD_FREE(lfh->lfh_handles,
-                 lsm->lsm_stripe_count * sizeof(*lfh->lfh_handles));
+        OBD_FREE(lfh->lfh_data, lsm->lsm_stripe_count * FD_OSTDATA_SIZE);
 out_lfh:
-        lfh->lfh_cookie = DEAD_HANDLE_MAGIC;
-        kmem_cache_free(lov_file_cache, lfh);
+        PORTAL_SLAB_FREE(lfh, lov_file_cache, sizeof(*lfh));
         goto out_tmp;
 }
 
@@ -981,8 +987,9 @@ static int lov_close(struct lustre_handle *conn, struct obdo *oa,
                 memcpy(&tmp, oa, sizeof(tmp));
                 tmp.o_id = loi->loi_id;
                 if (lfh)
-                        memcpy(obdo_handle(&tmp), &lfh->lfh_handles[i],
-                               sizeof(lfh->lfh_handles[i]));
+                        memcpy(obdo_handle(&tmp),
+                               lfh->lfh_data + i * FD_OSTDATA_SIZE,
+                               FD_OSTDATA_SIZE);
                 else
                         tmp.o_valid &= ~OBD_MD_FLHANDLE;
 
@@ -999,11 +1006,12 @@ static int lov_close(struct lustre_handle *conn, struct obdo *oa,
                 }
         }
         if (lfh) {
+                spin_lock(&export->exp_lov_data.led_lock);
                 list_del(&lfh->lfh_list);
-                OBD_FREE(lfh->lfh_handles,
-                         lsm->lsm_stripe_count * sizeof(*lfh->lfh_handles));
-                lfh->lfh_cookie = DEAD_HANDLE_MAGIC;
-                kmem_cache_free(lov_file_cache, lfh);
+                spin_unlock(&export->exp_lov_data.led_lock);
+
+                OBD_FREE(lfh->lfh_data, lsm->lsm_stripe_count*FD_OSTDATA_SIZE);
+                PORTAL_SLAB_FREE(lfh, lov_file_cache, sizeof(*lfh));
         }
 
         RETURN(rc);
@@ -1101,8 +1109,9 @@ static int lov_punch(struct lustre_handle *conn, struct obdo *oa,
                 memcpy(&tmp, oa, sizeof(tmp));
                 tmp.o_id = loi->loi_id;
                 if (lfh)
-                        memcpy(obdo_handle(&tmp), &lfh->lfh_handles[i],
-                               sizeof(lfh->lfh_handles[i]));
+                        memcpy(obdo_handle(&tmp),
+                               lfh->lfh_data + i * FD_OSTDATA_SIZE,
+                               FD_OSTDATA_SIZE);
                 else
                         tmp.o_valid &= ~OBD_MD_FLHANDLE;
 
@@ -1287,8 +1296,9 @@ static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm,
                 lockh->addr = (__u64)(unsigned long)lov_lockh;
                 lockh->cookie = lov_lockh->llh_cookie;
                 lov_lockhp = lov_lockh->llh_handles;
-        } else
+        } else {
                 lov_lockhp = lockh;
+        }
 
         lov = &export->exp_obd->u.lov;
         for (i = 0, loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count;
@@ -1332,7 +1342,7 @@ out_locks:
                 struct lov_stripe_md submd;
                 int err;
 
-                if (lov_lockhp->addr == 0 ||
+                if (lov_lockhp->cookie == 0 ||
                     lov->tgts[loi->loi_ost_idx].active == 0)
                         continue;
 
@@ -1354,7 +1364,6 @@ out_locks:
                           sizeof(*lov_lockh->llh_handles) *
                           lsm->lsm_stripe_count);
         }
-        lockh->addr = 0;
         lockh->cookie = DEAD_HANDLE_MAGIC;
 
         RETURN(rc);
@@ -1403,7 +1412,7 @@ static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm,
                 struct lov_stripe_md submd;
                 int err;
 
-                if (lov_lockhp->addr == 0) {
+                if (lov_lockhp->cookie == 0) {
                         CDEBUG(D_HA, "lov idx %d no lock?\n", loi->loi_ost_idx);
                         continue;
                 }
@@ -1431,7 +1440,6 @@ static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm,
                           sizeof(*lov_lockh->llh_handles) *
                           lsm->lsm_stripe_count);
         }
-        lockh->addr = 0;
         lockh->cookie = DEAD_HANDLE_MAGIC;
 
         RETURN(rc);
@@ -1647,7 +1655,7 @@ struct obd_ops lov_obd_ops = {
         o_iocontrol:   lov_iocontrol
 };
 
-static int __init lov_init(void)
+int __init lov_init(void)
 {
         struct lprocfs_static_vars lvars;
         int rc;
@@ -1673,9 +1681,11 @@ static void __exit lov_exit(void)
         class_unregister_type(OBD_LOV_DEVICENAME);
 }
 
+#ifdef __KERNEL__
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
 MODULE_DESCRIPTION("Lustre Logical Object Volume OBD driver");
 MODULE_LICENSE("GPL");
 
 module_init(lov_init);
 module_exit(lov_exit);
+#endif
index 9dc4e03..463dd72 100644 (file)
@@ -24,6 +24,9 @@
  */
 
 #define DEBUG_SUBSYSTEM S_LLITE
+#ifndef __KERNEL__
+#include <liblustre.h>
+#endif
 
 #include <linux/lustre_net.h>
 #include <linux/obd.h>
@@ -260,7 +263,9 @@ int lov_setstripe(struct lustre_handle *conn, struct lov_stripe_md **lsmp,
                        lmm.lmm_magic, LOV_MAGIC);
                 RETURN(-EINVAL);
         }
-        if (lmm.lmm_stripe_count > lov->desc.ld_tgt_count) {
+#if 0   /* the stripe_count/offset is "advisory", and it gets fixed later */
+        if (lmm.lmm_stripe_count > lov->desc.ld_tgt_count &&
+            lmm.lmm_stripe_count != 0xffffffff) {
                 CERROR("stripe count %u more than OST count %d\n",
                        lmm.lmm_stripe_count, lov->desc.ld_tgt_count);
                 RETURN(-EINVAL);
@@ -271,19 +276,20 @@ int lov_setstripe(struct lustre_handle *conn, struct lov_stripe_md **lsmp,
                        lmm.lmm_stripe_offset, lov->desc.ld_tgt_count);
                 RETURN(-EINVAL);
         }
+#endif
         if (lmm.lmm_stripe_size & (PAGE_SIZE - 1)) {
                 CERROR("stripe size %u not multiple of %lu\n",
                        lmm.lmm_stripe_size, PAGE_SIZE);
                 RETURN(-EINVAL);
         }
-        if ((__u64)lmm.lmm_stripe_size * lmm.lmm_stripe_count > ~0UL) {
+        stripe_count = lov_get_stripecnt(lov, lmm.lmm_stripe_count);
+
+        if ((__u64)lmm.lmm_stripe_size * stripe_count > ~0UL) {
                 CERROR("stripe width %ux%u > %lu on 32-bit system\n",
                        lmm.lmm_stripe_size, (int)lmm.lmm_stripe_count, ~0UL);
                 RETURN(-EINVAL);
         }
 
-        stripe_count = lov_get_stripecnt(lov, lmm.lmm_stripe_count);
-
         /* XXX LOV STACKING call into osc for sizes */
         OBD_ALLOC(lsm, lov_stripe_md_size(stripe_count));
         if (!lsm)
index 648f80b..630148a 100644 (file)
  */
 #define DEBUG_SUBSYSTEM S_CLASS
 
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <asm/statfs.h>
+#endif
 #include <linux/lprocfs_status.h>
 #include <linux/obd_class.h>
 
@@ -40,8 +44,10 @@ int rd_stripesize(char *page, char **start, off_t off, int count, int *eof,
                   void *data)
 {
         struct obd_device *dev = (struct obd_device *)data;
-        struct lov_desc *desc = &dev->u.lov.desc;
+        struct lov_desc *desc;
 
+        LASSERT(dev != NULL);
+        desc = &dev->u.lov.desc;
         *eof = 1;
         return snprintf(page, count, LPU64"\n", desc->ld_default_stripe_size);
 }
@@ -50,8 +56,10 @@ int rd_stripeoffset(char *page, char **start, off_t off, int count, int *eof,
                     void *data)
 {
         struct obd_device *dev = (struct obd_device *)data;
-        struct lov_desc *desc = &dev->u.lov.desc;
+        struct lov_desc *desc;
 
+        LASSERT(dev != NULL);
+        desc = &dev->u.lov.desc;
         *eof = 1;
         return snprintf(page, count, LPU64"\n", desc->ld_default_stripe_offset);
 }
@@ -60,8 +68,10 @@ int rd_stripetype(char *page, char **start, off_t off, int count, int *eof,
                   void *data)
 {
         struct obd_device* dev = (struct obd_device*)data;
-        struct lov_desc *desc = &dev->u.lov.desc;
+        struct lov_desc *desc;
 
+        LASSERT(dev != NULL);
+        desc = &dev->u.lov.desc;
         *eof = 1;
         return snprintf(page, count, "%u\n", desc->ld_pattern);
 }
@@ -70,8 +80,10 @@ int rd_stripecount(char *page, char **start, off_t off, int count, int *eof,
                    void *data)
 {
         struct obd_device *dev = (struct obd_device *)data;
-        struct lov_desc *desc = &dev->u.lov.desc;
+        struct lov_desc *desc;
 
+        LASSERT(dev != NULL);
+        desc = &dev->u.lov.desc;
         *eof = 1;
         return snprintf(page, count, "%u\n", desc->ld_default_stripe_count);
 }
@@ -80,8 +92,10 @@ int rd_numobd(char *page, char **start, off_t off, int count, int *eof,
               void *data)
 {
         struct obd_device *dev = (struct obd_device*)data;
-        struct lov_desc *desc = &dev->u.lov.desc;
+        struct lov_desc *desc;
 
+        LASSERT(dev != NULL);
+        desc = &dev->u.lov.desc;
         *eof = 1;
         return snprintf(page, count, "%u\n", desc->ld_tgt_count);
 
@@ -91,8 +105,10 @@ int rd_activeobd(char *page, char **start, off_t off, int count, int *eof,
                  void *data)
 {
         struct obd_device* dev = (struct obd_device*)data;
-        struct lov_desc *desc = &dev->u.lov.desc;
+        struct lov_desc *desc;
 
+        LASSERT(dev != NULL);
+        desc = &dev->u.lov.desc;
         *eof = 1;
         return snprintf(page, count, "%u\n", desc->ld_active_tgt_count);
 }
@@ -102,8 +118,13 @@ int rd_target(char *page, char **start, off_t off, int count, int *eof,
 {
         struct obd_device *dev = (struct obd_device*) data;
         int len = 0, i;
-        struct lov_obd *lov = &dev->u.lov;
-        struct lov_tgt_desc *tgts = lov->tgts;
+        struct lov_obd *lov;
+        struct lov_tgt_desc *tgts;
+        
+        LASSERT(dev != NULL);
+        lov = &dev->u.lov;
+        tgts = lov->tgts;
+        LASSERT(tgts != NULL);
 
         for (i = 0; i < lov->desc.ld_tgt_count; i++, tgts++) {
                 int cur;
@@ -120,8 +141,10 @@ int rd_target(char *page, char **start, off_t off, int count, int *eof,
 int rd_mdc(char *page, char **start, off_t off, int count, int *eof, void *data)
 {
         struct obd_device *dev = (struct obd_device*) data;
-        struct lov_obd *lov = &dev->u.lov;
+        struct lov_obd *lov;
 
+        LASSERT(dev != NULL);
+        lov = &dev->u.lov;
         *eof = 1;
         return snprintf(page, count, "%s\n", lov->mdcobd->obd_uuid.uuid);
 }
index f5b5b80..3f81507 100644 (file)
  */
 #define DEBUG_SUBSYSTEM S_CLASS
 
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <asm/statfs.h>
+#endif
 #include <linux/obd_class.h>
 #include <linux/lprocfs_status.h>
 
index 1fbd346..3553a45 100644 (file)
@@ -30,8 +30,7 @@
 #include <linux/obd_class.h>
 #include <linux/lustre_mds.h>
 
-extern struct semaphore mdc_sem;
-
+/* mdc_setattr does its own semaphore handling */
 static int mdc_reint(struct ptlrpc_request *request, int level)
 {
         int rc;
@@ -41,7 +40,6 @@ static int mdc_reint(struct ptlrpc_request *request, int level)
 
         if (!(*opcodeptr == REINT_SETATTR))
                 mdc_get_rpc_lock(&mdc_rpc_lock, NULL);
-
         rc = ptlrpc_queue_wait(request);
         if (!(*opcodeptr == REINT_SETATTR))
                 mdc_put_rpc_lock(&mdc_rpc_lock, NULL);
@@ -55,15 +53,24 @@ static int mdc_reint(struct ptlrpc_request *request, int level)
         return rc;
 }
 
+/* If mdc_setattr is called with a plain 'iattr' (ATTR_FROM_OPEN not set),
+ * it is a normal RPC that takes mdc_rpc_lock and goes to the normal portal.
+ *
+ * If it is called with iattr->ia_valid & ATTR_FROM_OPEN, then it is a
+ * magic open-path setattr that takes mdc_setattr_lock instead and goes
+ * to MDS_SETATTR_PORTAL. */
 int mdc_setattr(struct lustre_handle *conn, struct inode *inode,
                 struct iattr *iattr, void *ea, int ealen,
                 struct ptlrpc_request **request)
 {
         struct ptlrpc_request *req;
         struct mds_rec_setattr *rec;
+        struct mdc_rpc_lock *rpc_lock;
         int rc, bufcount = 1, size[2] = {sizeof(*rec), ealen};
         ENTRY;
 
+        LASSERT(iattr != NULL);
+
         if (ealen > 0)
                 bufcount = 2;
 
@@ -72,15 +79,21 @@ int mdc_setattr(struct lustre_handle *conn, struct inode *inode,
         if (!req)
                 RETURN(-ENOMEM);
 
-        /* XXX FIXME bug 249 */
-        req->rq_request_portal = MDS_GETATTR_PORTAL;
+        if (iattr->ia_valid & ATTR_FROM_OPEN) {
+                req->rq_request_portal = MDS_SETATTR_PORTAL; //XXX FIXME bug 249
+                rpc_lock = &mdc_setattr_lock;
+        } else
+                rpc_lock = &mdc_rpc_lock;
 
         mds_setattr_pack(req, inode, iattr, ea, ealen);
 
         size[0] = sizeof(struct mds_body);
         req->rq_replen = lustre_msg_size(1, size);
 
+        mdc_get_rpc_lock(rpc_lock, NULL);
         rc = mdc_reint(req, LUSTRE_CONN_FULL);
+        mdc_put_rpc_lock(rpc_lock, NULL);
+
         *request = req;
         if (rc == -ERESTARTSYS)
                 rc = 0;
index 43bf5e8..68075f5 100644 (file)
@@ -35,6 +35,7 @@
 
 extern int mds_queue_req(struct ptlrpc_request *);
 struct mdc_rpc_lock mdc_rpc_lock;
+struct mdc_rpc_lock mdc_setattr_lock;
 EXPORT_SYMBOL(mdc_rpc_lock);
 
 /* Helper that implements most of mdc_getstatus and signal_completed_replay. */
@@ -132,9 +133,6 @@ int mdc_getattr(struct lustre_handle *conn,
         if (!req)
                 GOTO(out, rc = -ENOMEM);
 
-        /* XXX FIXME bug 249 */
-        req->rq_request_portal = MDS_GETATTR_PORTAL;
-
         body = lustre_msg_buf(req->rq_reqmsg, 0);
         ll_ino2fid(&body->fid1, ino, 0, type);
         body->valid = valid;
@@ -247,10 +245,9 @@ static int mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
                 /* XXX what tells us that 'data' is a valid inode at all?
                  *     we should probably validate the lock handle first?
                  */
-
                 inode = igrab(inode);
 
-                if (inode == NULL)      /* inode->i_state & I_FREEING */
+                if (inode == NULL) /* inode->i_state & I_FREEING */
                         break;
 
                 if (S_ISDIR(inode->i_mode)) {
@@ -260,7 +257,8 @@ static int mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
                         ll_invalidate_inode_pages(inode);
                 }
 
-                if (inode != inode->i_sb->s_root->d_inode)
+                if (inode->i_sb->s_root && 
+                    inode != inode->i_sb->s_root->d_inode)
                         d_unhash_aliases(inode);
 
                 iput(inode);
@@ -319,9 +317,15 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type,
                 lit->opc = NTOH__u64((__u64)it->it_op);
 
                 /* pack the intended request */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
                 mds_open_pack(req, 2, dir, it->it_mode, 0, current->fsuid,
                               current->fsgid, CURRENT_TIME, it->it_flags,
                               de->d_name.name, de->d_name.len, tgt, tgtlen);
+#else
+                mds_open_pack(req, 2, dir, it->it_mode, 0, current->fsuid,
+                              current->fsgid, CURRENT_TIME.tv_sec, it->it_flags,
+                              de->d_name.name, de->d_name.len, tgt, tgtlen);
+#endif
                 req->rq_replen = lustre_msg_size(3, repsize);
         } else if (it->it_op & IT_UNLINK) {
                 size[2] = sizeof(struct mds_rec_unlink);
@@ -340,7 +344,7 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type,
                                 d->unl_de, d->unl_mode,
                                 d->unl_name, d->unl_len);
                 req->rq_replen = lustre_msg_size(3, repsize);
-        } else if (it->it_op & (IT_GETATTR| IT_SETATTR | IT_LOOKUP)) {
+        } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) {
                 int valid = OBD_MD_FLNOTOBD | OBD_MD_FLEASIZE;
                 size[2] = sizeof(struct mds_body);
                 size[3] = de->d_name.len + 1;
@@ -377,6 +381,7 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type,
                               lock_type, NULL, 0, lock_mode, &flags,
                               ldlm_completion_ast, mdc_blocking_ast, dir, NULL,
                               lockh);
+        mdc_put_rpc_lock(&mdc_rpc_lock, it);
 
         /* If we successfully created, mark the request so that replay will
          * do the right thing */
@@ -391,8 +396,6 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type,
                 lockreq->lock_flags |= LDLM_FL_INTENT_ONLY;
         }
 
-        dlm_rep = lustre_msg_buf(req->rq_repmsg, 0);
-
         /* This can go when we're sure that this can never happen */
         LASSERT(rc != -ENOENT);
         if (rc == ELDLM_LOCK_ABORTED) {
@@ -417,7 +420,7 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type,
                 /* The server almost certainly gave us a lock other than the
                  * one that we asked for.  If we already have a matching lock,
                  * then cancel this one--we don't need two. */
-                LDLM_DEBUG0(lock, "matching against this");
+                LDLM_DEBUG(lock, "matching against this");
 
                 memcpy(&lockh2, lockh, sizeof(lockh2));
                 if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL,
@@ -429,6 +432,7 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type,
                 LDLM_LOCK_PUT(lock);
         }
 
+        dlm_rep = lustre_msg_buf(req->rq_repmsg, 0);
         it->it_disposition = (int) dlm_rep->lock_policy_res1;
         it->it_status = (int) dlm_rep->lock_policy_res2;
         it->it_lock_mode = lock_mode;
@@ -487,6 +491,11 @@ static void mdc_replay_open(struct ptlrpc_request *req)
 
 void mdc_set_open_replay_data(struct ll_file_data *fd)
 {
+        struct ptlrpc_request *req = fd->fd_req;
+        struct mds_rec_create *rec = lustre_msg_buf(req->rq_reqmsg, 2);
+        struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 1);
+
+        memcpy(&rec->cr_replayfid, &body->fid1, sizeof rec->cr_replayfid);
         fd->fd_req->rq_replay_cb = mdc_replay_open;
         fd->fd_req->rq_replay_data = &fd->fd_mdshandle;
 }
@@ -510,7 +519,9 @@ int mdc_close(struct lustre_handle *conn, obd_id ino, int type,
 
         req->rq_replen = lustre_msg_size(0, NULL);
 
+        mdc_get_rpc_lock(&mdc_rpc_lock, NULL);
         rc = ptlrpc_queue_wait(req);
+        mdc_put_rpc_lock(&mdc_rpc_lock, NULL);
 
         EXIT;
  out:
@@ -528,7 +539,6 @@ int mdc_readpage(struct lustre_handle *conn, obd_id ino, int type, __u64 offset,
         struct ptlrpc_bulk_desc *desc = NULL;
         struct ptlrpc_bulk_page *bulk = NULL;
         struct mds_body *body;
-        unsigned long flags;
         int rc, size = sizeof(*body);
         ENTRY;
 
@@ -542,13 +552,14 @@ int mdc_readpage(struct lustre_handle *conn, obd_id ino, int type, __u64 offset,
         if (!req)
                 GOTO(out2, rc = -ENOMEM);
 
+        /* XXX FIXME bug 249 */
+        req->rq_request_portal = MDS_READPAGE_PORTAL;
+
         bulk = ptlrpc_prep_bulk_page(desc);
         if (bulk == NULL)
                 GOTO(out2, rc = -ENOMEM);
 
-        spin_lock_irqsave(&imp->imp_lock, flags);
-        bulk->bp_xid = ++imp->imp_last_bulk_xid;
-        spin_unlock_irqrestore(&imp->imp_lock, flags);
+        bulk->bp_xid = ptlrpc_next_xid();
         bulk->bp_buflen = PAGE_CACHE_SIZE;
         bulk->bp_buf = addr;
 
@@ -684,10 +695,10 @@ static int mdc_recover(struct obd_import *imp, int phase)
                         if (rc)
                                 GOTO(check_rc, rc);
                 } else if (flags & MSG_CONNECT_RECONNECT) {
-                        DEBUG_REQ(D_HA, req, "reconnecting to MDS\n");
+                        DEBUG_REQ(D_HA, req, "reconnecting to MDS");
                         /* Nothing else to do here. */
                 } else {
-                        DEBUG_REQ(D_HA, req, "evicted: invalidating\n");
+                        DEBUG_REQ(D_HA, req, "evicted: invalidating");
                         /* Otherwise, clean everything up. */
                         ldlm_namespace_cleanup(ns, 1);
                         ptlrpc_abort_inflight(imp, 0);
@@ -696,7 +707,6 @@ static int mdc_recover(struct obd_import *imp, int phase)
                 ptlrpc_req_finished(req);
                 spin_lock_irqsave(&imp->imp_lock, flags);
                 imp->imp_level = LUSTRE_CONN_FULL;
-                imp->imp_flags &= ~IMP_INVALID;
                 spin_unlock_irqrestore(&imp->imp_lock, flags);
 
                 ptlrpc_wake_delayed(imp);
@@ -743,6 +753,7 @@ static int __init ptlrpc_request_init(void)
 {
         struct lprocfs_static_vars lvars;
         mdc_init_rpc_lock(&mdc_rpc_lock);
+        mdc_init_rpc_lock(&mdc_setattr_lock);
         lprocfs_init_vars(&lvars);
         return class_register_type(&mdc_obd_ops, lvars.module_vars,
                                    LUSTRE_MDC_NAME);
index 2be4362..58cfa20 100644 (file)
 #include <linux/init.h>
 #include <linux/obd_class.h>
 #include <linux/random.h>
-#include <linux/locks.h>
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/workqueue.h>
+#include <linux/mount.h>
+#else 
+#include <linux/locks.h>
 #endif
 #include <linux/obd_lov.h>
 #include <linux/lustre_mds.h>
@@ -50,9 +53,8 @@ kmem_cache_t *mds_file_cache;
 extern int mds_get_lovtgts(struct mds_obd *obd, int tgt_count,
                            struct obd_uuid *uuidarray);
 extern int mds_get_lovdesc(struct mds_obd  *obd, struct lov_desc *desc);
-extern void mds_start_transno(struct mds_obd *mds);
-extern int mds_finish_transno(struct mds_obd *mds, void *handle,
-                              struct ptlrpc_request *req, int rc);
+int mds_finish_transno(struct mds_obd *mds, struct inode *i, void *handle,
+                       struct ptlrpc_request *req, int rc, int disp);
 static int mds_cleanup(struct obd_device * obddev);
 
 inline struct mds_obd *mds_req2mds(struct ptlrpc_request *req)
@@ -232,7 +234,6 @@ struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid,
         return result;
 }
 
-static void mds_abort_recovery(void *data);
 
 /* Establish a connection to the MDS.
  *
@@ -247,7 +248,6 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd,
         struct obd_export *exp;
         struct mds_export_data *med;
         struct mds_client_data *mcd;
-        struct mds_obd *mds = &obd->u.mds;
         int rc;
         ENTRY;
 
@@ -255,10 +255,10 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd,
                 RETURN(-EINVAL);
 
         /* Check for aborted recovery. */
-        spin_lock_bh(&mds->mds_processing_task_lock);
+        spin_lock_bh(&obd->obd_processing_task_lock);
         if (obd->obd_flags & OBD_ABORT_RECOVERY)
-                mds_abort_recovery(mds);
-        spin_unlock_bh(&mds->mds_processing_task_lock);
+                target_abort_recovery(obd);
+        spin_unlock_bh(&obd->obd_processing_task_lock);
 
         /* XXX There is a small race between checking the list and adding a
          * new connection for the same UUID, but the real threat (list
@@ -317,8 +317,10 @@ inline int mds_close_mfd(struct mds_file_data *mfd, struct mds_export_data *med)
         mfd->mfd_servercookie = DEAD_HANDLE_MAGIC;
         kmem_cache_free(mds_file_cache, mfd);
 
-        if (file->f_dentry->d_parent)
+        if (file->f_dentry->d_parent) {
+                LASSERT(atomic_read(&file->f_dentry->d_parent->d_count));
                 de = dget(file->f_dentry->d_parent);
+        }
         rc = filp_close(file, 0);
         if (de)
                 l_dput(de);
@@ -350,6 +352,13 @@ static int mds_disconnect(struct lustre_handle *conn)
         spin_unlock(&med->med_open_lock);
 
         ldlm_cancel_locks_for_export(export);
+        if (med->med_outstanding_reply) {
+                /* Fake the ack, so the locks get cancelled. */
+                med->med_outstanding_reply->rq_flags &= ~PTL_RPC_FL_WANT_ACK;
+                med->med_outstanding_reply->rq_flags |= PTL_RPC_FL_ERR;
+                wake_up(&med->med_outstanding_reply->rq_wait_for_rep);
+                med->med_outstanding_reply = NULL;
+        }
         mds_client_free(export);
 
         rc = class_disconnect(conn);
@@ -481,14 +490,14 @@ int mds_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
                 struct lustre_handle lockh;
                 int rc;
 
-                LDLM_DEBUG0(lock, "already unused, calling ldlm_cli_cancel");
+                LDLM_DEBUG(lock, "already unused, calling ldlm_cli_cancel");
                 ldlm_lock2handle(lock, &lockh);
                 rc = ldlm_cli_cancel(&lockh);
                 if (rc < 0)
                         CERROR("ldlm_cli_cancel: %d\n", rc);
         } else {
-                LDLM_DEBUG0(lock, "Lock still has references, will be "
-                            "cancelled later");
+                LDLM_DEBUG(lock, "Lock still has references, will be "
+                           "cancelled later");
         }
         RETURN(0);
 }
@@ -532,7 +541,7 @@ int mds_pack_md(struct obd_device *obd, struct lustre_msg *msg,
                 rc = 0;
         }
 
-        return rc;
+        RETURN(rc);
 }
 
 static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry,
@@ -623,11 +632,68 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct inode *inode,
         return(rc);
 }
 
+/* Resend path: more copy-and-paste from mds_getattr_name than I'd like. */
+static void reconstruct_getattr_name(int offset, struct ptlrpc_request *req,
+                                     struct lustre_handle *client_lockh)
+{
+        struct mds_export_data *med = &req->rq_export->exp_mds_data;
+        struct mds_client_data *mcd = med->med_mcd;
+        struct obd_device *obd = req->rq_export->exp_obd;
+        struct mds_obd *mds = mds_req2mds(req);
+        struct dentry *parent, *child;
+        struct mds_body *body;
+        struct inode *dir;
+        struct obd_run_ctxt saved;
+        struct obd_ucred uc;
+        int namelen, rc = 0;
+        char *name;
+
+        req->rq_transno = mcd->mcd_last_transno; /* saved per-client values */
+        req->rq_status = mcd->mcd_last_result;
+
+        if (med->med_outstanding_reply)
+                mds_steal_ack_locks(med, req);
+
+        if (req->rq_status)
+                return; /* saved status is non-zero: no reply body to rebuild */
+
+        body = lustre_msg_buf(req->rq_reqmsg, offset);
+        name = lustre_msg_buf(req->rq_reqmsg, offset + 1);
+        namelen = req->rq_reqmsg->buflens[offset + 1];
+        /* requests were at offset 2, replies go back at 1 */
+        if (offset)
+                offset = 1;
+
+        uc.ouc_fsuid = body->fsuid;
+        uc.ouc_fsgid = body->fsgid;
+        uc.ouc_cap = body->capability;
+        uc.ouc_suppgid1 = body->suppgid;
+        uc.ouc_suppgid2 = -1;
+        push_ctxt(&saved, &mds->mds_ctxt, &uc);
+        parent = mds_fid2dentry(mds, &body->fid1, NULL);
+        LASSERT(!IS_ERR(parent));
+        dir = parent->d_inode;
+        LASSERT(dir);
+        child = lookup_one_len(name, parent, namelen - 1);
+        LASSERT(!IS_ERR(child));
+
+        if (!med->med_outstanding_reply) {
+                /* XXX need to enqueue client lock; client_lockh is unused */
+                LBUG();
+        }
+
+        if (req->rq_repmsg == NULL) /* only pack a reply if none exists yet */
+                mds_getattr_pack_msg(req, child->d_inode, offset);
+        
+        rc = mds_getattr_internal(obd, child, req, body, offset);
+        LASSERT(!rc);
+        l_dput(child);
+        l_dput(parent); /* NOTE(review): no pop_ctxt() for push_ctxt() above? */
+}
+
 static int mds_getattr_name(int offset, struct ptlrpc_request *req,
                             struct lustre_handle *child_lockh)
 {
-        struct ldlm_intent *it = lustre_msg_buf(req->rq_reqmsg, 1);
-        int lock_mode;
         struct mds_obd *mds = mds_req2mds(req);
         struct obd_device *obd = req->rq_export->exp_obd;
         struct obd_run_ctxt saved;
@@ -637,15 +703,18 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req,
         struct obd_ucred uc;
         struct ldlm_res_id child_res_id = { .name = {0} };
         struct lustre_handle parent_lockh;
-        int namelen, flags = 0, rc = 0;
+        int namelen, flags = 0, rc = 0, cleanup_phase = 0;
         char *name;
         ENTRY;
 
         LASSERT(!strcmp(obd->obd_type->typ_name, "mds"));
 
+        MDS_CHECK_RESENT(req, 
+                         reconstruct_getattr_name(offset, req, child_lockh));
+
         if (req->rq_reqmsg->bufcount <= offset + 1) {
                 LBUG();
-                GOTO(out_pre_de, rc = -EINVAL);
+                GOTO(cleanup, rc = -EINVAL);
         }
 
         body = lustre_msg_buf(req->rq_reqmsg, offset);
@@ -658,58 +727,75 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req,
         uc.ouc_fsuid = body->fsuid;
         uc.ouc_fsgid = body->fsgid;
         uc.ouc_cap = body->capability;
-        uc.ouc_suppgid = body->suppgid;
+        uc.ouc_suppgid1 = body->suppgid;
+        uc.ouc_suppgid2 = -1;
         push_ctxt(&saved, &mds->mds_ctxt, &uc);
         /* Step 1: Lookup/lock parent */
         de = mds_fid2locked_dentry(obd, &body->fid1, NULL, LCK_PR,
                                    &parent_lockh);
         if (IS_ERR(de))
-                GOTO(out_pre_de, rc = PTR_ERR(de));
+                GOTO(cleanup, rc = PTR_ERR(de));
         dir = de->d_inode;
         LASSERT(dir);
 
+        cleanup_phase = 1; /* parent dentry and lock */
+
         CDEBUG(D_INODE, "parent ino %lu, name %*s\n", dir->i_ino,namelen,name);
 
         /* Step 2: Lookup child */
         dchild = lookup_one_len(name, de, namelen - 1);
         if (IS_ERR(dchild)) {
                 CDEBUG(D_INODE, "child lookup error %ld\n", PTR_ERR(dchild));
-                GOTO(out_step_1, rc = PTR_ERR(dchild));
-        } else if (dchild->d_inode == NULL) {
-                GOTO(out_step_2, rc = -ENOENT);
+                GOTO(cleanup, rc = PTR_ERR(dchild));
+        }
+
+        cleanup_phase = 2; /* child dentry */
+
+        if (dchild->d_inode == NULL) {
+                GOTO(cleanup, rc = -ENOENT);
         }
 
         /* Step 3: Lock child */
-        if (it->opc == IT_SETATTR)
-                lock_mode = LCK_PW;
-        else
-                lock_mode = LCK_PR;
         child_res_id.name[0] = dchild->d_inode->i_ino;
         child_res_id.name[1] = dchild->d_inode->i_generation;
         rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL,
-                              child_res_id, LDLM_PLAIN, NULL, 0, lock_mode,
+                              child_res_id, LDLM_PLAIN, NULL, 0, LCK_PR,
                               &flags, ldlm_completion_ast, mds_blocking_ast,
                               NULL, NULL, child_lockh);
         if (rc != ELDLM_OK) {
                 CERROR("ldlm_cli_enqueue: %d\n", rc);
-                GOTO(out_step_2, rc = -EIO);
+                GOTO(cleanup, rc = -EIO);
         }
 
+        cleanup_phase = 3; /* child lock */
+
         if (req->rq_repmsg == NULL)
                 mds_getattr_pack_msg(req, dchild->d_inode, offset);
 
         rc = mds_getattr_internal(obd, dchild, req, body, offset);
-        if (rc)
-                GOTO(out_step_3, rc);
-        GOTO(out_step_2, rc); /* returns the lock to the client */
- out_step_3:
-        ldlm_lock_decref(child_lockh, LCK_PR);
- out_step_2:
-        l_dput(dchild);
- out_step_1:
-        ldlm_lock_decref(&parent_lockh, LCK_PR);
-        l_dput(de);
- out_pre_de:
+        GOTO(cleanup, rc); /* returns the lock to the client */
+        
+ cleanup:
+        rc = mds_finish_transno(mds, dchild ? dchild->d_inode : NULL, NULL,
+                                req, rc, 0);
+        switch (cleanup_phase) {
+        case 3:
+                if (rc)
+                        ldlm_lock_decref(child_lockh, LCK_PR);
+        case 2:
+                l_dput(dchild);
+
+        case 1:
+                if (rc) {
+                        ldlm_lock_decref(&parent_lockh, LCK_PR);
+                } else {
+                        memcpy(&req->rq_ack_locks[0].lock, &parent_lockh,
+                               sizeof(parent_lockh));
+                        req->rq_ack_locks[0].mode = LCK_PR;
+                }
+                l_dput(de);
+        default: ;
+        }
         req->rq_status = rc;
         pop_ctxt(&saved, &mds->mds_ctxt, &uc);
         return rc;
@@ -822,20 +908,14 @@ static int mds_store_md(struct mds_obd *mds, struct ptlrpc_request *req,
         uc.ouc_fsgid = body->fsgid;
         uc.ouc_cap = body->capability;
         push_ctxt(&saved, &mds->mds_ctxt, &uc);
-        mds_start_transno(mds);
         handle = fsfilt_start(obd, inode, FSFILT_OP_SETATTR);
         if (IS_ERR(handle)) {
                 rc = PTR_ERR(handle);
-                mds_finish_transno(mds, handle, req, rc);
                 GOTO(out_ea, rc);
         }
 
         rc = fsfilt_set_md(obd, inode,handle,lmm,lmm_size);
-        rc = mds_finish_transno(mds, handle, req, rc);
-
-        rc2 = fsfilt_commit(obd, inode, handle);
-        if (rc2 && !rc)
-                rc = rc2;
+        rc = mds_finish_transno(mds, inode, handle, req, rc, 0);
 out_ea:
         pop_ctxt(&saved, &mds->mds_ctxt, &uc);
 
@@ -844,6 +924,20 @@ out_ea:
 
 #endif
 
+static void reconstruct_close(struct ptlrpc_request *req)
+{
+        struct mds_export_data *med = &req->rq_export->exp_mds_data;
+        struct mds_client_data *mcd = med->med_mcd;
+
+        req->rq_transno = mcd->mcd_last_transno;
+        req->rq_status = mcd->mcd_last_result;
+
+        /* XXX When open-unlink is working, we'll need to steal ack locks as
+         * XXX well, and make sure that we do the right unlinking after we
+         * XXX get the ack back.
+         */
+}
+
 static int mds_close(struct ptlrpc_request *req)
 {
         struct mds_export_data *med = &req->rq_export->exp_mds_data;
@@ -852,6 +946,8 @@ static int mds_close(struct ptlrpc_request *req)
         int rc;
         ENTRY;
 
+        MDS_CHECK_RESENT(req, reconstruct_close(req));
+
         body = lustre_msg_buf(req->rq_reqmsg, 0);
 
         mfd = mds_handle2mfd(&body->handle);
@@ -956,199 +1052,8 @@ int mds_reint(struct ptlrpc_request *req, int offset,
         return rc;
 }
 
-/* forward declaration */
-int mds_handle(struct ptlrpc_request *req);
-
-static void abort_delayed_replies(struct mds_obd *mds)
-{
-        struct ptlrpc_request *req;
-        struct list_head *tmp, *n;
-        list_for_each_safe(tmp, n, &mds->mds_delayed_reply_queue) {
-                req = list_entry(tmp, struct ptlrpc_request, rq_list);
-                DEBUG_REQ(D_ERROR, req, "aborted:");
-                req->rq_status = -ENOTCONN;
-                req->rq_type = PTL_RPC_MSG_ERR;
-                ptlrpc_reply(req->rq_svc, req);
-                list_del(&req->rq_list);
-                OBD_FREE(req, sizeof *req);
-        }
-}
-
-static void mds_abort_recovery(void *data)
-{
-        struct mds_obd *mds = data;
-        struct obd_device *obd = list_entry(mds, struct obd_device, u.mds);
-        CERROR("disconnecting clients and aborting recovery\n");
-        mds->mds_recoverable_clients = 0;
-        obd->obd_flags &= ~(OBD_RECOVERING | OBD_ABORT_RECOVERY);
-        abort_delayed_replies(mds);
-        spin_unlock_bh(&mds->mds_processing_task_lock);
-        class_disconnect_all(obd);
-        spin_lock_bh(&mds->mds_processing_task_lock);
-}
-
-static void mds_recovery_expired(unsigned long castmeharder)
-{
-        struct mds_obd *mds = (struct mds_obd *)castmeharder;
-        struct obd_device *obd = list_entry(mds, struct obd_device, u.mds);
-        CERROR("recovery timed out, aborting\n");
-        spin_lock_bh(&mds->mds_processing_task_lock);
-        obd->obd_flags |= OBD_ABORT_RECOVERY;
-        wake_up(&mds->mds_next_transno_waitq);
-        spin_unlock_bh(&mds->mds_processing_task_lock);
-}
-
-static void reset_recovery_timer(struct mds_obd *mds)
-{
-        CDEBUG(D_ERROR, "timer will expire in %ld seconds\n",
-               MDS_RECOVERY_TIMEOUT / HZ);
-        mod_timer(&mds->mds_recovery_timer, jiffies + MDS_RECOVERY_TIMEOUT);
-}
-
-static void start_recovery_timer(struct mds_obd *mds)
-{
-        mds->mds_recovery_timer.function = mds_recovery_expired;
-        mds->mds_recovery_timer.data = (unsigned long)mds;
-        init_timer(&mds->mds_recovery_timer);
-        reset_recovery_timer(mds);
-}
-
-static void cancel_recovery_timer(struct mds_obd *mds)
-{
-        del_timer(&mds->mds_recovery_timer);
-}
-
-static int check_for_next_transno(struct mds_obd *mds)
-{
-        struct ptlrpc_request *req;
-        struct obd_device *obd = list_entry(mds, struct obd_device, u.mds);
-        req = list_entry(mds->mds_recovery_queue.next,
-                         struct ptlrpc_request, rq_list);
-        LASSERT(req->rq_reqmsg->transno >= mds->mds_next_recovery_transno);
-
-        return req->rq_reqmsg->transno == mds->mds_next_recovery_transno ||
-                (obd->obd_flags & OBD_RECOVERING) == 0;
-}
-
-static void process_recovery_queue(struct mds_obd *mds)
-{
-        struct ptlrpc_request *req;
-        struct obd_device *obd = list_entry(mds, struct obd_device, u.mds);
-        int aborted = 0;
-        ENTRY;
-
-        for (;;) {
-                spin_lock_bh(&mds->mds_processing_task_lock);
-                LASSERT(mds->mds_processing_task == current->pid);
-                req = list_entry(mds->mds_recovery_queue.next,
-                                 struct ptlrpc_request, rq_list);
-
-                if (req->rq_reqmsg->transno != mds->mds_next_recovery_transno) {
-                        spin_unlock_bh(&mds->mds_processing_task_lock);
-                        CDEBUG(D_HA, "Waiting for transno "LPD64" (1st is "
-                               LPD64")\n",
-                               mds->mds_next_recovery_transno,
-                               req->rq_reqmsg->transno);
-                        wait_event(mds->mds_next_transno_waitq,
-                                   check_for_next_transno(mds));
-                        spin_lock_bh(&mds->mds_processing_task_lock);
-                        if (obd->obd_flags & OBD_ABORT_RECOVERY) {
-                                mds_abort_recovery(mds);
-                                aborted = 1;
-                        }
-                        spin_unlock_bh(&mds->mds_processing_task_lock);
-                        if (aborted)
-                                return;
-                        continue;
-                }
-                list_del_init(&req->rq_list);
-                spin_unlock_bh(&mds->mds_processing_task_lock);
-
-                DEBUG_REQ(D_ERROR, req, "processing: ");
-                (void)mds_handle(req);
-                reset_recovery_timer(mds);
-                mds_fsync_super(mds->mds_sb);
-                OBD_FREE(req, sizeof *req);
-                spin_lock_bh(&mds->mds_processing_task_lock);
-                mds->mds_next_recovery_transno++;
-                if (list_empty(&mds->mds_recovery_queue)) {
-                        mds->mds_processing_task = 0;
-                        spin_unlock_bh(&mds->mds_processing_task_lock);
-                        break;
-                }
-                spin_unlock_bh(&mds->mds_processing_task_lock);
-        }
-        EXIT;
-}
-
-static int queue_recovery_request(struct ptlrpc_request *req,
-                                  struct mds_obd *mds)
-{
-        struct list_head *tmp;
-        int inserted = 0;
-        __u64 transno = req->rq_reqmsg->transno;
-        struct ptlrpc_request *saved_req;
-
-        if (!transno) {
-                INIT_LIST_HEAD(&req->rq_list);
-                DEBUG_REQ(D_HA, req, "not queueing");
-                return 1;
-        }
-
-        spin_lock_bh(&mds->mds_processing_task_lock);
-
-        if (mds->mds_processing_task == current->pid) {
-                /* Processing the queue right now, don't re-add. */
-                LASSERT(list_empty(&req->rq_list));
-                spin_unlock_bh(&mds->mds_processing_task_lock);
-                return 1;
-        }
-
-        OBD_ALLOC(saved_req, sizeof *saved_req);
-        if (!saved_req)
-                LBUG();
-        memcpy(saved_req, req, sizeof *req);
-        req = saved_req;
-        INIT_LIST_HEAD(&req->rq_list);
-
-        /* XXX O(n^2) */
-        list_for_each(tmp, &mds->mds_recovery_queue) {
-                struct ptlrpc_request *reqiter =
-                        list_entry(tmp, struct ptlrpc_request, rq_list);
-
-                if (reqiter->rq_reqmsg->transno > transno) {
-                        list_add_tail(&req->rq_list, &reqiter->rq_list);
-                        inserted = 1;
-                        break;
-                }
-        }
-
-        if (!inserted) {
-                list_add_tail(&req->rq_list, &mds->mds_recovery_queue);
-        }
-
-        if (mds->mds_processing_task != 0) {
-                /* Someone else is processing this queue, we'll leave it to
-                 * them.
-                 */
-                if (transno == mds->mds_next_recovery_transno)
-                        wake_up(&mds->mds_next_transno_waitq);
-                spin_unlock_bh(&mds->mds_processing_task_lock);
-                return 0;
-        }
-
-        /* Nobody is processing, and we know there's (at least) one to process
-         * now, so we'll do the honours.
-         */
-        mds->mds_processing_task = current->pid;
-        spin_unlock_bh(&mds->mds_processing_task_lock);
-
-        process_recovery_queue(mds);
-        return 0;
-}
-
 static int filter_recovery_request(struct ptlrpc_request *req,
-                                   struct mds_obd *mds, int *process)
+                                   struct obd_device *obd, int *process)
 {
         switch (req->rq_reqmsg->opc) {
         case MDS_CONNECT: /* This will never get here, but for completeness. */
@@ -1160,7 +1065,7 @@ static int filter_recovery_request(struct ptlrpc_request *req,
         case MDS_GETSTATUS: /* used in unmounting */
         case MDS_REINT:
         case LDLM_ENQUEUE:
-                *process = queue_recovery_request(req, mds);
+                *process = target_queue_recovery_request(req, obd);
                 RETURN(0);
 
         default:
@@ -1171,48 +1076,6 @@ static int filter_recovery_request(struct ptlrpc_request *req,
         }
 }
 
-static int mds_queue_final_reply(struct ptlrpc_request *req, int rc)
-{
-        struct mds_obd *mds = mds_req2mds(req);
-        struct obd_device *mds_obd = list_entry(mds, struct obd_device, u.mds);
-        struct ptlrpc_request *saved_req;
-
-        spin_lock_bh(&mds->mds_processing_task_lock);
-        if (rc) {
-                /* Just like ptlrpc_error, but without the sending. */
-                lustre_pack_msg(0, NULL, NULL, &req->rq_replen,
-                                &req->rq_repmsg);
-                req->rq_type = PTL_RPC_MSG_ERR;
-        }
-
-        LASSERT(list_empty(&req->rq_list));
-        OBD_ALLOC(saved_req, sizeof *saved_req);
-        memcpy(saved_req, req, sizeof *saved_req);
-        req = saved_req;
-        list_add(&req->rq_list, &mds->mds_delayed_reply_queue);
-        if (--mds->mds_recoverable_clients == 0) {
-                struct list_head *tmp, *n;
-                ldlm_reprocess_all_ns(req->rq_export->exp_obd->obd_namespace);
-                CDEBUG(D_ERROR,
-                       "all clients recovered, sending delayed replies\n");
-                mds_obd->obd_flags &= ~OBD_RECOVERING;
-                list_for_each_safe(tmp, n, &mds->mds_delayed_reply_queue) {
-                        req = list_entry(tmp, struct ptlrpc_request, rq_list);
-                        DEBUG_REQ(D_ERROR, req, "delayed:");
-                        ptlrpc_reply(req->rq_svc, req);
-                        list_del(&req->rq_list);
-                        OBD_FREE(req, sizeof *req);
-                }
-                cancel_recovery_timer(mds);
-        } else {
-                CERROR("%d recoverable clients remain\n",
-                       mds->mds_recoverable_clients);
-        }
-
-        spin_unlock_bh(&mds->mds_processing_task_lock);
-        return 1;
-}
-
 static char *reint_names[] = {
         [REINT_SETATTR] "setattr",
         [REINT_CREATE]  "create",
@@ -1222,11 +1085,91 @@ static char *reint_names[] = {
         [REINT_OPEN]    "open",
 };
 
+void mds_steal_ack_locks(struct mds_export_data *med,
+                         struct ptlrpc_request *req)
+{
+        struct ptlrpc_request *oldrep = med->med_outstanding_reply;
+        memcpy(req->rq_ack_locks, oldrep->rq_ack_locks,
+               sizeof req->rq_ack_locks);
+        oldrep->rq_flags |= PTL_RPC_FL_RESENT;
+        wake_up(&oldrep->rq_wait_for_rep);
+        DEBUG_REQ(D_HA, oldrep, "stole locks from");
+        DEBUG_REQ(D_HA, req, "stole locks for");
+}
+
+static void mds_send_reply(struct ptlrpc_request *req, int rc)
+{
+        int i;
+        struct ptlrpc_req_ack_lock *ack_lock;
+        struct l_wait_info lwi;
+        struct mds_export_data *med =
+                (req->rq_export && req->rq_ack_locks[0].mode) ?
+                &req->rq_export->exp_mds_data : NULL;
+
+        if (med) {
+                med->med_outstanding_reply = req;
+                req->rq_flags |= PTL_RPC_FL_WANT_ACK;
+                init_waitqueue_head(&req->rq_wait_for_rep);
+        }
+
+        if (!OBD_FAIL_CHECK(OBD_FAIL_MDS_ALL_REPLY_NET | OBD_FAIL_ONCE)) {
+                if (rc) {
+                        DEBUG_REQ(D_ERROR, req, "processing error (%d)", rc);
+                        ptlrpc_error(req->rq_svc, req);
+                } else {
+                        DEBUG_REQ(D_NET, req, "sending reply");
+                        ptlrpc_reply(req->rq_svc, req);
+                }
+        } else {
+                obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED;
+                DEBUG_REQ(D_ERROR, req, "dropping reply");
+                if (!med && req->rq_repmsg)
+                        OBD_FREE(req->rq_repmsg, req->rq_replen);
+        }
+
+        if (!med) {
+                DEBUG_REQ(D_HA, req, "not waiting for ack");
+                return;
+        }
+
+        lwi = LWI_TIMEOUT(obd_timeout / 2 * HZ, NULL, NULL);
+        rc = l_wait_event(req->rq_wait_for_rep, 
+                          (req->rq_flags & PTL_RPC_FL_WANT_ACK) == 0 ||
+                          (req->rq_flags & PTL_RPC_FL_RESENT),
+                          &lwi);
+
+        if (req->rq_flags & PTL_RPC_FL_RESENT) {
+                /* The client resent this request, so abort waiting
+                 * for the Portals ACK of our reply, and don't decref
+                 * the ack locks.
+                 */
+                DEBUG_REQ(D_HA, req, "resent: not cancelling locks");
+                ptlrpc_abort(req);
+                return;
+        }
+
+        if (rc == -ETIMEDOUT) {
+                ptlrpc_abort(req);
+                recovd_conn_fail(req->rq_export->exp_connection);
+                DEBUG_REQ(D_HA, req, "cancelling locks for timeout");
+        } else {
+                DEBUG_REQ(D_HA, req, "cancelling locks for ack");
+        }
+        
+        med->med_outstanding_reply = NULL;
+        
+        for (ack_lock = req->rq_ack_locks, i = 0; i < 4; i++, ack_lock++) {
+                if (!ack_lock->mode)
+                        break;
+                ldlm_lock_decref(&ack_lock->lock, ack_lock->mode);
+        }
+}
+
 int mds_handle(struct ptlrpc_request *req)
 {
         int should_process, rc;
         struct mds_obd *mds = NULL; /* quell gcc overwarning */
-        struct obd_device *mds_obd = NULL;
+        struct obd_device *obd = NULL;
         ENTRY;
 
         rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen);
@@ -1247,37 +1190,25 @@ int mds_handle(struct ptlrpc_request *req)
                 }
 
                 med = &req->rq_export->exp_mds_data;
-                mds_obd = req->rq_export->exp_obd;
-                mds = &mds_obd->u.mds;
-                spin_lock_bh(&mds->mds_processing_task_lock);
-                if (mds_obd->obd_flags & OBD_ABORT_RECOVERY)
-                        mds_abort_recovery(mds);
-                spin_unlock_bh(&mds->mds_processing_task_lock);
-
-                if (mds_obd->obd_flags & OBD_RECOVERING) {
-                        rc = filter_recovery_request(req, mds, &should_process);
+                obd = req->rq_export->exp_obd;
+                mds = &obd->u.mds;
+                spin_lock_bh(&obd->obd_processing_task_lock);
+                if (obd->obd_flags & OBD_ABORT_RECOVERY)
+                        target_abort_recovery(obd);
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+
+                if (obd->obd_flags & OBD_RECOVERING) {
+                        rc = filter_recovery_request(req, obd, &should_process);
                         if (rc || !should_process)
                                 RETURN(rc);
-                } else if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) {
-                        if (req->rq_xid == med->med_last_xid) {
-                                DEBUG_REQ(D_HA, req, "resending reply");
-                                OBD_ALLOC(req->rq_repmsg, med->med_last_replen);
-                                req->rq_replen = med->med_last_replen;
-                                memcpy(req->rq_repmsg, med->med_last_reply,
-                                       req->rq_replen);
-                                ptlrpc_reply(req->rq_svc, req);
-                                return 0;
-                        }
-                        DEBUG_REQ(D_HA, req, "no reply for resend, continuing");
                 }
-
         }
 
         switch (req->rq_reqmsg->opc) {
         case MDS_CONNECT:
                 DEBUG_REQ(D_INODE, req, "connect");
                 OBD_FAIL_RETURN(OBD_FAIL_MDS_CONNECT_NET, 0);
-                rc = target_handle_connect(req);
+                rc = target_handle_connect(req, mds_handle);
                 /* Make sure that last_rcvd is correct. */
                 if (!rc) {
                         /* Now that we have an export, set mds. */
@@ -1317,8 +1248,14 @@ int mds_handle(struct ptlrpc_request *req)
                 struct lustre_handle lockh;
                 DEBUG_REQ(D_INODE, req, "getattr_name");
                 OBD_FAIL_RETURN(OBD_FAIL_MDS_GETATTR_NAME_NET, 0);
+
+                /* If this request gets a reconstructed reply, we won't be
+                 * acquiring any new locks in mds_getattr_name, so zero the
+                 * handle and only decref it below if it was filled in.
+                 */
+                lockh.addr = 0;
                 rc = mds_getattr_name(0, req, &lockh);
-                if (rc == 0)
+                if (rc == 0 && lockh.addr)
                         ldlm_lock_decref(&lockh, LCK_PR);
                 break;
         }
@@ -1329,7 +1266,7 @@ int mds_handle(struct ptlrpc_request *req)
                 break;
 
         case MDS_READPAGE:
-                DEBUG_REQ(D_INODE, req, "readpage\n");
+                DEBUG_REQ(D_INODE, req, "readpage");
                 OBD_FAIL_RETURN(OBD_FAIL_MDS_READPAGE_NET, 0);
                 rc = mds_readpage(req);
 
@@ -1408,49 +1345,23 @@ int mds_handle(struct ptlrpc_request *req)
                         DEBUG_REQ(D_IOCTL, req,
                                   "not sending last_committed update");
                 }
-                CDEBUG(D_INFO, "last_transno %Lu, last_committed %Lu, xid %d\n",
-                       (unsigned long long)mds->mds_last_rcvd,
-                       (unsigned long long)obd->obd_last_committed,
-                       cpu_to_le32(req->rq_xid));
+                CDEBUG(D_INFO, "last_transno "LPU64", last_committed "LPU64
+                       ", xid "LPU64"\n",
+                       mds->mds_last_transno, obd->obd_last_committed,
+                       NTOH__u64(req->rq_xid));
         }
  out:
 
         if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LAST_REPLAY) {
-                if (mds_obd && (mds_obd->obd_flags & OBD_RECOVERING)) {
+                if (obd && (obd->obd_flags & OBD_RECOVERING)) {
                         DEBUG_REQ(D_HA, req, "LAST_REPLAY, queuing reply");
-                        return mds_queue_final_reply(req, rc);
+                        return target_queue_final_reply(req, rc);
                 }
                 /* Lost a race with recovery; let the error path DTRT. */
                 rc = req->rq_status = -ENOTCONN;
         }
 
-        if (req->rq_export && mds_obd &&
-            (mds_obd->obd_flags & OBD_RECOVERING) == 0) {
-                struct mds_export_data *med = &req->rq_export->exp_mds_data;
-                if (med->med_last_reply)
-                        OBD_FREE(med->med_last_reply, med->med_last_replen);
-                OBD_ALLOC(med->med_last_reply, req->rq_replen);
-                med->med_last_replen = req->rq_replen;
-                med->med_last_xid = req->rq_xid;
-                memcpy(med->med_last_reply, req->rq_repmsg, req->rq_replen);
-                /* XXX serialize */
-        }
-
-        if (!OBD_FAIL_CHECK(OBD_FAIL_MDS_ALL_REPLY_NET | OBD_FAIL_ONCE)) {
-                if (rc) {
-                        DEBUG_REQ(D_ERROR, req, "processing error (%d)", rc);
-                        ptlrpc_error(req->rq_svc, req);
-                } else {
-                        DEBUG_REQ(D_NET, req, "sending reply");
-                        ptlrpc_reply(req->rq_svc, req);
-                }
-        } else {
-                obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED;
-                DEBUG_REQ(D_ERROR, req, "dropping reply");
-                if (req->rq_repmsg)
-                        OBD_FREE(req->rq_repmsg, req->rq_replen);
-        }
-
+        mds_send_reply(req, rc);
         return 0;
 }
 
@@ -1459,7 +1370,7 @@ int mds_handle(struct ptlrpc_request *req)
  * then the server last_rcvd value may be less than that of the clients.
  * This will alert us that we may need to do client recovery.
  *
- * Also assumes for mds_last_rcvd that we are not modifying it (no locking).
+ * Also assumes that we are not modifying mds_last_transno (no locking).
  */
 int mds_update_server_data(struct mds_obd *mds)
 {
@@ -1470,12 +1381,12 @@ int mds_update_server_data(struct mds_obd *mds)
         int rc;
 
         push_ctxt(&saved, &mds->mds_ctxt, NULL);
-        msd->msd_last_rcvd = cpu_to_le64(mds->mds_last_rcvd);
+        msd->msd_last_transno = cpu_to_le64(mds->mds_last_transno);
         msd->msd_mount_count = cpu_to_le64(mds->mds_mount_count);
 
-        CDEBUG(D_SUPER, "MDS mount_count is %Lu, last_rcvd is %Lu\n",
+        CDEBUG(D_SUPER, "MDS mount_count is %Lu, last_transno is %Lu\n",
                (unsigned long long)mds->mds_mount_count,
-               (unsigned long long)mds->mds_last_rcvd);
+               (unsigned long long)mds->mds_last_transno);
         rc = lustre_fwrite(filp, (char *)msd, sizeof(*msd), &off);
         if (rc != sizeof(*msd)) {
                 CERROR("error writing MDS server data: rc = %d\n", rc);
@@ -1486,7 +1397,7 @@ int mds_update_server_data(struct mds_obd *mds)
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         rc = fsync_dev(filp->f_dentry->d_inode->i_rdev);
 #else
-        rc = file_fsync(filp,  filp->f_dentry, 1);
+        rc = file_fsync(filp, filp->f_dentry, 1);
 #endif
         if (rc)
                 CERROR("error flushing MDS server data: rc = %d\n", rc);
@@ -1527,7 +1438,7 @@ static int mds_setup(struct obd_device *obddev, obd_count len, void *buf)
         if (!mds->mds_sb)
                 GOTO(err_put, rc = -ENODEV);
 
-        init_MUTEX(&mds->mds_transno_sem);
+        spin_lock_init(&mds->mds_transno_lock);
         mds->mds_max_mdsize = sizeof(struct lov_mds_md);
         rc = mds_fs_setup(obddev, mnt);
         if (rc) {
@@ -1535,9 +1446,6 @@ static int mds_setup(struct obd_device *obddev, obd_count len, void *buf)
                 GOTO(err_put, rc);
         }
 
-        if (obddev->obd_flags & OBD_RECOVERING)
-                start_recovery_timer(mds);
-
         obddev->obd_namespace =
                 ldlm_namespace_new("mds_server", LDLM_NAMESPACE_SERVER);
         if (obddev->obd_namespace == NULL) {
@@ -1548,12 +1456,7 @@ static int mds_setup(struct obd_device *obddev, obd_count len, void *buf)
         ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
                            "mds_ldlm_client", &obddev->obd_ldlm_client);
 
-        spin_lock_init(&mds->mds_processing_task_lock);
-        mds->mds_processing_task = 0;
         mds->mds_has_lov_desc = 0;
-        INIT_LIST_HEAD(&mds->mds_recovery_queue);
-        INIT_LIST_HEAD(&mds->mds_delayed_reply_queue);
-        init_waitqueue_head(&mds->mds_next_transno_waitq);
 
         RETURN(0);
 
@@ -1566,7 +1469,7 @@ err_put:
         lock_kernel();
 err_ops:
         fsfilt_put_ops(obddev->obd_fsops);
-        RETURN(rc);
+        return rc;
 }
 
 static int mds_cleanup(struct obd_device *obddev)
@@ -1597,6 +1500,23 @@ static int mds_cleanup(struct obd_device *obddev)
         RETURN(0);
 }
 
+inline void fixup_handle_for_resent_req(struct ptlrpc_request *req,
+                                        struct lustre_handle *lockh)
+{
+        struct mds_export_data *med = &req->rq_export->exp_mds_data;
+        struct mds_client_data *mcd = med->med_mcd;
+        struct ptlrpc_request *oldrep = med->med_outstanding_reply;
+        struct ldlm_reply *dlm_rep;
+
+        if ((lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) &&
+            (mcd->mcd_last_xid == req->rq_xid) && (oldrep != NULL)) {
+                DEBUG_REQ(D_HA, req, "restoring lock handle from %p", oldrep);
+                dlm_rep = lustre_msg_buf(oldrep->rq_repmsg, 0);
+                lockh->addr = dlm_rep->lock_handle.addr;
+                lockh->cookie = dlm_rep->lock_handle.cookie;
+        }
+}
+
 static int ldlm_intent_policy(struct ldlm_namespace *ns,
                               struct ldlm_lock **lockp, void *req_cookie,
                               ldlm_mode_t mode, int flags, void *data)
@@ -1636,6 +1556,8 @@ static int ldlm_intent_policy(struct ldlm_namespace *ns,
                 rep = lustre_msg_buf(req->rq_repmsg, 0);
                 rep->lock_policy_res1 = IT_INTENT_EXEC;
 
+                fixup_handle_for_resent_req(req, &lockh);
+
                 /* execute policy */
                 switch ((long)it->opc) {
                 case IT_OPEN:
@@ -1671,7 +1593,6 @@ static int ldlm_intent_policy(struct ldlm_namespace *ns,
                 case IT_GETATTR:
                 case IT_LOOKUP:
                 case IT_READDIR:
-                case IT_SETATTR:
                         rc = mds_getattr_name(offset, req, &lockh);
                         /* FIXME: we need to sit down and decide on who should
                          * set req->rq_status, who should return negative and
@@ -1691,7 +1612,7 @@ static int ldlm_intent_policy(struct ldlm_namespace *ns,
                 }
 
                 if (flags & LDLM_FL_INTENT_ONLY) {
-                        LDLM_DEBUG0(lock, "INTENT_ONLY, aborting lock");
+                        LDLM_DEBUG(lock, "INTENT_ONLY, aborting lock");
                         RETURN(ELDLM_LOCK_ABORTED);
                 }
 
@@ -1701,9 +1622,18 @@ static int ldlm_intent_policy(struct ldlm_namespace *ns,
                  * to get. */
                 new_lock = ldlm_handle2lock(&lockh);
                 LASSERT(new_lock != NULL);
-                mds_body = lustre_msg_buf(req->rq_repmsg, 1);
                 *lockp = new_lock;
 
+                rep->lock_policy_res2 = req->rq_status;
+
+                if (new_lock->l_export == req->rq_export) {
+                        /* Already gave this to the client, which means that we
+                         * reconstructed a reply. */
+                        LASSERT(lustre_msg_get_flags(req->rq_reqmsg) & 
+                                MSG_RESENT);
+                        RETURN(ELDLM_LOCK_REPLACED);
+                }
+
                 /* Fixup the lock to be given to the client */
                 l_lock(&new_lock->l_resource->lr_namespace->ns_lock);
                 LASSERT(new_lock->l_readers + new_lock->l_writers == 1);
@@ -1728,8 +1658,6 @@ static int ldlm_intent_policy(struct ldlm_namespace *ns,
                 LDLM_LOCK_PUT(new_lock);
                 l_unlock(&new_lock->l_resource->lr_namespace->ns_lock);
 
-                rep->lock_policy_res2 = req->rq_status;
-
                 RETURN(ELDLM_LOCK_REPLACED);
         } else {
                 int size = sizeof(struct ldlm_reply);
@@ -1772,14 +1700,13 @@ int mdt_detach(struct obd_device *dev)
 static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf)
 {
         struct mds_obd *mds = &obddev->u.mds;
-        struct obd_uuid uuid = { "self" };
         int i, rc = 0;
         ENTRY;
 
         mds->mds_service = ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS,
                                            MDS_BUFSIZE, MDS_MAXREQSIZE,
                                            MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL,
-                                           &uuid, mds_handle, "mds");
+                                           mds_handle, "mds");
         if (!mds->mds_service) {
                 CERROR("failed to start service\n");
                 RETURN(rc = -ENOMEM);
@@ -1795,12 +1722,12 @@ static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf)
                 }
         }
 
-        mds->mds_getattr_service =
+        mds->mds_setattr_service =
                 ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS,
                                 MDS_BUFSIZE, MDS_MAXREQSIZE,
-                                MDS_GETATTR_PORTAL, MDC_REPLY_PORTAL,
-                                &uuid, mds_handle, "mds");
-        if (!mds->mds_getattr_service) {
+                                MDS_SETATTR_PORTAL, MDC_REPLY_PORTAL,
+                                mds_handle, "mds");
+        if (!mds->mds_setattr_service) {
                 CERROR("failed to start getattr service\n");
                 GOTO(err_thread, rc = -ENOMEM);
         }
@@ -1808,20 +1735,45 @@ static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf)
         for (i = 0; i < MDT_NUM_THREADS; i++) {
                 char name[32];
                 sprintf(name, "ll_mdt_attr_%02d", i);
-                rc = ptlrpc_start_thread(obddev, mds->mds_getattr_service,
+                rc = ptlrpc_start_thread(obddev, mds->mds_setattr_service,
                                          name);
                 if (rc) {
-                        CERROR("cannot start MDT getattr thread #%d: rc %d\n",
+                        CERROR("cannot start MDT setattr thread #%d: rc %d\n",
                                i, rc);
                         GOTO(err_thread2, rc);
                 }
         }
 
+        mds->mds_readpage_service =
+                ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS,
+                                MDS_BUFSIZE, MDS_MAXREQSIZE,
+                                MDS_READPAGE_PORTAL, MDC_REPLY_PORTAL,
+                                mds_handle, "mds");
+        if (!mds->mds_readpage_service) {
+                CERROR("failed to start readpage service\n");
+                GOTO(err_thread2, rc = -ENOMEM);
+        }
+
+        for (i = 0; i < MDT_NUM_THREADS; i++) {
+                char name[32];
+                sprintf(name, "ll_mdt_rdpg_%02d", i);
+                rc = ptlrpc_start_thread(obddev, mds->mds_readpage_service,
+                                         name);
+                if (rc) {
+                        CERROR("cannot start MDT readpage thread #%d: rc %d\n",
+                               i, rc);
+                        GOTO(err_thread3, rc);
+                }
+        }
+
         RETURN(0);
 
+err_thread3:
+        ptlrpc_stop_all_threads(mds->mds_readpage_service);
+        ptlrpc_unregister_service(mds->mds_readpage_service);
 err_thread2:
-        ptlrpc_stop_all_threads(mds->mds_getattr_service);
-        ptlrpc_unregister_service(mds->mds_getattr_service);
+        ptlrpc_stop_all_threads(mds->mds_setattr_service);
+        ptlrpc_unregister_service(mds->mds_setattr_service);
 err_thread:
         ptlrpc_stop_all_threads(mds->mds_service);
         ptlrpc_unregister_service(mds->mds_service);
@@ -1834,8 +1786,11 @@ static int mdt_cleanup(struct obd_device *obddev)
         struct mds_obd *mds = &obddev->u.mds;
         ENTRY;
 
-        ptlrpc_stop_all_threads(mds->mds_getattr_service);
-        ptlrpc_unregister_service(mds->mds_getattr_service);
+        ptlrpc_stop_all_threads(mds->mds_readpage_service);
+        ptlrpc_unregister_service(mds->mds_readpage_service);
+
+        ptlrpc_stop_all_threads(mds->mds_setattr_service);
+        ptlrpc_unregister_service(mds->mds_setattr_service);
 
         ptlrpc_stop_all_threads(mds->mds_service);
         ptlrpc_unregister_service(mds->mds_service);
index eab0cf7..e4522fb 100644 (file)
  */
 #define DEBUG_SUBSYSTEM S_CLASS
 
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <asm/statfs.h>
+#endif
 #include <linux/lustre_lite.h>
 #include <linux/lustre_fsfilt.h>
 #include <linux/lprocfs_status.h>
@@ -37,7 +41,10 @@ static inline
 int lprocfs_mds_statfs(void *data, struct statfs *sfs)
 {
         struct obd_device* dev = (struct obd_device*) data;
-        struct mds_obd *mds = &dev->u.mds;
+        struct mds_obd *mds;
+
+        LASSERT(dev != NULL);
+        mds = &dev->u.mds;
         return vfs_statfs(mds->mds_sb, sfs);
 }
 
@@ -53,6 +60,9 @@ int rd_fstype(char *page, char **start, off_t off, int count, int *eof,
 {
         struct obd_device *obd = (struct obd_device *)data;
 
+        LASSERT(obd != NULL);
+        LASSERT(obd->obd_fsops != NULL);
+        LASSERT(obd->obd_fsops->fs_type != NULL);
         return snprintf(page, count, "%s\n", obd->obd_fsops->fs_type);
 }
 
index bf04553..39e8592 100644 (file)
 
 #include <linux/module.h>
 #include <linux/kmod.h>
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <linux/mount.h>
+#endif
 #include <linux/lustre_mds.h>
 #include <linux/obd_class.h>
 #include <linux/obd_support.h>
@@ -142,10 +146,6 @@ int mds_client_free(struct obd_export *exp)
                        med->med_mcd->mcd_uuid, med->med_off);
         }
 
-        if (med->med_last_reply) {
-                OBD_FREE(med->med_last_reply, med->med_last_replen);
-                med->med_last_reply = NULL;
-        }
         OBD_FREE(med->med_mcd, sizeof(*med->med_mcd));
 
         return 0;
@@ -167,7 +167,7 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f)
         loff_t off = 0;
         int cl_off;
         unsigned long last_rcvd_size = f->f_dentry->d_inode->i_size;
-        __u64 last_rcvd = 0;
+        __u64 last_transno = 0;
         __u64 last_mount;
         int rc = 0;
 
@@ -193,13 +193,13 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f)
                last_rcvd_size, (last_rcvd_size - MDS_LR_CLIENT)/MDS_LR_SIZE);
 
         /*
-         * When we do a clean MDS shutdown, we save the last_rcvd into
-         * the header.  If we find clients with higher last_rcvd values
-         * then those clients may need recovery done.
+         * When we do a clean MDS shutdown, we save the last_transno into
+         * the header.
          */
-        last_rcvd = le64_to_cpu(msd->msd_last_rcvd);
-        mds->mds_last_rcvd = last_rcvd;
-        CDEBUG(D_INODE, "got "LPU64" for server last_rcvd value\n", last_rcvd);
+        last_transno = le64_to_cpu(msd->msd_last_transno);
+        mds->mds_last_transno = last_transno;
+        CDEBUG(D_INODE, "got "LPU64" for server last_rcvd value\n",
+               last_transno);
 
         last_mount = le64_to_cpu(msd->msd_mount_count);
         mds->mds_mount_count = last_mount;
@@ -230,7 +230,7 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f)
                         continue;
                 }
 
-                last_rcvd = le64_to_cpu(mcd->mcd_last_rcvd);
+                last_transno = le64_to_cpu(mcd->mcd_last_transno);
 
                 /* These exports are cleaned up by mds_disconnect(), so they
                  * need to be set up like real exports as mds_connect() does.
@@ -255,7 +255,7 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f)
                         spin_lock_init(&med->med_open_lock);
 
                         mcd = NULL;
-                        mds->mds_recoverable_clients++;
+                        obddev->obd_recoverable_clients++;
                 } else {
                         CDEBUG(D_INFO,
                                "discarded client %d, UUID '%s', count %Ld\n",
@@ -264,17 +264,19 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f)
                 }
 
                 CDEBUG(D_OTHER, "client at offset %d has last_rcvd = %Lu\n",
-                       cl_off, (unsigned long long)last_rcvd);
+                       cl_off, (unsigned long long)last_transno);
 
-                if (last_rcvd > mds->mds_last_rcvd)
-                        mds->mds_last_rcvd = last_rcvd;
+                if (last_transno > mds->mds_last_transno)
+                        mds->mds_last_transno = last_transno;
         }
 
-        obddev->obd_last_committed = mds->mds_last_rcvd;
-        if (mds->mds_recoverable_clients) {
-                CERROR("RECOVERY: %d recoverable clients, last_rcvd "LPU64"\n",
-                       mds->mds_recoverable_clients, mds->mds_last_rcvd);
-                mds->mds_next_recovery_transno = obddev->obd_last_committed + 1;
+        obddev->obd_last_committed = mds->mds_last_transno;
+        if (obddev->obd_recoverable_clients) {
+                CERROR("RECOVERY: %d recoverable clients, last_transno "
+                       LPU64"\n",
+                       obddev->obd_recoverable_clients, mds->mds_last_transno);
+                obddev->obd_next_recovery_transno = obddev->obd_last_committed
+                        + 1;
                 obddev->obd_flags |= OBD_RECOVERING;
         }
 
index fef9a0d..796fcd2 100644 (file)
@@ -243,7 +243,9 @@ int mds_iocontrol(unsigned int cmd, struct lustre_handle *conn,
             case OBD_IOC_SET_READONLY:
                 CERROR("setting device %s read-only\n",
                        ll_bdevname(obd->u.mds.mds_sb->s_dev));
+#ifdef CONFIG_DEV_RDONLY
                 dev_set_rdonly(obd->u.mds.mds_sb->s_dev, 2);
+#endif
                 RETURN(0);
 
         default:
index 2f65384..50ca592 100644 (file)
@@ -1,10 +1,7 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  *
- *  lustre/mds/handler.c
- *  Lustre Metadata Server (mds) request handler
- *
- *  Copyright (c) 2001, 2002 Cluster File Systems, Inc.
+ *  Copyright (c) 2003 Cluster File Systems, Inc.
  *   Author: Peter Braam <braam@clusterfs.com>
  *   Author: Andreas Dilger <adilger@clusterfs.com>
  *   Author: Phil Schwan <phil@clusterfs.com>
 #include <linux/init.h>
 #include <linux/obd_class.h>
 #include <linux/random.h>
-#include <linux/locks.h>
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
 #include <linux/buffer_head.h>
 #include <linux/workqueue.h>
+#else
+#include <linux/locks.h>
 #endif
 #include <linux/obd_lov.h>
 #include <linux/lustre_mds.h>
@@ -47,9 +45,8 @@
 
 extern kmem_cache_t *mds_file_cache;
 extern inline struct mds_obd *mds_req2mds(struct ptlrpc_request *req);
-extern void mds_start_transno(struct mds_obd *mds);
-extern int mds_finish_transno(struct mds_obd *mds, void *handle,
-                              struct ptlrpc_request *req, int rc);
+int mds_finish_transno(struct mds_obd *mds, struct inode *i, void *handle,
+                       struct ptlrpc_request *req, int rc, __u32 op_data);
 extern int enqueue_ordered_locks(int lock_mode, struct obd_device *obd,
                                  struct ldlm_res_id *p1_res_id,
                                  struct ldlm_res_id *p2_res_id,
@@ -60,6 +57,142 @@ extern int enqueue_ordered_locks(int lock_mode, struct obd_device *obd,
                                  struct lustre_handle *c1_lockh,
                                  struct lustre_handle *c2_lockh);
 
+/* Rebuild the reply to a resent open (see MDS_CHECK_RESENT in mds_open) from
+ * the client's saved last_rcvd record.
+ *
+ * Restores the saved transno, result and disposition, calls
+ * mds_steal_ack_locks() if a reply is still outstanding, then looks the parent
+ * and child up again to repack the reply body and fill in the open handle.
+ * The outcome is carried in req->rq_status; child_lockh is currently unused. */
+void reconstruct_open(struct mds_update_record *rec, struct ptlrpc_request *req,
+                      struct lustre_handle *child_lockh)
+{
+        struct mds_export_data *med = &req->rq_export->exp_mds_data;
+        struct mds_client_data *mcd = med->med_mcd;
+        struct mds_obd *mds = mds_req2mds(req);
+        struct mds_file_data *mfd;
+        struct obd_device *obd = req->rq_export->exp_obd;
+        struct dentry *parent, *child;
+        struct ldlm_reply *rep = lustre_msg_buf(req->rq_repmsg, 0);
+        struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 1);
+        int disp, rc;
+        ENTRY;
+
+        /* Copy rc, transno and disp; steal locks.  mds_finish_transno() saves
+         * mcd_last_result and mcd_last_data with cpu_to_le32(), so convert them
+         * back to CPU order here.  NOTE(review): transno is still copied as
+         * stored -- confirm the byte order rq_transno is expected to carry. */
+        req->rq_transno = mcd->mcd_last_transno;
+        req->rq_status = le32_to_cpu(mcd->mcd_last_result);
+        disp = rep->lock_policy_res1 = le32_to_cpu(mcd->mcd_last_data);
+
+        if (med->med_outstanding_reply)
+                mds_steal_ack_locks(med, req);
+
+        /* We never care about these. */
+        disp &= ~(IT_OPEN_LOOKUP | IT_OPEN_POS | IT_OPEN_NEG);
+        if (!disp) {
+                EXIT;
+                return; /* error looking up parent or child */
+        }
+
+        parent = mds_fid2dentry(mds, rec->ur_fid1, NULL);
+        LASSERT(!IS_ERR(parent));
+
+        child = lookup_one_len(lustre_msg_buf(req->rq_reqmsg, 3),
+                               parent, req->rq_reqmsg->buflens[3] - 1);
+        LASSERT(!IS_ERR(child));
+
+        if (!child->d_inode)
+                GOTO(out_dput, 0); /* child not present to open */
+
+        /* At this point, we know we have a child, which means that we'll send
+         * it back _unless_ the open failed _and_ we didn't create the file. */
+        if (((disp & (IT_OPEN_OPEN | IT_OPEN_CREATE)) == IT_OPEN_OPEN) &&
+            req->rq_status)
+                GOTO(out_dput, 0);
+
+        if (!med->med_outstanding_reply) {
+                LBUG(); /* XXX need to get enqueue client lock */
+        }
+
+        /* get lock (write for O_CREAT, read otherwise) */
+
+        mds_pack_inode2fid(&body->fid1, child->d_inode);
+        mds_pack_inode2body(body, child->d_inode);
+        if (S_ISREG(child->d_inode->i_mode)) {
+                rc = mds_pack_md(obd, req->rq_repmsg, 2, body,
+                                 child->d_inode);
+                if (rc)
+                        LASSERT(rc == req->rq_status);
+        } else {
+                /* XXX need to check this case */
+        }
+
+        /* If we're opening a file without an EA, change to a write
+         * lock (unless we already have one). */
+
+        /* If we have -EEXIST as the status, and we were asked to create
+         * exclusively, we failed because the file already existed. */
+        if (req->rq_status == -EEXIST &&
+            ((rec->ur_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)))
+                GOTO(out_dput, 0);
+
+        /* If we didn't get as far as trying to open, then some locking thing
+         * probably went wrong, and we'll just bail here. */
+        if ((disp & IT_OPEN_OPEN) == 0)
+                GOTO(out_dput, 0);
+
+        /* If we failed, then we must have failed opening, so don't look for
+         * file descriptor or anything, just give the client the bad news. */
+        if (req->rq_status)
+                GOTO(out_dput, 0);
+
+        if (med->med_outstanding_reply) {
+                struct list_head *t;
+                mfd = NULL;
+                /* XXX can we just look in the old reply to find the handle in
+                 * XXX O(1) here? */
+                spin_lock(&med->med_open_lock);
+                list_for_each(t, &med->med_open_head) {
+                        mfd = list_entry(t, struct mds_file_data, mfd_list);
+                        if (mfd->mfd_xid == req->rq_xid)
+                                break;
+                        mfd = NULL;
+                }
+                spin_unlock(&med->med_open_lock);
+                /* if we're not recovering, it had better be found */
+                LASSERT(mfd);
+        } else {
+                struct file *file;
+                mfd = kmem_cache_alloc(mds_file_cache, GFP_KERNEL);
+                if (!mfd) {
+                        CERROR("mds: out of memory\n");
+                        GOTO(out_dput, req->rq_status = -ENOMEM);
+                }
+                mntget(mds->mds_vfsmnt);
+                file = dentry_open(child, mds->mds_vfsmnt,
+                                   rec->ur_flags & ~(O_DIRECT | O_TRUNC));
+                LASSERT(!IS_ERR(file)); /* XXX -ENOMEM? */
+                file->private_data = mfd;
+                mfd->mfd_file = file;
+                mfd->mfd_xid = req->rq_xid;
+                get_random_bytes(&mfd->mfd_servercookie,
+                                 sizeof(mfd->mfd_servercookie));
+                spin_lock(&med->med_open_lock);
+                list_add(&mfd->mfd_list, &med->med_open_head);
+                spin_unlock(&med->med_open_lock);
+        }
+
+        body->handle.addr = (__u64)(unsigned long)mfd;
+        body->handle.cookie = mfd->mfd_servercookie;
+
+ out_dput:
+        l_dput(child);
+        l_dput(parent);
+        EXIT;
+}
+
 int mds_open(struct mds_update_record *rec, int offset,
              struct ptlrpc_request *req, struct lustre_handle *child_lockh)
 {
@@ -68,33 +201,19 @@ int mds_open(struct mds_update_record *rec, int offset,
         struct ldlm_reply *rep = lustre_msg_buf(req->rq_repmsg, 0);
         struct file *file;
         struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 1);
-        struct dentry *dchild, *parent;
+        struct dentry *dchild = NULL, *parent;
         struct mds_export_data *med;
         struct mds_file_data *mfd = NULL;
         struct ldlm_res_id child_res_id = { .name = {0} };
         struct lustre_handle parent_lockh;
         int rc = 0, parent_mode, child_mode = LCK_PR, lock_flags, created = 0;
+        int cleanup_phase = 0;
+        void *handle = NULL;
         ENTRY;
 
-#warning replay of open needs to be redone
-        /* was this animal open already and the client lost the reply? */
-        /* XXX need some way to detect a reopen, to avoid locked list walks */
+        MDS_CHECK_RESENT(req, reconstruct_open(rec, req, child_lockh));
+
         med = &req->rq_export->exp_mds_data;
-#if 0
-        spin_lock(&med->med_open_lock);
-        list_for_each(tmp, &med->med_open_head) {
-                mfd = list_entry(tmp, typeof(*mfd), mfd_list);
-                if (!memcmp(&mfd->mfd_clienthandle, &body->handle,
-                            sizeof(mfd->mfd_clienthandle)) &&
-                    body->fid1.id == mfd->mfd_file->f_dentry->d_inode->i_ino) {
-                        dchild = mfd->mfd_file->f_dentry;
-                        spin_unlock(&med->med_open_lock);
-                        CERROR("Re opening "LPD64"\n", body->fid1.id);
-                        GOTO(out_pack, rc = 0);
-                }
-        }
-        spin_unlock(&med->med_open_lock);
-#endif
         rep->lock_policy_res1 |= IT_OPEN_LOOKUP;
         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OPEN_PACK)) {
                 CERROR("test case OBD_FAIL_MDS_OPEN_PACK\n");
@@ -109,16 +228,19 @@ int mds_open(struct mds_update_record *rec, int offset,
         if (IS_ERR(parent)) {
                 rc = PTR_ERR(parent);
                 CERROR("parent lookup error %d\n", rc);
-                LBUG();
-                RETURN(rc);
+                GOTO(cleanup, rc);
         }
         LASSERT(parent->d_inode);
 
+        cleanup_phase = 1; /* parent dentry and lock */
+
         /* Step 2: Lookup the child */
         dchild = lookup_one_len(lustre_msg_buf(req->rq_reqmsg, 3),
                                 parent, req->rq_reqmsg->buflens[3] - 1);
         if (IS_ERR(dchild))
-                GOTO(out_step_2, rc = PTR_ERR(dchild));
+                GOTO(cleanup, rc = PTR_ERR(dchild));
+
+        cleanup_phase = 2; /* child dentry */
 
         if (dchild->d_inode)
                 rep->lock_policy_res1 |= IT_OPEN_POS;
@@ -127,31 +249,24 @@ int mds_open(struct mds_update_record *rec, int offset,
 
         /* Step 3: If the child was negative, and we're supposed to,
          * create it. */
-        if ((rec->ur_flags & O_CREAT) && !dchild->d_inode) {
-                int err;
-                void *handle;
-                mds_start_transno(mds);
+        if (!dchild->d_inode) {
+                if (!(rec->ur_flags & O_CREAT)) {
+                        /* It's negative and we weren't supposed to create it */
+                        GOTO(cleanup, rc = -ENOENT);
+                }
+
                 rep->lock_policy_res1 |= IT_OPEN_CREATE;
                 handle = fsfilt_start(obd, parent->d_inode, FSFILT_OP_CREATE);
                 if (IS_ERR(handle)) {
                         rc = PTR_ERR(handle);
-                        mds_finish_transno(mds, handle, req, rc);
-                        GOTO(out_step_3, rc);
+                        handle = NULL;
+                        GOTO(cleanup, rc);
                 }
                 rc = vfs_create(parent->d_inode, dchild, rec->ur_mode);
-                rc = mds_finish_transno(mds, handle, req, rc);
-                err = fsfilt_commit(obd, parent->d_inode, handle);
-                if (rc || err) {
-                        CERROR("error on commit: err = %d\n", err);
-                        if (!rc)
-                                rc = err;
-                        GOTO(out_step_3, rc);
-                }
+                if (rc)
+                        GOTO(cleanup, rc);
                 created = 1;
                 child_mode = LCK_PW;
-        } else if (!dchild->d_inode) {
-                /* It's negative and we weren't supposed to create it */
-                GOTO(out_step_3, rc = -ENOENT);
         }
 
         /* Step 4: It's positive, so lock the child */
@@ -165,30 +280,36 @@ int mds_open(struct mds_update_record *rec, int offset,
                               mds_blocking_ast, NULL, NULL, child_lockh);
         if (rc != ELDLM_OK) {
                 CERROR("ldlm_cli_enqueue: %d\n", rc);
-                GOTO(out_step_3, rc = -EIO);
+                GOTO(cleanup, rc = -EIO);
         }
 
+        cleanup_phase = 3; /* child lock */
+
         mds_pack_inode2fid(&body->fid1, dchild->d_inode);
         mds_pack_inode2body(body, dchild->d_inode);
         if (S_ISREG(dchild->d_inode->i_mode)) {
                 rc = mds_pack_md(obd, req->rq_repmsg, 2, body, dchild->d_inode);
                 if (rc)
-                        GOTO(out_step_4, rc);
+                        GOTO(cleanup, rc);
         } else {
                 /* If this isn't a regular file, we can't open it. */
-                GOTO(out_step_3, rc = 0); /* returns the lock to the client */
+
+                /* We want to drop the child dentry, because we're not returning
+                 * failure (which would do this for us in step 2), and we're not
+                 * handing it off to the open file in dentry_open. */
+                l_dput(dchild);
+                GOTO(cleanup, rc = 0); /* returns the lock to the client */
         }
 
         if (!created && (rec->ur_flags & O_CREAT) && (rec->ur_flags & O_EXCL)) {
                 /* File already exists, we didn't just create it, and we
                  * were passed O_EXCL; err-or. */
-                GOTO(out_step_3, rc = -EEXIST); // returns a lock to the client
+                GOTO(cleanup, rc = -EEXIST); // returns a lock to the client
         }
 
         /* If we're opening a file without an EA, the client needs a write
          * lock. */
-        if (child_mode != LCK_PW && S_ISREG(dchild->d_inode->i_mode) &&
-            !(body->valid & OBD_MD_FLEASIZE)) {
+        if (child_mode != LCK_PW && !(body->valid & OBD_MD_FLEASIZE)) {
                 ldlm_lock_decref(child_lockh, child_mode);
                 child_mode = LCK_PW;
                 goto reacquire;
@@ -199,18 +320,23 @@ int mds_open(struct mds_update_record *rec, int offset,
         mfd = kmem_cache_alloc(mds_file_cache, GFP_KERNEL);
         if (!mfd) {
                 CERROR("mds: out of memory\n");
-                GOTO(out_step_4, req->rq_status = -ENOMEM);
+                GOTO(cleanup, rc = -ENOMEM);
         }
 
+        cleanup_phase = 4; /* mfd allocated */
+
         /* dentry_open does a dput(de) and mntput(mds->mds_vfsmnt) on error */
         mntget(mds->mds_vfsmnt);
-        file = dentry_open(dchild,mds->mds_vfsmnt,
+        file = dentry_open(dchild, mds->mds_vfsmnt,
                            rec->ur_flags & ~(O_DIRECT | O_TRUNC));
-        if (IS_ERR(file))
-                GOTO(out_step_5, rc = PTR_ERR(file));
+        if (IS_ERR(file)) {
+                dchild = NULL; /* prevent a double dput in step 2 */
+                GOTO(cleanup, rc = PTR_ERR(file));
+        }
 
         file->private_data = mfd;
         mfd->mfd_file = file;
+        mfd->mfd_xid = req->rq_xid;
         get_random_bytes(&mfd->mfd_servercookie, sizeof(mfd->mfd_servercookie));
         spin_lock(&med->med_open_lock);
         list_add(&mfd->mfd_list, &med->med_open_head);
@@ -220,19 +346,34 @@ int mds_open(struct mds_update_record *rec, int offset,
         body->handle.cookie = mfd->mfd_servercookie;
         CDEBUG(D_INODE, "file %p: mfd %p, cookie "LPX64"\n",
                mfd->mfd_file, mfd, mfd->mfd_servercookie);
-        GOTO(out_step_2, rc = 0); /* returns a lock to the client */
+        GOTO(cleanup, rc = 0); /* returns a lock to the client */
 
- out_step_5:
-        if (mfd != NULL) {
-                kmem_cache_free(mds_file_cache, mfd);
-                mfd = NULL;
+ cleanup:
+        rc = mds_finish_transno(mds, dchild ? dchild->d_inode : NULL, handle,
+                                req, rc, rep->lock_policy_res1);
+        switch (cleanup_phase) {
+        case 4:
+                if (rc)
+                        kmem_cache_free(mds_file_cache, mfd);
+        case 3:
+                /* This is the same logic as in the IT_OPEN part of 
+                 * ldlm_intent_policy: if we found the dentry, or we tried to
+                 * open it (meaning that we created, if it wasn't found), then
+                 * we return the lock to the caller and client. */
+                if (!(rep->lock_policy_res1 & (IT_OPEN_OPEN | IT_OPEN_POS)))
+                        ldlm_lock_decref(child_lockh, child_mode);
+        case 2:
+                if (rc) 
+                    l_dput(dchild);
+        case 1:
+                l_dput(parent);
+                if (rc) {
+                        ldlm_lock_decref(&parent_lockh, parent_mode);
+                } else {
+                        memcpy(&req->rq_ack_locks[0].lock, &parent_lockh,
+                               sizeof(parent_lockh));
+                        req->rq_ack_locks[0].mode = parent_mode;
+                }
         }
- out_step_4:
-        ldlm_lock_decref(child_lockh, child_mode);
- out_step_3:
-        l_dput(dchild);
- out_step_2:
-        l_dput(parent);
-        ldlm_lock_decref(&parent_lockh, parent_mode);
         RETURN(rc);
 }
index 608747f..583ba4a 100644 (file)
@@ -28,6 +28,7 @@
 #define EXPORT_SYMTAB
 #define DEBUG_SUBSYSTEM S_MDS
 
+#include <linux/fs.h>
 #include <linux/obd_support.h>
 #include <linux/obd_class.h>
 #include <linux/obd.h>
@@ -47,61 +48,206 @@ static void mds_last_rcvd_cb(struct obd_device *obd, __u64 last_rcvd, int error)
                 obd->obd_last_committed = last_rcvd;
 }
 
-void mds_start_transno(struct mds_obd *mds)
-{
-        ENTRY;
-        down(&mds->mds_transno_sem);
-}
-
 /* Assumes caller has already pushed us into the kernel context. */
-int mds_finish_transno(struct mds_obd *mds, void *handle,
-                       struct ptlrpc_request *req, int rc)
+int mds_finish_transno(struct mds_obd *mds, struct inode *i, void *handle,
+                       struct ptlrpc_request *req, int rc,
+                       __u32 op_data)
 {
         struct mds_export_data *med = &req->rq_export->exp_mds_data;
         struct mds_client_data *mcd = med->med_mcd;
-        __u64 last_rcvd;
+        struct obd_device *obd = req->rq_export->exp_obd;
+        int started_handle = 0, err;
+        __u64 transno;
         loff_t off;
         ssize_t written;
-
-        /* Propagate error code. */
-        if (rc)
-                GOTO(out, rc);
+        ENTRY;
 
         /* we don't allocate new transnos for replayed requests */
         if (req->rq_level == LUSTRE_CONN_RECOVD)
-                GOTO(out, rc = 0);
+                GOTO(out, rc = rc);
+
+        if (!handle) {
+                /* if we're starting our own xaction, use our own inode */
+                i = mds->mds_rcvd_filp->f_dentry->d_inode;
+                handle = fsfilt_start(obd, i, FSFILT_OP_SETATTR);
+                if (IS_ERR(handle)) {
+                        CERROR("fsfilt_start: %ld\n", PTR_ERR(handle));
+                        GOTO(out, rc = PTR_ERR(handle));
+                }
+                started_handle = 1;
+        }
 
         off = MDS_LR_CLIENT + med->med_off * MDS_LR_SIZE;
 
-        last_rcvd = ++mds->mds_last_rcvd;
-        req->rq_repmsg->transno = HTON__u64(last_rcvd);
-        mcd->mcd_last_rcvd = cpu_to_le64(last_rcvd);
+        spin_lock(&mds->mds_transno_lock);
+        transno = ++mds->mds_last_transno;
+        spin_unlock(&mds->mds_transno_lock);
+        req->rq_repmsg->transno = req->rq_transno = HTON__u64(transno);
+        mcd->mcd_last_transno = cpu_to_le64(transno);
         mcd->mcd_mount_count = cpu_to_le64(mds->mds_mount_count);
         mcd->mcd_last_xid = cpu_to_le64(req->rq_xid);
+        mcd->mcd_last_result = cpu_to_le32(rc);
+        mcd->mcd_last_data = cpu_to_le32(op_data);
 
-        fsfilt_set_last_rcvd(req->rq_export->exp_obd, last_rcvd, handle,
+        fsfilt_set_last_rcvd(req->rq_export->exp_obd, transno, handle,
                              mds_last_rcvd_cb);
         written = lustre_fwrite(mds->mds_rcvd_filp, (char *)mcd, sizeof(*mcd),
                                 &off);
-        CDEBUG(D_INODE, "wrote trans #"LPD64" for client %s at #%d: written = "
-               LPSZ"\n", last_rcvd, mcd->mcd_uuid, med->med_off, written);
-
-        if (written == sizeof(*mcd))
-                GOTO(out, rc = 0);
-        CERROR("error writing to last_rcvd file: rc = %d\n", rc);
-        if (written >= 0)
-                GOTO(out, rc = -EIO);
+        CDEBUG(D_INODE, "wrote trans "LPU64" client %s at #%u: written = "
+               LPSZ"\n", transno, mcd->mcd_uuid, med->med_off, written);
+
+        if (written != sizeof(*mcd)) {
+                CERROR("error writing to last_rcvd: rc = "LPSZ"\n", written);
+                if (rc == 0) {
+                        if (written < 0)
+                                rc = written;
+                        else
+                                rc = -EIO;
+                }
+        }
 
-        rc = 0;
+        err = fsfilt_commit(obd, i, handle);
+        if (err) {
+                CERROR("error committing transaction: %d\n", err);
+                if (!rc)
+                        rc = err;
+        }
 
         EXIT;
  out:
-        up(&mds->mds_transno_sem);
         return rc;
 }
 
-/* In the write-back case, the client holds a lock on a subtree (not supported).
- * In the intent case, the client holds a lock on the child inode. */
+/*
+ * Fix up the attributes of a raw setattr request in rec->ur_iattr.
+ *
+ * This gives the same functionality as the code between
+ *   sys_chmod and inode_setattr
+ *   chown_common and inode_setattr
+ *   utimes and inode_setattr
+ *
+ * Nothing is changed (and 0 is returned) unless ATTR_RAW is set in
+ * ia_valid, i.e. only when the client VFS has not already fixed up the
+ * attributes itself.
+ *
+ * Returns 0 on success, -EPERM if the inode is immutable or append-only,
+ * or the error returned by permission() when the times are being set to
+ * "now" by somebody other than the owner.
+ */
+int mds_fix_attr(struct inode *inode, struct mds_update_record *rec)
+{
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        time_t now = CURRENT_TIME;
+#else
+        time_t now = CURRENT_TIME.tv_sec;
+#endif
+        struct iattr *attr = &rec->ur_iattr;
+        unsigned int ia_valid = attr->ia_valid;
+        int error;
+        ENTRY;
+
+        /* only fix up attrs if the client VFS didn't already */
+        if (!(ia_valid & ATTR_RAW))
+                RETURN(0);
+
+        /* This check covers every branch below, chown included. */
+        if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+                RETURN(-EPERM);
+
+        /* ctime is always "now"; atime/mtime are "now" unless the request
+         * carries an explicit value (ATTR_ATIME_SET / ATTR_MTIME_SET). */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        attr->ia_ctime = now;
+        if (!(ia_valid & ATTR_ATIME_SET))
+                attr->ia_atime = now;
+        if (!(ia_valid & ATTR_MTIME_SET))
+                attr->ia_mtime = now;
+#else
+        attr->ia_ctime.tv_sec = now;
+        if (!(ia_valid & ATTR_ATIME_SET))
+                attr->ia_atime.tv_sec = now;
+        if (!(ia_valid & ATTR_MTIME_SET))
+                attr->ia_mtime.tv_sec = now;
+#endif
+
+        /* times */
+        if ((ia_valid & (ATTR_MTIME|ATTR_ATIME))==(ATTR_MTIME|ATTR_ATIME) &&
+             !(ia_valid & ATTR_ATIME_SET)) {
+                /* Setting both times to "now": anyone other than the owner
+                 * needs write permission on the inode. */
+                if (rec->ur_fsuid != inode->i_uid &&
+                    (error = permission(inode,MAY_WRITE)) != 0)
+                        RETURN(error);
+        } else if (ia_valid & ATTR_UID) {
+                /* chown: an id of -1 means "leave this id unchanged" */
+                if (attr->ia_uid == (uid_t) -1)
+                        attr->ia_uid = inode->i_uid;
+                if (attr->ia_gid == (gid_t) -1)
+                        attr->ia_gid = inode->i_gid;
+                attr->ia_mode = inode->i_mode;
+                attr->ia_valid =  ATTR_UID | ATTR_GID | ATTR_CTIME;
+                /*
+                 * If the user or group of a non-directory has been
+                 * changed by a non-root user, remove the setuid bit.
+                 * 19981026 David C Niemi <niemi@tux.org>
+                 *
+                 * Changed this to apply to all users, including root,
+                 * to avoid some races. This is the behavior we had in
+                 * 2.0. The check for non-root was definitely wrong
+                 * for 2.2 anyway, as it should have been using
+                 * CAP_FSETID rather than fsuid -- 19990830 SD.
+                 */
+                if ((inode->i_mode & S_ISUID) == S_ISUID &&
+                    !S_ISDIR(inode->i_mode)) {
+                        attr->ia_mode &= ~S_ISUID;
+                        attr->ia_valid |= ATTR_MODE;
+                }
+                /*
+                 * Likewise, if the user or group of a non-directory
+                 * has been changed by a non-root user, remove the
+                 * setgid bit UNLESS there is no group execute bit
+                 * (this would be a file marked for mandatory
+                 * locking).  19981026 David C Niemi <niemi@tux.org>
+                 *
+                 * Removed the fsuid check (see the comment above) --
+                 * 19990830 SD.
+                 */
+                if (((inode->i_mode & (S_ISGID | S_IXGRP)) ==
+                     (S_ISGID | S_IXGRP)) && !S_ISDIR(inode->i_mode)) {
+                        attr->ia_mode &= ~S_ISGID;
+                        attr->ia_valid |= ATTR_MODE;
+                }
+        } else if (ia_valid & ATTR_MODE) {
+                int mode = attr->ia_mode;
+                /* chmod: a mode of -1 means "keep the inode's current
+                 * mode".  The substitution has to land in the local copy,
+                 * because that copy is what gets masked into ia_mode. */
+                if (attr->ia_mode == (mode_t) -1)
+                        mode = inode->i_mode;
+                attr->ia_mode =
+                        (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
+        }
+        RETURN(0);
+}
+
+/*
+ * Rebuild the reply for a resent setattr request.
+ *
+ * The transno and status saved in the export's mds_client_data are copied
+ * into the request, ack locks are taken over if a reply is still
+ * outstanding, and the reply body is refilled from the inode named by
+ * rec->ur_fid1.
+ */
+static void reconstruct_reint_setattr(struct mds_update_record *rec,
+                                      int offset, struct ptlrpc_request *req)
+{
+        struct mds_export_data *exp_data = &req->rq_export->exp_mds_data;
+        struct mds_client_data *client = exp_data->med_mcd;
+        struct mds_obd *mds = &req->rq_export->exp_obd->u.mds;
+        struct mds_body *reply;
+        struct dentry *dentry;
+
+        req->rq_transno = client->mcd_last_transno;
+        req->rq_status = client->mcd_last_result;
+
+        if (exp_data->med_outstanding_reply)
+                mds_steal_ack_locks(exp_data, req);
+
+        dentry = mds_fid2dentry(mds, rec->ur_fid1, NULL);
+        if (!IS_ERR(dentry)) {
+                reply = lustre_msg_buf(req->rq_repmsg, 0);
+                mds_pack_inode2fid(&reply->fid1, dentry->d_inode);
+                mds_pack_inode2body(reply, dentry->d_inode);
+
+                l_dput(dentry);
+        } else {
+                /* The lookup failure must match the saved status. */
+                LASSERT(PTR_ERR(dentry) == req->rq_status);
+        }
+}
+
+/* In the raw-setattr case, we lock the child inode.
+ * In the write-back case or if being called from open, the client holds a lock
+ * already.
+ *
+ * We use the ATTR_FROM_OPEN flag to tell these cases apart. */
 static int mds_reint_setattr(struct mds_update_record *rec, int offset,
                              struct ptlrpc_request *req,
                              struct lustre_handle *lh)
@@ -110,29 +256,46 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset,
         struct obd_device *obd = req->rq_export->exp_obd;
         struct mds_body *body;
         struct dentry *de;
-        struct inode *inode;
-        void *handle;
-        int rc = 0, err;
+        struct inode *inode = NULL;
+        struct lustre_handle lockh;
+        void *handle = NULL;
+        int rc = 0, cleanup_phase = 0, err, locked = 0;
+        ENTRY;
 
-        de = mds_fid2dentry(mds, rec->ur_fid1, NULL);
-        if (IS_ERR(de))
-                GOTO(out_setattr, rc = PTR_ERR(de));
-        inode = de->d_inode;
+        MDS_CHECK_RESENT(req, reconstruct_reint_setattr(rec, offset, req));
+
+        if (rec->ur_iattr.ia_valid & ATTR_FROM_OPEN) {
+                de = mds_fid2dentry(mds, rec->ur_fid1, NULL);
+                if (IS_ERR(de))
+                        GOTO(cleanup, rc = PTR_ERR(de));
+        } else {
+                de = mds_fid2locked_dentry(obd, rec->ur_fid1, NULL, LCK_PW,
+                                           &lockh);
+                if (IS_ERR(de))
+                        GOTO(cleanup, rc = PTR_ERR(de));
+                locked = 1;
+        }
 
+        cleanup_phase = 1;
+        inode = de->d_inode;
         LASSERT(inode);
+
         CDEBUG(D_INODE, "ino %lu\n", inode->i_ino);
 
         OBD_FAIL_WRITE(OBD_FAIL_MDS_REINT_SETATTR_WRITE,
                        to_kdev_t(inode->i_sb->s_dev));
 
-        mds_start_transno(mds);
         handle = fsfilt_start(obd, inode, FSFILT_OP_SETATTR);
         if (IS_ERR(handle)) {
                 rc = PTR_ERR(handle);
-                (void)mds_finish_transno(mds, handle, req, rc);
-                GOTO(out_setattr_de, rc);
+                handle = NULL;
+                GOTO(cleanup, rc);
         }
 
+        rc = mds_fix_attr(inode, rec);
+        if (rc)
+                GOTO(cleanup, rc);
+
         rc = fsfilt_setattr(obd, de, handle, &rec->ur_iattr);
         if (rc == 0 && S_ISREG(inode->i_mode) &&
             req->rq_reqmsg->bufcount > 1) {
@@ -145,22 +308,62 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset,
         mds_pack_inode2fid(&body->fid1, inode);
         mds_pack_inode2body(body, inode);
 
-        rc = mds_finish_transno(mds, handle, req, rc);
-        err = fsfilt_commit(obd, de->d_inode, handle);
-        if (err) {
-                CERROR("error on commit: err = %d\n", err);
-                if (!rc)
-                        rc = err;
+        EXIT;
+ cleanup:
+        err = mds_finish_transno(mds, inode, handle, req, rc, 0);
+        switch(cleanup_phase) {
+        case 1:
+                l_dput(de);
+                if (locked) {
+                        if (rc) {
+                                ldlm_lock_decref(&lockh, LCK_PW);
+                        } else {
+                                memcpy(&req->rq_ack_locks[0].lock, &lockh,
+                                       sizeof(lockh));
+                                req->rq_ack_locks[0].mode = LCK_PW;
+                        }
+                }
+        case 0:
+                break;
+        default:
+                LBUG();
         }
+        if (err && !rc)
+                rc = err;
 
-        EXIT;
-out_setattr_de:
-        l_dput(de);
-out_setattr:
         req->rq_status = rc;
         return 0;
 }
 
+/*
+ * Rebuild the reply for a resent create request.
+ *
+ * The saved transno and status are copied into the request and ack locks
+ * are taken over if a reply is still outstanding.  When the saved status
+ * is zero, the child is looked up again by name under the parent named by
+ * rec->ur_fid1 and the reply body is refilled from its inode.
+ */
+static void reconstruct_reint_create(struct mds_update_record *rec, int offset,
+                                     struct ptlrpc_request *req)
+{
+        struct mds_export_data *exp_data = &req->rq_export->exp_mds_data;
+        struct mds_client_data *client = exp_data->med_mcd;
+        struct mds_obd *mds = &req->rq_export->exp_obd->u.mds;
+        struct dentry *dir, *entry;
+        struct mds_body *reply;
+
+        req->rq_transno = client->mcd_last_transno;
+        req->rq_status = client->mcd_last_result;
+
+        if (exp_data->med_outstanding_reply)
+                mds_steal_ack_locks(exp_data, req);
+
+        /* Nothing more to fill in when the saved status is an error. */
+        if (req->rq_status != 0)
+                return;
+
+        dir = mds_fid2dentry(mds, rec->ur_fid1, NULL);
+        LASSERT(!IS_ERR(dir));
+        entry = lookup_one_len(rec->ur_name, dir, rec->ur_namelen - 1);
+        LASSERT(!IS_ERR(entry));
+
+        reply = lustre_msg_buf(req->rq_repmsg, offset);
+        mds_pack_inode2fid(&reply->fid1, entry->d_inode);
+        mds_pack_inode2body(reply, entry->d_inode);
+
+        l_dput(dir);
+        l_dput(entry);
+}
+
 static int mds_reint_create(struct mds_update_record *rec, int offset,
                             struct ptlrpc_request *req,
                             struct lustre_handle *lh)
@@ -169,25 +372,28 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
         struct mds_obd *mds = mds_req2mds(req);
         struct obd_device *obd = req->rq_export->exp_obd;
         struct dentry *dchild = NULL;
-        struct inode *dir;
-        void *handle;
+        struct inode *dir = NULL;
+        void *handle = NULL;
         struct lustre_handle lockh;
-        int rc = 0, err, type = rec->ur_mode & S_IFMT;
+        int rc = 0, err, type = rec->ur_mode & S_IFMT, cleanup_phase = 0;
+        int created = 0;
         ENTRY;
 
         LASSERT(offset == 0);
         LASSERT(!strcmp(req->rq_export->exp_obd->obd_type->typ_name, "mds"));
 
+        MDS_CHECK_RESENT(req, reconstruct_reint_create(rec, offset, req));
+
         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_CREATE))
-                GOTO(out_create, rc = -ESTALE);
+                GOTO(cleanup, rc = -ESTALE);
 
         de = mds_fid2locked_dentry(obd, rec->ur_fid1, NULL, LCK_PW, &lockh);
         if (IS_ERR(de)) {
                 rc = PTR_ERR(de);
                 CERROR("parent lookup error %d\n", rc);
-                LBUG();
-                GOTO(out_create, rc);
+                GOTO(cleanup, rc);
         }
+        cleanup_phase = 1; /* locked parent dentry */
         dir = de->d_inode;
         LASSERT(dir);
         CDEBUG(D_INODE, "parent ino %lu creating name %s mode %o\n",
@@ -199,9 +405,11 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
         if (IS_ERR(dchild)) {
                 rc = PTR_ERR(dchild);
                 CERROR("child lookup error %d\n", rc);
-                GOTO(out_create_de, rc);
+                GOTO(cleanup, rc);
         }
 
+        cleanup_phase = 2; /* child dentry */
+
         OBD_FAIL_WRITE(OBD_FAIL_MDS_REINT_CREATE_WRITE,
                        to_kdev_t(dir->i_sb->s_dev));
 
@@ -216,18 +424,11 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
         else
                 LASSERT(!(rec->ur_opcode & REINT_REPLAYING));
 
-        /* From here on, we must exit via a path that calls mds_finish_transno,
-         * so that we release the mds_transno_sem (and, in the case of success,
-         * update the transno correctly).  out_create_commit and
-         * out_transno_dchild are good candidates.
-         */
-        mds_start_transno(mds);
-
         switch (type) {
         case S_IFREG:{
                 handle = fsfilt_start(obd, dir, FSFILT_OP_CREATE);
                 if (IS_ERR(handle))
-                        GOTO(out_transno_dchild, rc = PTR_ERR(handle));
+                        GOTO(cleanup, rc = PTR_ERR(handle));
                 rc = vfs_create(dir, dchild, rec->ur_mode);
                 EXIT;
                 break;
@@ -235,7 +436,7 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
         case S_IFDIR:{
                 handle = fsfilt_start(obd, dir, FSFILT_OP_MKDIR);
                 if (IS_ERR(handle))
-                        GOTO(out_transno_dchild, rc = PTR_ERR(handle));
+                        GOTO(cleanup, rc = PTR_ERR(handle));
                 rc = vfs_mkdir(dir, dchild, rec->ur_mode);
                 EXIT;
                 break;
@@ -243,7 +444,7 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
         case S_IFLNK:{
                 handle = fsfilt_start(obd, dir, FSFILT_OP_SYMLINK);
                 if (IS_ERR(handle))
-                        GOTO(out_transno_dchild, rc = PTR_ERR(handle));
+                        GOTO(cleanup, rc = PTR_ERR(handle));
                 rc = vfs_symlink(dir, dchild, rec->ur_tgt);
                 EXIT;
                 break;
@@ -255,33 +456,39 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
                 int rdev = rec->ur_rdev;
                 handle = fsfilt_start(obd, dir, FSFILT_OP_MKNOD);
                 if (IS_ERR(handle))
-                        GOTO(out_transno_dchild, rc = PTR_ERR(handle));
+                        GOTO(cleanup, (handle = NULL, rc = PTR_ERR(handle)));
                 rc = vfs_mknod(dir, dchild, rec->ur_mode, rdev);
                 EXIT;
                 break;
         }
         default:
                 CERROR("bad file type %o creating %s\n", type, rec->ur_name);
-                handle = NULL; /* quell uninitialized warning */
-                GOTO(out_transno_dchild, rc = -EINVAL);
+                GOTO(cleanup, rc = -EINVAL);
         }
 
         /* In case we stored the desired inum in here, we want to clean up.
-         * We also do this in the out_transno_dchild block, for the error cases.
+         * We also do this in the cleanup block, for the error cases.
          */
         dchild->d_fsdata = NULL;
 
         if (rc) {
                 CDEBUG(D_INODE, "error during create: %d\n", rc);
-                GOTO(out_create_commit, rc);
+                GOTO(cleanup, rc);
         } else {
                 struct iattr iattr;
                 struct inode *inode = dchild->d_inode;
                 struct mds_body *body;
 
+                created = 1;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
                 iattr.ia_atime = rec->ur_time;
                 iattr.ia_ctime = rec->ur_time;
                 iattr.ia_mtime = rec->ur_time;
+#else
+                iattr.ia_atime.tv_sec = rec->ur_time;
+                iattr.ia_ctime.tv_sec = rec->ur_time;
+                iattr.ia_mtime.tv_sec = rec->ur_time;
+#endif
                 iattr.ia_uid = rec->ur_uid;
                 iattr.ia_gid = rec->ur_gid;
                 iattr.ia_valid = ATTR_UID | ATTR_GID | ATTR_ATIME |
@@ -309,55 +516,52 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
                 mds_pack_inode2body(body, inode);
         }
         EXIT;
-out_create_commit:
-        if (rc) {
-                rc = mds_finish_transno(mds, handle, req, rc);
+
+cleanup:
+        err = mds_finish_transno(mds, dir, handle, req, rc, 0);
+                
+        if (rc && created) {
+                /* Destroy the file we just created.  This should not need
+                 * extra journal credits, as we have already modified all of
+                 * the blocks needed in order to create the file in the first
+                 * place.
+                 */
+                switch (type) {
+                case S_IFDIR:
+                        err = vfs_rmdir(dir, dchild);
+                        if (err)
+                                CERROR("rmdir in error path: %d\n", err);
+                        break;
+                default:
+                        err = vfs_unlink(dir, dchild);
+                        if (err)
+                                CERROR("unlink in error path: %d\n", err);
+                        break;
+                }
         } else {
-                rc = mds_finish_transno(mds, handle, req, rc);
-                if (rc)
-                        GOTO(out_create_unlink, rc);
+                rc = err;
         }
-        err = fsfilt_commit(obd, dir, handle);
-        if (err) {
-                CERROR("error on commit: err = %d\n", err);
-                if (!rc)
-                        rc = err;
-        }
-out_create_dchild:
-        l_dput(dchild);
-out_create_de:
-        ldlm_lock_decref(&lockh, LCK_PW);
-        l_dput(de);
-out_create:
-        req->rq_status = rc;
-        return 0;
-
-out_transno_dchild:
-        dchild->d_fsdata = NULL;
-        /* Need to release the transno lock, and then put the dchild. */
-        LASSERT(rc);
-        mds_finish_transno(mds, handle, req, rc);
-        goto out_create_dchild;
-
-out_create_unlink:
-        /* Destroy the file we just created.  This should not need extra
-         * journal credits, as we have already modified all of the blocks
-         * needed in order to create the file in the first place.
-         */
-        switch (type) {
-        case S_IFDIR:
-                err = vfs_rmdir(dir, dchild);
-                if (err)
-                        CERROR("failed rmdir in error path: rc = %d\n", err);
+        switch (cleanup_phase) {
+        case 2: /* child dentry */
+                dchild->d_fsdata = NULL;
+                l_dput(dchild);
+        case 1: /* locked parent dentry */
+                if (rc) {
+                        ldlm_lock_decref(&lockh, LCK_PW);
+                } else {
+                        memcpy(&req->rq_ack_locks[0].lock, &lockh,
+                               sizeof(lockh));
+                        req->rq_ack_locks[0].mode = LCK_PW;
+                }
+                l_dput(de);
+        case 0:
                 break;
         default:
-                err = vfs_unlink(dir, dchild);
-                if (err)
-                        CERROR("failed unlink in error path: rc = %d\n", err);
-                break;
+                CERROR("invalid cleanup_phase %d\n", cleanup_phase);
+                LBUG();
         }
-
-        goto out_create_commit;
+        req->rq_status = rc;
+        return 0;
 }
 
 /* This function doesn't use ldlm_match_or_enqueue because we're always called
@@ -424,6 +628,23 @@ int enqueue_ordered_locks(int lock_mode, struct obd_device *obd,
         RETURN(0);
 }
 
+/*
+ * Rebuild the reply for a resent unlink request.
+ *
+ * Only the saved transno and status are restored (plus ack locks, if a
+ * reply is still outstanding); the reply body is not refilled, which the
+ * error message below reports.  child_lockh is not used here.
+ */
+static void reconstruct_reint_unlink(struct mds_update_record *rec, int offset,
+                                    struct ptlrpc_request *req,
+                                    struct lustre_handle *child_lockh)
+{
+        struct mds_export_data *exp_data = &req->rq_export->exp_mds_data;
+        struct mds_client_data *client = exp_data->med_mcd;
+
+        req->rq_transno = client->mcd_last_transno;
+        req->rq_status = client->mcd_last_result;
+
+        if (exp_data->med_outstanding_reply)
+                mds_steal_ack_locks(exp_data, req);
+
+        DEBUG_REQ(D_ERROR, req,
+                  "can't get EA for reconstructed unlink, leaking OST inodes");
+}
+
 static int mds_reint_unlink(struct mds_update_record *rec, int offset,
                             struct ptlrpc_request *req,
                             struct lustre_handle *child_lockh)
@@ -433,31 +654,41 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
         struct mds_obd *mds = mds_req2mds(req);
         struct obd_device *obd = req->rq_export->exp_obd;
         struct mds_body *body = NULL;
-        struct inode *dir_inode, *child_inode;
-        struct lustre_handle *handle, parent_lockh;
+        struct inode *dir_inode = NULL, *child_inode;
+        struct lustre_handle parent_lockh;
+        void *handle = NULL;
         struct ldlm_res_id child_res_id = { .name = {0} };
         char *name;
-        int namelen, err, rc = 0, flags = 0, return_lock = 0;
+        int namelen, rc = 0, flags = 0, return_lock = 0;
+        int cleanup_phase = 0;
         ENTRY;
 
+        MDS_CHECK_RESENT(req, reconstruct_reint_unlink(rec, offset, req, 
+                                                       child_lockh));
+
         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_UNLINK))
-                GOTO(out, rc = -ENOENT);
+                GOTO(cleanup, rc = -ENOENT);
 
         /* Step 1: Lookup the parent by FID */
         dir_de = mds_fid2locked_dentry(obd, rec->ur_fid1, NULL, LCK_PW,
                                        &parent_lockh);
         if (IS_ERR(dir_de))
-                GOTO(out, rc = PTR_ERR(dir_de));
+                GOTO(cleanup, rc = PTR_ERR(dir_de));
         dir_inode = dir_de->d_inode;
         LASSERT(dir_inode);
 
+        cleanup_phase = 1; /* Have parent dentry lock */
+
         /* Step 2: Lookup the child */
         name = lustre_msg_buf(req->rq_reqmsg, offset + 1);
         namelen = req->rq_reqmsg->buflens[offset + 1] - 1;
 
         dchild = lookup_one_len(name, dir_de, namelen);
         if (IS_ERR(dchild))
-                GOTO(out_step_2a, rc = PTR_ERR(dchild));
+                GOTO(cleanup, rc = PTR_ERR(dchild));
+        
+        cleanup_phase = 2; /* child dentry */
+
         child_inode = dchild->d_inode;
         if (child_inode == NULL) {
                 if (rec->ur_opcode & REINT_REPLAYING) {
@@ -471,13 +702,13 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
                                dir_inode->i_ino, rec->ur_name);
                         rc = -ENOENT;
                 }
-                GOTO(out_step_2b, rc);
+                GOTO(cleanup, rc);
         }
 
         DEBUG_REQ(D_INODE, req, "parent ino %lu, child ino %lu",
                   dir_inode->i_ino, child_inode->i_ino);
 
-        /* Step 3: Get lock a lock on the child */
+        /* Step 3: Get a lock on the child */
         child_res_id.name[0] = child_inode->i_ino;
         child_res_id.name[1] = child_inode->i_generation;
 
@@ -486,7 +717,9 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
                               &flags, ldlm_completion_ast, mds_blocking_ast,
                               NULL, NULL, child_lockh);
         if (rc != ELDLM_OK)
-                GOTO(out_step_2b, rc);
+                GOTO(cleanup, rc);
+
+        cleanup_phase = 3; /* child lock */
 
         OBD_FAIL_WRITE(OBD_FAIL_MDS_REINT_UNLINK_WRITE,
                        to_kdev_t(dir_inode->i_sb->s_dev));
@@ -499,12 +732,11 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
 
         /* Step 4: Do the unlink: client decides between rmdir/unlink!
          * (bug 72) */
-        mds_start_transno(mds);
         switch (rec->ur_mode & S_IFMT) {
         case S_IFDIR:
                 handle = fsfilt_start(obd, dir_inode, FSFILT_OP_RMDIR);
                 if (IS_ERR(handle))
-                        GOTO(out_cancel_transno, rc = PTR_ERR(handle));
+                        GOTO(cleanup, rc = PTR_ERR(handle));
                 rc = vfs_rmdir(dir_inode, dchild);
                 break;
         case S_IFREG:
@@ -527,71 +759,93 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
         case S_IFSOCK:
                 handle = fsfilt_start(obd, dir_inode, FSFILT_OP_UNLINK);
                 if (IS_ERR(handle))
-                        GOTO(out_cancel_transno, rc = PTR_ERR(handle));
+                        GOTO(cleanup, rc = PTR_ERR(handle));
                 rc = vfs_unlink(dir_inode, dchild);
                 break;
         default:
                 CERROR("bad file type %o unlinking %s\n", rec->ur_mode, name);
-                handle = NULL;
                 LBUG();
-                GOTO(out_cancel_transno, rc = -EINVAL);
+                GOTO(cleanup, rc = -EINVAL);
         }
 
-        rc = mds_finish_transno(mds, handle, req, rc);
-        err = fsfilt_commit(obd, dir_inode, handle);
-        if (rc != 0 || err != 0) {
+ cleanup:
+        rc = mds_finish_transno(mds, dir_inode, handle, req, rc, 0);
+        if (rc && body) {
                 /* Don't unlink the OST objects if the MDS unlink failed */
                 body->valid = 0;
         }
-        if (err) {
-                CERROR("error on commit: err = %d\n", err);
-                if (!rc)
-                        rc = err;
+        switch(cleanup_phase) {
+            case 3: /* child lock */
+                if (rc != 0 || return_lock == 0)
+                        ldlm_lock_decref(child_lockh, LCK_EX);
+            case 2: /* child dentry */
+                l_dput(dchild);
+            case 1: /* parent dentry and lock */
+                if (rc) {
+                        ldlm_lock_decref(&parent_lockh, LCK_EX);
+                } else {
+                        memcpy(&req->rq_ack_locks[0].lock, &parent_lockh,
+                               sizeof(parent_lockh));
+                        req->rq_ack_locks[0].mode = LCK_EX;
+                }
+                l_dput(dir_de);
+            case 0:
+                break;
+            default:
+                CERROR("invalid cleanup_phase %d\n", cleanup_phase);
+                LBUG();
         }
-
-        GOTO(out_step_4, rc);
- out_step_4:
-        if (rc != 0 || return_lock == 0)
-                ldlm_lock_decref(child_lockh, LCK_EX);
- out_step_2b:
-        l_dput(dchild);
- out_step_2a:
-        ldlm_lock_decref(&parent_lockh, LCK_EX);
-        l_dput(dir_de);
- out:
         req->rq_status = rc;
         return 0;
+}
+
+/*
+ * Rebuild the reply for a resent link request.
+ *
+ * The saved transno and status are copied into the request.  Only the
+ * case where a reply is still outstanding is handled (its ack locks are
+ * taken over); anything else hits LBUG().
+ */
+static void reconstruct_reint_link(struct mds_update_record *rec, int offset,
+                                   struct ptlrpc_request *req)
+{
+        struct mds_export_data *exp_data = &req->rq_export->exp_mds_data;
+        struct mds_client_data *client = exp_data->med_mcd;
 
- out_cancel_transno:
-        rc = mds_finish_transno(mds, handle, req, rc);
-        goto out_step_4;
+        req->rq_transno = client->mcd_last_transno;
+        req->rq_status = client->mcd_last_result;
+
+        if (!exp_data->med_outstanding_reply)
+                LBUG(); /* don't support it yet, but it'll be fun! */
+        else
+                mds_steal_ack_locks(exp_data, req);
 }
 
 static int mds_reint_link(struct mds_update_record *rec, int offset,
-                          struct ptlrpc_request *req, struct lustre_handle *lh)
+                          struct ptlrpc_request *req,
+                          struct lustre_handle *lh)
 {
         struct obd_device *obd = req->rq_export->exp_obd;
         struct dentry *de_src = NULL;
         struct dentry *de_tgt_dir = NULL;
         struct dentry *dchild = NULL;
         struct mds_obd *mds = mds_req2mds(req);
-        struct lustre_handle *handle, tgt_dir_lockh, src_lockh;
+        struct lustre_handle *handle = NULL, tgt_dir_lockh, src_lockh;
         struct ldlm_res_id src_res_id = { .name = {0} };
         struct ldlm_res_id tgt_dir_res_id = { .name = {0} };
-        int lock_mode, rc = 0, err;
+        int lock_mode = 0, rc = 0, cleanup_phase = 0;
         ENTRY;
 
+        MDS_CHECK_RESENT(req, reconstruct_reint_link(rec, offset, req));
+
         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_LINK))
-                GOTO(out, rc = -ENOENT);
+                GOTO(cleanup, rc = -ENOENT);
 
         /* Step 1: Lookup the source inode and target directory by FID */
         de_src = mds_fid2dentry(mds, rec->ur_fid1, NULL);
         if (IS_ERR(de_src))
-                GOTO(out, rc = PTR_ERR(de_src));
+                GOTO(cleanup, rc = PTR_ERR(de_src));
+
+        cleanup_phase = 1; /* source dentry */
 
         de_tgt_dir = mds_fid2dentry(mds, rec->ur_fid2, NULL);
         if (IS_ERR(de_tgt_dir))
-                GOTO(out_de_src, rc = PTR_ERR(de_tgt_dir));
+                GOTO(cleanup, rc = PTR_ERR(de_tgt_dir));
+
+        cleanup_phase = 2; /* target directory dentry */
 
         CDEBUG(D_INODE, "linking %*s/%s to inode %lu\n",
                de_tgt_dir->d_name.len, de_tgt_dir->d_name.name, rec->ur_name,
@@ -607,15 +861,19 @@ static int mds_reint_link(struct mds_update_record *rec, int offset,
         rc = enqueue_ordered_locks(LCK_EX, obd, &src_res_id, &tgt_dir_res_id,
                                    &src_lockh, &tgt_dir_lockh);
         if (rc != ELDLM_OK)
-                GOTO(out_tgt_dir, rc = -EIO);
+                GOTO(cleanup, rc = -EIO);
+
+        cleanup_phase = 3; /* locks */
 
         /* Step 3: Lookup the child */
         dchild = lookup_one_len(rec->ur_name, de_tgt_dir, rec->ur_namelen - 1);
         if (IS_ERR(dchild)) {
                 CERROR("child lookup error %ld\n", PTR_ERR(dchild));
-                GOTO(out_drop_locks, rc = PTR_ERR(dchild));
+                GOTO(cleanup, rc = PTR_ERR(dchild));
         }
 
+        cleanup_phase = 4; /* child dentry */
+
         if (dchild->d_inode) {
                 if (rec->ur_opcode & REINT_REPLAYING) {
                         /* XXX verify that the link is to the the right file? */
@@ -628,49 +886,72 @@ static int mds_reint_link(struct mds_update_record *rec, int offset,
                                de_tgt_dir->d_inode->i_ino, rec->ur_name);
                         rc = -EEXIST;
                 }
-                GOTO(out_drop_child, rc);
+                GOTO(cleanup, rc);
         }
 
         /* Step 4: Do it. */
         OBD_FAIL_WRITE(OBD_FAIL_MDS_REINT_LINK_WRITE,
                        to_kdev_t(de_src->d_inode->i_sb->s_dev));
 
-        mds_start_transno(mds);
         handle = fsfilt_start(obd, de_tgt_dir->d_inode, FSFILT_OP_LINK);
         if (IS_ERR(handle)) {
                 rc = PTR_ERR(handle);
-                mds_finish_transno(mds, handle, req, rc);
-                GOTO(out_drop_child, rc);
+                GOTO(cleanup, rc);
         }
 
         rc = vfs_link(de_src, de_tgt_dir->d_inode, dchild);
         if (rc)
                 CERROR("link error %d\n", rc);
-        rc = mds_finish_transno(mds, handle, req, rc);
-
-        err = fsfilt_commit(obd, de_tgt_dir->d_inode, handle);
-        if (err) {
-                CERROR("error on commit: err = %d\n", err);
-                if (!rc)
-                        rc = err;
-        }
-
+cleanup:
+        rc = mds_finish_transno(mds, de_tgt_dir ? de_tgt_dir->d_inode : NULL,
+                                handle, req, rc, 0);
         EXIT;
 
-out_drop_child:
-        l_dput(dchild);
-out_drop_locks:
-        ldlm_lock_decref(&src_lockh, lock_mode);
-        ldlm_lock_decref(&tgt_dir_lockh, lock_mode);
-out_tgt_dir:
-        l_dput(de_tgt_dir);
-out_de_src:
-        l_dput(de_src);
-out:
+        switch (cleanup_phase) {
+        case 4: /* child dentry */
+                l_dput(dchild);
+        case 3: /* locks */
+                if (rc) {
+                        ldlm_lock_decref(&src_lockh, lock_mode);
+                        ldlm_lock_decref(&tgt_dir_lockh, lock_mode);
+                } else {
+                        memcpy(&req->rq_ack_locks[0].lock, &src_lockh,
+                               sizeof(src_lockh));
+                        memcpy(&req->rq_ack_locks[1].lock, &tgt_dir_lockh,
+                               sizeof(tgt_dir_lockh));
+                        req->rq_ack_locks[0].mode = lock_mode;
+                        req->rq_ack_locks[1].mode = lock_mode;
+                }
+        case 2: /* target dentry */
+                l_dput(de_tgt_dir);
+        case 1: /* source dentry */
+                l_dput(de_src);
+        case 0:
+                break;
+        default:
+                CERROR("invalid cleanup_phase %d\n", cleanup_phase);
+                LBUG();
+        }
         req->rq_status = rc;
         return 0;
 }
 
+/*
+ * Rebuild the reply for a resent rename request.
+ *
+ * The saved transno and status are copied into the request.  Only the
+ * case where a reply is still outstanding is handled (its ack locks are
+ * taken over); anything else hits LBUG().
+ */
+static void reconstruct_reint_rename(struct mds_update_record *rec,
+                                     int offset, struct ptlrpc_request *req)
+{
+        struct mds_export_data *exp_data = &req->rq_export->exp_mds_data;
+        struct mds_client_data *client = exp_data->med_mcd;
+
+        req->rq_transno = client->mcd_last_transno;
+        req->rq_status = client->mcd_last_result;
+
+        if (!exp_data->med_outstanding_reply)
+                LBUG(); /* don't support it yet, but it'll be fun! */
+        else
+                mds_steal_ack_locks(exp_data, req);
+}
+
 static int mds_reint_rename(struct mds_update_record *rec, int offset,
                             struct ptlrpc_request *req,
                             struct lustre_handle *lockh)
@@ -686,16 +967,24 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset,
         struct ldlm_res_id p2_res_id = { .name = {0} };
         struct ldlm_res_id c1_res_id = { .name = {0} };
         struct ldlm_res_id c2_res_id = { .name = {0} };
-        int rc = 0, err, lock_count = 3, flags = LDLM_FL_LOCAL_ONLY;
-        void *handle;
+        int rc = 0, lock_count = 3, flags = LDLM_FL_LOCAL_ONLY;
+        int cleanup_phase = 0;
+        void *handle = NULL;
         ENTRY;
 
+        MDS_CHECK_RESENT(req, reconstruct_reint_rename(rec, offset, req));
+
         de_srcdir = mds_fid2dentry(mds, rec->ur_fid1, NULL);
         if (IS_ERR(de_srcdir))
-                GOTO(out, rc = PTR_ERR(de_srcdir));
+                GOTO(cleanup, rc = PTR_ERR(de_srcdir));
+        
+        cleanup_phase = 1; /* source directory dentry */
+
         de_tgtdir = mds_fid2dentry(mds, rec->ur_fid2, NULL);
         if (IS_ERR(de_tgtdir))
-                GOTO(out_put_srcdir, rc = PTR_ERR(de_tgtdir));
+                GOTO(cleanup, rc = PTR_ERR(de_tgtdir));
+
+        cleanup_phase = 2; /* target directory dentry */
 
         /* The idea here is that we need to get four locks in the end:
          * one on each parent directory, one on each child.  We need to take
@@ -720,26 +1009,43 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset,
         rc = enqueue_ordered_locks(LCK_EX, obd, &p1_res_id, &p2_res_id,
                                    &(dlm_handles[0]), &(dlm_handles[1]));
         if (rc != ELDLM_OK)
-                GOTO(out_put_tgtdir, rc);
+                GOTO(cleanup, rc);
+
+        cleanup_phase = 3; /* parent locks */
 
         /* Step 2: Lookup the children */
         de_old = lookup_one_len(rec->ur_name, de_srcdir, rec->ur_namelen - 1);
         if (IS_ERR(de_old)) {
                 CERROR("old child lookup error (%*s): %ld\n",
                        rec->ur_namelen - 1, rec->ur_name, PTR_ERR(de_old));
-                GOTO(out_step_2a, rc = PTR_ERR(de_old));
+                GOTO(cleanup, rc = PTR_ERR(de_old));
         }
 
+        cleanup_phase = 4; /* original name dentry */
+
         if (de_old->d_inode == NULL)
-                GOTO(out_step_2b, rc = -ENOENT);
+                GOTO(cleanup, rc = -ENOENT);
+
+        /* sanity check for src inode */
+        if (de_old->d_inode->i_ino == de_srcdir->d_inode->i_ino ||
+            de_old->d_inode->i_ino == de_tgtdir->d_inode->i_ino)
+                GOTO(cleanup, rc = -EINVAL);
 
         de_new = lookup_one_len(rec->ur_tgt, de_tgtdir, rec->ur_tgtlen - 1);
         if (IS_ERR(de_new)) {
                 CERROR("new child lookup error (%*s): %ld\n",
                        rec->ur_tgtlen - 1, rec->ur_tgt, PTR_ERR(de_new));
-                GOTO(out_step_2b, rc = PTR_ERR(de_new));
+                GOTO(cleanup, rc = PTR_ERR(de_new));
         }
 
+        cleanup_phase = 5; /* target dentry */
+
+        /* sanity check for dest inode */
+        if (de_new->d_inode &&
+            (de_new->d_inode->i_ino == de_srcdir->d_inode->i_ino ||
+            de_new->d_inode->i_ino == de_tgtdir->d_inode->i_ino))
+                GOTO(cleanup, rc = -EINVAL);
+
         /* Step 3: Take locks on the children */
         c1_res_id.name[0] = de_old->d_inode->i_ino;
         c1_res_id.name[1] = de_old->d_inode->i_generation;
@@ -760,51 +1066,69 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset,
                 lock_count = 4;
         }
         if (rc != ELDLM_OK)
-                GOTO(out_step_3, rc);
+                GOTO(cleanup, rc);
+
+        cleanup_phase = 6; /* child locks */
 
         /* Step 4: Execute the rename */
         OBD_FAIL_WRITE(OBD_FAIL_MDS_REINT_RENAME_WRITE,
                        to_kdev_t(de_srcdir->d_inode->i_sb->s_dev));
 
-        mds_start_transno(mds);
         handle = fsfilt_start(obd, de_tgtdir->d_inode, FSFILT_OP_RENAME);
-        if (IS_ERR(handle)) {
-                rc = PTR_ERR(handle);
-                mds_finish_transno(mds, handle, req, rc);
-                GOTO(out_step_4, rc);
-        }
+        if (IS_ERR(handle))
+                GOTO(cleanup, rc = PTR_ERR(handle));
 
         lock_kernel();
         rc = vfs_rename(de_srcdir->d_inode, de_old, de_tgtdir->d_inode, de_new,
                         NULL);
         unlock_kernel();
 
-        rc = mds_finish_transno(mds, handle, req, rc);
-
-        err = fsfilt_commit(obd, de_tgtdir->d_inode, handle);
-        if (err) {
-                CERROR("error on commit: err = %d\n", err);
-                if (!rc)
-                        rc = err;
-        }
-
         EXIT;
- out_step_4:
-        ldlm_lock_decref(&(dlm_handles[2]), LCK_EX);
-        if (lock_count == 4)
-                ldlm_lock_decref(&(dlm_handles[3]), LCK_EX);
- out_step_3:
-        l_dput(de_new);
- out_step_2b:
-        l_dput(de_old);
- out_step_2a:
-        ldlm_lock_decref(&(dlm_handles[0]), LCK_EX);
-        ldlm_lock_decref(&(dlm_handles[1]), LCK_EX);
- out_put_tgtdir:
-        l_dput(de_tgtdir);
- out_put_srcdir:
-        l_dput(de_srcdir);
- out:
+cleanup:
+        rc = mds_finish_transno(mds, de_tgtdir ? de_tgtdir->d_inode : NULL,
+                                handle, req, rc, 0);
+        switch (cleanup_phase) {
+        case 6: /* child locks */
+                if (rc) {
+                        ldlm_lock_decref(&(dlm_handles[2]), LCK_EX);
+                        if (lock_count == 4)
+                                ldlm_lock_decref(&(dlm_handles[3]), LCK_EX);
+                } else {
+                        memcpy(&req->rq_ack_locks[2].lock, &(dlm_handles[2]),
+                               sizeof(dlm_handles[2]));
+                        req->rq_ack_locks[2].mode = LCK_EX;
+                        if (lock_count == 4) {
+                                memcpy(&req->rq_ack_locks[3].lock,
+                                       &dlm_handles[3], sizeof(dlm_handles[3]));
+                                req->rq_ack_locks[3].mode = LCK_EX;
+                        }
+                }
+        case 5: /* target dentry */
+                l_dput(de_new);
+        case 4: /* source dentry */
+                l_dput(de_old);
+        case 3: /* parent locks */
+                if (rc) {
+                        ldlm_lock_decref(&(dlm_handles[0]), LCK_EX);
+                        ldlm_lock_decref(&(dlm_handles[1]), LCK_EX);
+                } else {
+                        memcpy(&req->rq_ack_locks[0].lock, &(dlm_handles[0]),
+                               sizeof(dlm_handles[0]));
+                        memcpy(&req->rq_ack_locks[1].lock, &(dlm_handles[1]),
+                               sizeof(dlm_handles[1]));
+                        req->rq_ack_locks[0].mode = LCK_EX;
+                        req->rq_ack_locks[1].mode = LCK_EX;
+                }
+        case 2: /* target directory dentry */
+                l_dput(de_tgtdir);
+        case 1: /* source directory dentry */
+                l_dput(de_srcdir);
+        case 0:
+                break;
+        default:
+                CERROR("invalid cleanup_phase %d\n", cleanup_phase);
+                LBUG();
+        }
         req->rq_status = rc;
         return 0;
 }
@@ -840,7 +1164,8 @@ int mds_reint_rec(struct mds_update_record *rec, int offset,
         uc.ouc_fsuid = rec->ur_fsuid;
         uc.ouc_fsgid = rec->ur_fsgid;
         uc.ouc_cap = rec->ur_cap;
-        uc.ouc_suppgid = rec->ur_suppgid;
+        uc.ouc_suppgid1 = rec->ur_suppgid1;
+        uc.ouc_suppgid2 = rec->ur_suppgid2;
 
         push_ctxt(&saved, &mds->mds_ctxt, &uc);
         rc = reinters[realop] (rec, offset, req, lockh);
index 03f1e59..fb04cc1 100644 (file)
@@ -1,3 +1,4 @@
+
 # FIXME: we need to make it clear that obdclass.o depends on
 # lustre_build_version, or 'make -j2' breaks!
 DEFS=
@@ -9,17 +10,29 @@ else
 FSMOD = fsfilt_extN
 endif
 
+if LIBLUSTRE
+lib_LIBRARIES = liblustreclass.a
+liblustreclass_a_SOURCES = uuid.c statfs_pack.c genops.c debug.c class_obd.c lustre_handles.c lustre_peer.c lprocfs_status.c
+
+class_obd.o: lustre_version
+
+lustre_version:
+       echo '#define LUSTRE_VERSION 12' > $(top_builddir)/include/linux/lustre_build_version.h
+       echo '#define BUILD_VERSION "1"' >> $(top_builddir)/include/linux/lustre_build_version.h
+
+else
 modulefs_DATA = lustre_build_version obdclass.o $(FSMOD).o fsfilt_reiserfs.o
 EXTRA_PROGRAMS = obdclass $(FSMOD) fsfilt_reiserfs
 
-obdclass_SOURCES = debug.c genops.c class_obd.c sysctl.c uuid.c lprocfs_status.c
+obdclass_SOURCES = class_obd.c debug.c genops.c sysctl.c uuid.c lprocfs_status.c lustre_handles.c lustre_peer.c
 obdclass_SOURCES += fsfilt.c statfs_pack.c
+endif
 
 include $(top_srcdir)/Rules
 
 # XXX I'm sure there's some automake mv-if-different helper for this.
 lustre_build_version:
-       perl $(top_srcdir)/scripts/version_tag.pl $(top_srcdir) $(top_builddir)> tmpver
+       perl $(top_srcdir)/scripts/version_tag.pl $(top_srcdir) $(top_builddir) > tmpver
        cmp -z $(top_builddir)/include/linux/lustre_build_version.h tmpver \
                2> /dev/null &&                                            \
                $(RM) tmpver ||                                            \
index 5347251..6200acd 100644 (file)
@@ -24,7 +24,9 @@
  * infrastructure for managing object devices
  */
 
+#define DEBUG_SUBSYSTEM S_CLASS
 #define EXPORT_SYMTAB
+#ifdef __KERNEL__
 #include <linux/config.h> /* for CONFIG_PROC_FS */
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <asm/poll.h>
 #include <asm/uaccess.h>
 #include <linux/miscdevice.h>
+#else
 
-#define DEBUG_SUBSYSTEM S_CLASS
+# include <liblustre.h>
+
+#endif
 
 #include <linux/obd_support.h>
 #include <linux/obd_class.h>
@@ -193,13 +198,12 @@ static void forcibly_detach_exports(struct obd_device *obd)
         }
 }
 
-/* to control /dev/obd */
-static int obd_class_ioctl (struct inode * inode, struct file * filp,
-                            unsigned int cmd, unsigned long arg)
+
+int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
+                       unsigned long arg)
 {
         char *buf = NULL;
         struct obd_ioctl_data *data;
-        struct obd_class_user_state *ocus = filp->private_data;
         struct obd_device *obd = ocus->ocus_current_obd;
         struct lustre_handle conn;
         int err = 0, len = 0, serialised = 0;
@@ -220,7 +224,9 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp,
 
         if (!obd && cmd != OBD_IOC_DEVICE && cmd != TCGETS &&
             cmd != OBD_IOC_LIST && cmd != OBD_GET_VERSION &&
-            cmd != OBD_IOC_NAME2DEV && cmd != OBD_IOC_NEWDEV) {
+            cmd != OBD_IOC_NAME2DEV && cmd != OBD_IOC_NEWDEV &&
+            cmd != OBD_IOC_ADD_UUID && cmd != OBD_IOC_DEL_UUID  &&
+            cmd != OBD_IOC_CLOSE_UUID) {
                 CERROR("OBD ioctl: No device\n");
                 GOTO(out, err = -EINVAL);
         }
@@ -270,7 +276,8 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp,
                                 status = "-";
                         l = snprintf(buf2, remains, "%2d %s %s %s %s %d\n",
                                      i, status, obd->obd_type->typ_name,
-                                     obd->obd_name, obd->obd_uuid.uuid, obd->obd_type->typ_refcnt);
+                                     obd->obd_name, obd->obd_uuid.uuid,
+                                     obd->obd_type->typ_refcnt);
                         buf2 +=l;
                         remains -=l;
                         if (remains <= 0) {
@@ -397,7 +404,7 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp,
 
         case OBD_IOC_ATTACH: {
                 struct obd_type *type;
-                int minor;
+                int minor, len;
 
                 /* have we attached a type to this device */
                 if (obd->obd_flags & OBD_ATTACHED || obd->obd_type) {
@@ -414,7 +421,10 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp,
                         CERROR("Type not nul terminated!\n");
                         GOTO(out, err = -EINVAL);
                 }
-
+                if (!data->ioc_inllen2 || !data->ioc_inlbuf2) {
+                        CERROR("No name passed!\n");
+                        GOTO(out, err = -EINVAL);
+                }
                 CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n",
                        MKSTR(data->ioc_inlbuf1),
                        MKSTR(data->ioc_inlbuf2), MKSTR(data->ioc_inlbuf3));
@@ -434,18 +444,22 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp,
                 INIT_LIST_HEAD(&obd->obd_imports);
                 spin_lock_init(&obd->obd_dev_lock);
 
-                if (data->ioc_inlbuf2) {
-                        int len = strlen(data->ioc_inlbuf2) + 1;
-                        OBD_ALLOC(obd->obd_name, len);
-                        if (!obd->obd_name) {
-                                class_put_type(obd->obd_type);
-                                obd->obd_type = NULL;
-                                GOTO(out, err = -ENOMEM);
-                        }
-                        memcpy(obd->obd_name, data->ioc_inlbuf2, len);
-                } else {
-                        CERROR("WARNING: unnamed obd device\n");
+                /* XXX belongs in setup, not attach */
+                /* recovery data */
+                spin_lock_init(&obd->obd_processing_task_lock);
+                init_waitqueue_head(&obd->obd_next_transno_waitq);
+                INIT_LIST_HEAD(&obd->obd_recovery_queue);
+                INIT_LIST_HEAD(&obd->obd_delayed_reply_queue);
+
+                len = strlen(data->ioc_inlbuf2) + 1;
+                OBD_ALLOC(obd->obd_name, len);
+                if (!obd->obd_name) {
+                        class_put_type(obd->obd_type);
+                        obd->obd_type = NULL;
+                        GOTO(out, err = -ENOMEM);
                 }
+                memcpy(obd->obd_name, data->ioc_inlbuf2, len);
+
                 if (data->ioc_inlbuf3) {
                         int len = strlen(data->ioc_inlbuf3);
                         if (len >= sizeof(obd->obd_uuid)) {
@@ -465,7 +479,8 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp,
                         err = OBP(obd,attach)(obd, sizeof(*data), data);
                 if (err) {
                         if(data->ioc_inlbuf2)
-                                OBD_FREE(obd->obd_name, strlen(obd->obd_name)+1);
+                                OBD_FREE(obd->obd_name,
+                                         strlen(obd->obd_name) + 1);
                         class_put_type(obd->obd_type);
                         obd->obd_type = NULL;
                 } else {
@@ -597,10 +612,34 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp,
                 GOTO(out, err = 0);
         }
 
-        default:
-                obd_data2conn(&conn, data);
+        case OBD_IOC_CLOSE_UUID: {
+                struct lustre_peer peer;
+                CDEBUG(D_IOCTL, "closing all connections to uuid %s\n",
+                       data->ioc_inlbuf1);
+                lustre_uuid_to_peer(data->ioc_inlbuf1, &peer);
+                GOTO(out, err = 0);
+        }
+        case OBD_IOC_ADD_UUID: {
+                CDEBUG(D_IOCTL, "adding mapping from uuid %s to nid "LPX64
+                       ", nal %d\n", data->ioc_inlbuf1, data->ioc_nid,
+                       data->ioc_nal);
+
+                err = class_add_uuid(data->ioc_inlbuf1, data->ioc_nid,
+                                     data->ioc_nal);
+                GOTO(out, err);
+        }
+        case OBD_IOC_DEL_UUID: {
+                CDEBUG(D_IOCTL, "removing mappings for uuid %s\n",
+                       data->ioc_inlbuf1 == NULL ? "<all uuids>" :
+                       data->ioc_inlbuf1);
 
-                err = obd_iocontrol(cmd, &conn, len, data, NULL);
+                err = class_del_uuid(data->ioc_inlbuf1);
+                GOTO(out, err);
+        }
+        default: { 
+                // obd_data2conn(&conn, data);
+                struct obd_class_user_conn *oconn = list_entry(ocus->ocus_conns.next, struct obd_class_user_conn, ocuc_chain);
+                err = obd_iocontrol(cmd, &oconn->ocuc_conn, len, data, NULL);
                 if (err)
                         GOTO(out, err);
 
@@ -609,6 +648,7 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp,
                         err = -EFAULT;
                 GOTO(out, err);
         }
+        }
 
  out:
         if (buf)
@@ -620,6 +660,15 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp,
 
 
 
+#define OBD_MINOR 241
+#ifdef __KERNEL__
+/* to control /dev/obd */
+static int obd_class_ioctl (struct inode * inode, struct file * filp,
+                     unsigned int cmd, unsigned long arg)
+{
+        return class_handle_ioctl(filp->private_data, cmd, arg);
+}
+
 /* declare character device */
 static struct file_operations obd_psdev_fops = {
         ioctl: obd_class_ioctl,      /* ioctl */
@@ -628,12 +677,14 @@ static struct file_operations obd_psdev_fops = {
 };
 
 /* modules setup */
-#define OBD_MINOR 241
 static struct miscdevice obd_psdev = {
         OBD_MINOR,
         "obd_psdev",
         &obd_psdev_fops
 };
+#else
+void *obd_psdev = NULL;
+#endif
 
 void (*class_signal_connection_failure)(struct ptlrpc_connection *);
 
@@ -731,10 +782,19 @@ EXPORT_SYMBOL(class_conn2ldlmimp);
 EXPORT_SYMBOL(class_disconnect);
 EXPORT_SYMBOL(class_disconnect_all);
 EXPORT_SYMBOL(class_uuid_unparse);
+EXPORT_SYMBOL(lustre_uuid_to_peer);
 
 EXPORT_SYMBOL(class_signal_connection_failure);
 
+EXPORT_SYMBOL(class_handle_hash);
+EXPORT_SYMBOL(class_handle_unhash);
+EXPORT_SYMBOL(class_handle2object);
+
+#ifdef __KERNEL__
 static int __init init_obdclass(void)
+#else
+int init_obdclass(void)
+#endif
 {
         struct obd_device *obd;
         int err;
@@ -743,6 +803,9 @@ static int __init init_obdclass(void)
         printk(KERN_INFO "OBD class driver Build Version: " BUILD_VERSION
                       ", info@clusterfs.com\n");
 
+        class_init_uuidlist();
+        class_handle_init();
+
         sema_init(&obd_conf_sem, 1);
         INIT_LIST_HEAD(&obd_types);
 
@@ -759,7 +822,9 @@ static int __init init_obdclass(void)
         if (err)
                 return err;
 
+#ifdef __KERNEL__
         obd_sysctl_init();
+#endif
 
 #ifdef LPROCFS
         proc_lustre_root = proc_mkdir("lustre", proc_root_fs);
@@ -771,7 +836,11 @@ static int __init init_obdclass(void)
         return 0;
 }
 
+#ifdef __KERNEL__
 static void __exit cleanup_obdclass(void)
+#else
+static void cleanup_obdclass(void)
+#endif
 {
         int i;
         ENTRY;
@@ -787,13 +856,17 @@ static void __exit cleanup_obdclass(void)
         }
 
         obd_cleanup_caches();
+#ifdef __KERNEL__
         obd_sysctl_clean();
-
+#endif
         if (proc_lustre_root) {
                 lprocfs_remove(proc_lustre_root);
                 proc_lustre_root = NULL;
         }
 
+        class_handle_cleanup();
+        class_exit_uuidlist();
+
         CERROR("obd mem max: %d leaked: %d\n", obd_memmax,
                atomic_read(&obd_memory));
         EXIT;
@@ -801,17 +874,23 @@ static void __exit cleanup_obdclass(void)
 
 /* Check that we're building against the appropriate version of the Lustre
  * kernel patch */
+#ifdef __KERNEL__
 #include <linux/lustre_version.h>
-#define LUSTRE_SOURCE_VERSION 10
+#define LUSTRE_SOURCE_VERSION 13
 #if (LUSTRE_KERNEL_VERSION < LUSTRE_SOURCE_VERSION)
 # error Cannot continue: Your Lustre kernel patch is older than the sources
-#elif (LUSTRE_KERNEL_VERSION > 11)
+#elif (LUSTRE_KERNEL_VERSION > LUSTRE_SOURCE_VERSION)
 # error Cannot continue: Your Lustre sources are older than the kernel patch
 #endif
+#else
+#warning "Lib Lustre - no versioning information"
+#endif
 
+#ifdef __KERNEL__
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
 MODULE_DESCRIPTION("Lustre Class Driver Build Version: " BUILD_VERSION);
 MODULE_LICENSE("GPL");
 
 module_init(init_obdclass);
 module_exit(cleanup_obdclass);
+#endif
index 8b28706..6118084 100644 (file)
 #define DEBUG_SUBSYSTEM D_OTHER
 
 #define EXPORT_SYMTAB
+#ifndef __KERNEL__
+#include <liblustre.h>
+#endif
+
 #include <linux/obd_ost.h>
+#include <linux/obd_support.h>
 #include <linux/lustre_debug.h>
 #include <linux/lustre_net.h>
 
index 5c52b43..72f2830 100644 (file)
@@ -23,7 +23,7 @@
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#error "FIXME: this needs to be updated to match fsfilt_extN.c"
+//#error "FIXME: this needs to be updated to match fsfilt_extN.c"
 
 #define DEBUG_SUBSYSTEM S_FILTER
 
 #include <linux/init.h>
 #include <linux/ext3_fs.h>
 #include <linux/ext3_jbd.h>
-#include <linux/ext3_xattr.h>
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+# include <linux/ext3_xattr.h>
+#else
+# include <asm/statfs.h>
+#endif
 #include <linux/kp30.h>
 #include <linux/lustre_fsfilt.h>
 #include <linux/obd.h>
 #include <linux/module.h>
 
 static kmem_cache_t *fcb_cache;
-static int fcb_cache_count;
+static atomic_t fcb_cache_count = ATOMIC_INIT(0);
 
 struct fsfilt_cb_data {
         struct journal_callback cb_jcb; /* data private to jbd */
@@ -206,7 +211,7 @@ static void fsfilt_ext3_cb_func(struct journal_callback *jcb, int error)
         fcb->cb_func(fcb->cb_obd, fcb->cb_last_rcvd, error);
 
         kmem_cache_free(fcb_cache, fcb);
-        --fcb_cache_count;
+        atomic_dec(&fcb_cache_count);
 }
 
 static int fsfilt_ext3_set_last_rcvd(struct obd_device *obd, __u64 last_rcvd,
@@ -219,7 +224,7 @@ static int fsfilt_ext3_set_last_rcvd(struct obd_device *obd, __u64 last_rcvd,
         if (!fcb)
                 RETURN(-ENOMEM);
 
-        ++fcb_cache_count;
+        atomic_inc(&fcb_cache_count);
         fcb->cb_func = cb_func;
         fcb->cb_obd = obd;
         fcb->cb_last_rcvd = last_rcvd;
@@ -304,7 +309,7 @@ static int __init fsfilt_ext3_init(void)
                 GOTO(out, rc = -ENOMEM);
         }
 
-        rc = fsfilt_register_ops(&fsfilt_ext3_fs_ops);
+        rc = fsfilt_register_ops(&fsfilt_ext3_ops);
 
         if (rc)
                 kmem_cache_destroy(fcb_cache);
@@ -316,12 +321,12 @@ static void __exit fsfilt_ext3_exit(void)
 {
         int rc;
 
-        fsfilt_unregister_ops(&fsfilt_ext3_fs_ops);
+        fsfilt_unregister_ops(&fsfilt_ext3_ops);
         rc = kmem_cache_destroy(fcb_cache);
 
-        if (rc || fcb_cache_count) {
+        if (rc || atomic_read(&fcb_cache_count)) {
                 CERROR("can't free fsfilt callback cache: count %d, rc = %d\n",
-                       fcb_cache_count, rc);
+                       atomic_read(&fcb_cache_count), rc);
         }
 
         //rc = ext3_xattr_unregister();
index 0984c66..d029785 100644 (file)
@@ -40,7 +40,7 @@
 #include <linux/module.h>
 
 static kmem_cache_t *fcb_cache;
-static int fcb_cache_count;
+static atomic_t fcb_cache_count = ATOMIC_INIT(0);
 
 struct fsfilt_cb_data {
         struct journal_callback cb_jcb; /* data private to jbd */
@@ -418,7 +418,7 @@ static void fsfilt_extN_cb_func(struct journal_callback *jcb, int error)
         fcb->cb_func(fcb->cb_obd, fcb->cb_last_rcvd, error);
 
         kmem_cache_free(fcb_cache, fcb);
-        --fcb_cache_count;
+        atomic_dec(&fcb_cache_count);
 }
 
 static int fsfilt_extN_set_last_rcvd(struct obd_device *obd, __u64 last_rcvd,
@@ -430,7 +430,7 @@ static int fsfilt_extN_set_last_rcvd(struct obd_device *obd, __u64 last_rcvd,
         if (!fcb)
                 RETURN(-ENOMEM);
 
-        ++fcb_cache_count;
+        atomic_inc(&fcb_cache_count);
         fcb->cb_func = cb_func;
         fcb->cb_obd = obd;
         fcb->cb_last_rcvd = last_rcvd;
@@ -478,6 +478,14 @@ static int fsfilt_extN_sync(struct super_block *sb)
         return extN_force_commit(sb);
 }
 
+extern int extN_prep_san_write(struct inode *inode, long *blocks,
+                              int nblocks, loff_t newsize);
+static int fsfilt_extN_prep_san_write(struct inode *inode, long *blocks,
+                                      int nblocks, loff_t newsize)
+{
+        return extN_prep_san_write(inode, blocks, nblocks, newsize);
+}
+
 static struct fsfilt_operations fsfilt_extN_ops = {
         fs_type:                "extN",
         fs_owner:               THIS_MODULE,
@@ -492,6 +500,7 @@ static struct fsfilt_operations fsfilt_extN_ops = {
         fs_set_last_rcvd:       fsfilt_extN_set_last_rcvd,
         fs_statfs:              fsfilt_extN_statfs,
         fs_sync:                fsfilt_extN_sync,
+        fs_prep_san_write:      fsfilt_extN_prep_san_write,
 };
 
 static int __init fsfilt_extN_init(void)
@@ -522,9 +531,9 @@ static void __exit fsfilt_extN_exit(void)
         fsfilt_unregister_ops(&fsfilt_extN_ops);
         rc = kmem_cache_destroy(fcb_cache);
 
-        if (rc || fcb_cache_count) {
+        if (rc || atomic_read(&fcb_cache_count)) {
                 CERROR("can't free fsfilt callback cache: count %d, rc = %d\n",
-                       fcb_cache_count, rc);
+                       atomic_read(&fcb_cache_count), rc);
         }
 
         //rc = extN_xattr_unregister();
index f8d4ac3..06302c5 100644 (file)
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+#include <linux/init.h>
+#include <asm/statfs.h>
+#endif
 #include <linux/kp30.h>
 #include <linux/lustre_fsfilt.h>
 #include <linux/obd.h>
index e5be2bc..6fcf504 100644 (file)
  */
 
 #define DEBUG_SUBSYSTEM S_CLASS
+#ifdef __KERNEL__
 #include <linux/kmod.h>   /* for request_module() */
 #include <linux/module.h>
 #include <linux/obd_class.h>
 #include <linux/random.h>
 #include <linux/slab.h>
+#else 
+#include <liblustre.h>
+#include <linux/obd_class.h>
+#include <linux/obd.h>
+#endif
 #include <linux/lprocfs_status.h>
 
 extern struct list_head obd_types;
@@ -115,7 +121,7 @@ int class_register_type(struct obd_ops *ops, struct lprocfs_vars *vars,
 
         type->typ_procroot = lprocfs_register(type->typ_name, proc_lustre_root,
                                               vars, type);
-        if (IS_ERR(type->typ_procroot)) {
+        if (type->typ_procroot && IS_ERR(type->typ_procroot)) {
                 rc = PTR_ERR(type->typ_procroot);
                 type->typ_procroot = NULL;
                 list_del(&type->typ_chain);
@@ -344,10 +350,11 @@ struct obd_export *class_new_export(struct obd_device *obddev)
 
 void class_destroy_export(struct obd_export *exp)
 {
-        ENTRY;
-
         LASSERT(exp->exp_cookie != DEAD_HANDLE_MAGIC);
 
+        CDEBUG(D_IOCTL, "destroying export %p/%s\n", exp,
+               exp->exp_client_uuid.uuid);
+
         spin_lock(&exp->exp_obd->obd_dev_lock);
         list_del(&exp->exp_obd_chain);
         spin_unlock(&exp->exp_obd->obd_dev_lock);
@@ -369,17 +376,15 @@ void class_destroy_export(struct obd_export *exp)
 
         exp->exp_cookie = DEAD_HANDLE_MAGIC;
         kmem_cache_free(export_cachep, exp);
-
-        EXIT;
 }
 
 /* a connection defines an export context in which preallocation can
    be managed. */
-int class_connect(struct lustre_handle *conn, struct obd_device *obd,
+int class_connect(struct lustre_handle *exporth, struct obd_device *obd,
                   struct obd_uuid *cluuid)
 {
         struct obd_export * export;
-        if (conn == NULL) {
+        if (exporth == NULL) {
                 LBUG();
                 return -EINVAL;
         }
@@ -398,12 +403,12 @@ int class_connect(struct lustre_handle *conn, struct obd_device *obd,
         if (!export)
                 return -ENOMEM;
 
-        conn->addr = (__u64) (unsigned long)export;
-        conn->cookie = export->exp_cookie;
+        exporth->addr = (__u64) (unsigned long)export;
+        exporth->cookie = export->exp_cookie;
         memcpy(&export->exp_client_uuid, cluuid, sizeof(export->exp_client_uuid));
 
         CDEBUG(D_IOCTL, "connect: addr %Lx cookie %Lx\n",
-               (long long)conn->addr, (long long)conn->cookie);
+               (long long)exporth->addr, (long long)exporth->cookie);
         return 0;
 }
 
index d4be2d6..26bbdf7 100644 (file)
  */
 
 #define EXPORT_SYMTAB
+#define DEBUG_SUBSYSTEM S_CLASS
+#ifdef __KERNEL__
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/version.h>
 #include <linux/slab.h>
 #include <linux/types.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <asm/statfs.h>
+#endif
+
+#else
+#include <liblustre.h>
+#endif
 
-#define DEBUG_SUBSYSTEM S_CLASS
 #include <linux/obd_class.h>
 #include <linux/lprocfs_status.h>
 
@@ -110,7 +118,11 @@ void lprocfs_remove(struct proc_dir_entry* root)
 {
         struct proc_dir_entry *temp = root;
         struct proc_dir_entry *rm_entry;
-        struct proc_dir_entry *parent = root->parent;
+        struct proc_dir_entry *parent;
+
+        LASSERT(root != NULL);
+        parent = root->parent;
+        LASSERT(parent != NULL);
 
         while (1) {
                 while (temp->subdir)
@@ -133,8 +145,8 @@ struct proc_dir_entry *lprocfs_register(const char *name,
         newchild = lprocfs_srch(parent, name);
         if (newchild) {
                 CERROR(" Lproc: Attempting to register %s more than once \n",
-                                name);
-                return NULL;
+                       name);
+                return ERR_PTR(-EALREADY);
         }
 
         newchild = proc_mkdir(name, parent);
@@ -153,6 +165,7 @@ struct proc_dir_entry *lprocfs_register(const char *name,
 int lprocfs_rd_u64(char *page, char **start, off_t off,
                    int count, int *eof, void *data)
 {
+        LASSERT(data != NULL);
         *eof = 1;
         return snprintf(page, count, LPU64"\n", *(__u64 *)data);
 }
@@ -162,6 +175,7 @@ int lprocfs_rd_uuid(char* page, char **start, off_t off, int count,
 {
         struct obd_device* dev = (struct obd_device*)data;
 
+        LASSERT(dev != NULL);
         *eof = 1;
         return snprintf(page, count, "%s\n", dev->obd_uuid.uuid);
 }
@@ -171,6 +185,8 @@ int lprocfs_rd_name(char *page, char **start, off_t off, int count,
 {
         struct obd_device* dev = (struct obd_device *)data;
 
+        LASSERT(dev != NULL);
+        LASSERT(dev->obd_name != NULL);
         *eof = 1;
         return snprintf(page, count, "%s\n", dev->obd_name);
 }
@@ -178,16 +194,20 @@ int lprocfs_rd_name(char *page, char **start, off_t off, int count,
 int lprocfs_rd_blksize(char* page, char **start, off_t off, int count,
                        int *eof, struct statfs *sfs)
 {
+        LASSERT(sfs != NULL);
         *eof = 1;
-
         return snprintf(page, count, "%lu\n", sfs->f_bsize);
 }
 
 int lprocfs_rd_kbytestotal(char* page, char **start, off_t off, int count,
                            int *eof, struct statfs *sfs)
 {
-        __u32 blk_size = sfs->f_bsize >> 10;
-        __u64 result = sfs->f_blocks;
+        __u32 blk_size;
+        __u64 result;
+
+        LASSERT(sfs != NULL);
+        blk_size = sfs->f_bsize >> 10;
+        result = sfs->f_blocks;
 
         while (blk_size >>= 1)
                 result <<= 1;
@@ -199,8 +219,12 @@ int lprocfs_rd_kbytestotal(char* page, char **start, off_t off, int count,
 int lprocfs_rd_kbytesfree(char* page, char **start, off_t off, int count,
                           int *eof, struct statfs *sfs)
 {
-        __u32 blk_size = sfs->f_bsize >> 10;
-        __u64 result = sfs->f_bfree;
+        __u32 blk_size;
+        __u64 result;
+
+        LASSERT(sfs != NULL);
+        blk_size = sfs->f_bsize >> 10;
+        result = sfs->f_bfree;
 
         while (blk_size >>= 1)
                 result <<= 1;
@@ -212,6 +236,7 @@ int lprocfs_rd_kbytesfree(char* page, char **start, off_t off, int count,
 int lprocfs_rd_filestotal(char* page, char **start, off_t off, int count,
                           int *eof, struct statfs *sfs)
 {
+        LASSERT(sfs != NULL);
         *eof = 1;
         return snprintf(page, count, "%ld\n", sfs->f_files);
 }
@@ -219,6 +244,7 @@ int lprocfs_rd_filestotal(char* page, char **start, off_t off, int count,
 int lprocfs_rd_filesfree(char* page, char **start, off_t off, int count,
                          int *eof, struct statfs *sfs)
 {
+        LASSERT(sfs != NULL);
         *eof = 1;
         return snprintf(page, count, "%ld\n", sfs->f_ffree);
 }
@@ -234,7 +260,11 @@ int lprocfs_rd_server_uuid(char* page, char **start, off_t off, int count,
                            int *eof, void *data)
 {
         struct obd_device* obd = (struct obd_device*)data;
-        struct client_obd* cli = &obd->u.cli;
+        struct client_obd* cli;
+
+        LASSERT(obd != NULL);
+        cli = &obd->u.cli;
+        *eof = 1;
         return snprintf(page, count, "%s\n", cli->cl_target_uuid.uuid);
 }
 
@@ -242,8 +272,11 @@ int lprocfs_rd_conn_uuid(char *page, char **start, off_t off, int count,
                          int *eof,  void *data)
 {
         struct obd_device *obd = (struct obd_device*)data;
-        struct ptlrpc_connection *conn = obd->u.cli.cl_import.imp_connection;
+        struct ptlrpc_connection *conn;
 
+        LASSERT(obd != NULL);
+        conn = obd->u.cli.cl_import.imp_connection;
+        LASSERT(conn != NULL);
         *eof = 1;
         return snprintf(page, count, "%s\n", conn->c_remote_uuid.uuid);
 }
@@ -253,6 +286,7 @@ int lprocfs_rd_numrefs(char *page, char **start, off_t off, int count,
 {
         struct obd_type* class = (struct obd_type*) data;
 
+        LASSERT(class != NULL);
         *eof = 1;
         return snprintf(page, count, "%d\n", class->typ_refcnt);
 }
@@ -260,12 +294,17 @@ int lprocfs_rd_numrefs(char *page, char **start, off_t off, int count,
 int lprocfs_obd_attach(struct obd_device *dev, struct lprocfs_vars *list)
 {
         int rc = 0;
+
+        LASSERT(dev != NULL);
+        LASSERT(dev->obd_type != NULL);
+        LASSERT(dev->obd_type->typ_procroot != NULL);
+
         dev->obd_proc_entry = lprocfs_register(dev->obd_name,
                                                dev->obd_type->typ_procroot,
                                                list, dev);
         if (IS_ERR(dev->obd_proc_entry)) {
                 rc = PTR_ERR(dev->obd_proc_entry);
-               dev->obd_proc_entry = NULL;
+                dev->obd_proc_entry = NULL;
         }
         return rc;
 }
diff --git a/lustre/obdclass/lustre_handles.c b/lustre/obdclass/lustre_handles.c
new file mode 100644 (file)
index 0000000..01dd75b
--- /dev/null
@@ -0,0 +1,166 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_PORTALS
+#ifdef __KERNEL__
+#include <linux/types.h>
+#include <linux/random.h>
+#else 
+#include <liblustre.h>
+#endif 
+
+
+#include <linux/kp30.h>
+#include <linux/lustre_handles.h>
+
+static spinlock_t handle_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t random_lock = SPIN_LOCK_UNLOCKED;
+static struct list_head *handle_hash = NULL;
+static int handle_count = 0;
+
+#define HANDLE_HASH_SIZE (1 << 14)
+#define HANDLE_HASH_MASK (HANDLE_HASH_SIZE - 1)
+
+/* Give @h a fresh random cookie, record @cb as its addref callback and link
+ * it into the global handle hash.
+ *
+ * @h->h_link must be an initialised, empty list head, and the hash table must
+ * already have been allocated by class_handle_init(). */
+void class_handle_hash(struct portals_handle *h, portals_handle_addref_cb cb)
+{
+        struct list_head *bucket;
+        ENTRY;
+
+        LASSERT(h != NULL);
+        LASSERT(list_empty(&h->h_link));
+        /* Same precondition class_handle2object() asserts: without it a
+         * missing table shows up as a wild pointer inside list_add(). */
+        LASSERT(handle_hash != NULL);
+
+        /* My hypothesis is that get_random_bytes, if called from two threads at
+         * the same time, will return the same bytes. -phil */
+        spin_lock(&random_lock);
+        get_random_bytes(&h->h_cookie, sizeof(h->h_cookie));
+        spin_unlock(&random_lock);
+
+        h->h_addref = cb;
+
+        /* The low bits of the cookie select the bucket. */
+        bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK);
+
+        CDEBUG(D_INFO, "adding object %p with handle "LPX64" to hash\n",
+               h, h->h_cookie);
+
+        spin_lock(&handle_lock);
+        list_add(&h->h_link, bucket);
+        handle_count++;
+        spin_unlock(&handle_lock);
+        EXIT;
+}
+
+/* Unlink @h from the handle hash and drop the global handle count.  The
+ * callers in this file hold handle_lock around this call; @h must currently
+ * be hashed. */
+static void class_handle_unhash_nolock(struct portals_handle *h)
+{
+        LASSERT(!list_empty(&h->h_link));
+
+        CDEBUG(D_INFO, "removing object %p with handle "LPX64" from hash\n",
+               h, h->h_cookie);
+
+        /* list_del_init() leaves h_link empty, so @h can be hashed again. */
+        list_del_init(&h->h_link);
+        handle_count--;
+}
+
+/* Locked wrapper: remove @h from the handle hash while holding handle_lock. */
+void class_handle_unhash(struct portals_handle *h)
+{
+        spin_lock(&handle_lock);
+        class_handle_unhash_nolock(h);
+        spin_unlock(&handle_lock);
+}
+
+/* Look up the handle whose cookie equals @cookie.  On a match the handle's
+ * h_addref callback is invoked on it (under handle_lock) and the handle is
+ * returned; NULL is returned when no handle carries that cookie. */
+void *class_handle2object(__u64 cookie)
+{
+        struct list_head *head, *pos;
+        void *found = NULL;
+        ENTRY;
+
+        LASSERT(handle_hash != NULL);
+
+        spin_lock(&handle_lock);
+        head = handle_hash + (cookie & HANDLE_HASH_MASK);
+
+        list_for_each(pos, head) {
+                struct portals_handle *cur;
+                cur = list_entry(pos, struct portals_handle, h_link);
+
+                if (cur->h_cookie != cookie)
+                        continue;
+
+                cur->h_addref(cur);
+                found = cur;
+                break;
+        }
+        spin_unlock(&handle_lock);
+
+        RETURN(found);
+}
+
+/* Allocate the handle hash table and initialise every bucket as an empty
+ * list.  Must not be called while a table already exists.
+ *
+ * Returns 0 on success or -ENOMEM if the table cannot be allocated. */
+int class_handle_init(void)
+{
+        int i;
+
+        LASSERT(handle_hash == NULL);
+
+        PORTAL_ALLOC(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE);
+        if (handle_hash == NULL)
+                return -ENOMEM;
+
+        /* Walk forwards by index: a descending pointer loop has to step to
+         * one element before the array to terminate, which is undefined. */
+        for (i = 0; i < HANDLE_HASH_SIZE; i++)
+                INIT_LIST_HEAD(&handle_hash[i]);
+
+        return 0;
+}
+
+/* Forcibly unhash every handle still present in the table, logging each one.
+ * Called from class_handle_cleanup() when handles remain at shutdown. */
+static void cleanup_all_handles(void)
+{
+        int i;
+
+        spin_lock(&handle_lock);
+        for (i = 0; i < HANDLE_HASH_SIZE; i++) {
+                struct list_head *tmp, *pos;
+                /* _safe variant: the current entry is unlinked in the body. */
+                list_for_each_safe(tmp, pos, &(handle_hash[i])) {
+                        struct portals_handle *h;
+                        h = list_entry(tmp, struct portals_handle, h_link);
+
+                        CERROR("forcing cleanup for handle "LPX64"\n",
+                               h->h_cookie);
+
+                        class_handle_unhash_nolock(h);
+                }
+        }
+        /* Release the lock taken above; locking it a second time here would
+         * deadlock on ourselves and never let go of handle_lock. */
+        spin_unlock(&handle_lock);
+}
+
+/* Tear down the handle hash: complain about and force out any handles that
+ * are still hashed, free the table, and report whatever count is left. */
+void class_handle_cleanup(void)
+{
+        LASSERT(handle_hash != NULL);
+
+        /* Anything still hashed at this point was never unhashed by its
+         * owner; log it and unlink it before the buckets go away. */
+        if (handle_count) {
+                CERROR("handle_count at cleanup: %d\n", handle_count);
+                cleanup_all_handles();
+        }
+
+        PORTAL_FREE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE);
+        handle_hash = NULL;
+
+        if (handle_count != 0)
+                CERROR("leaked %d handles\n", handle_count);
+}
diff --git a/lustre/obdclass/lustre_peer.c b/lustre/obdclass/lustre_peer.c
new file mode 100644 (file)
index 0000000..016354c
--- /dev/null
@@ -0,0 +1,179 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#ifdef __KERNEL__
+# include <linux/module.h>
+# include <linux/init.h>
+# include <linux/list.h>
+#else
+# include <liblustre.h>
+#endif
+#include <linux/obd.h>
+#include <linux/obd_support.h>
+#include <linux/obd_class.h>
+#include <linux/lustre_lib.h>
+#include <linux/lustre_ha.h>
+#include <linux/lustre_net.h>
+#include <linux/lprocfs_status.h>
+
+/* One uuid -> peer mapping, kept on the global g_uuid_list. */
+struct uuid_nid_data {
+        struct list_head head;  /* linkage on g_uuid_list */
+        ptl_nid_t nid;          /* nid given to class_add_uuid() */
+        char *uuid;             /* allocated, NUL-terminated copy of the uuid */
+        __u32 nal;              /* NAL number used for kportal_get_ni/put_ni */
+        ptl_handle_ni_t ni;     /* copy of the handle kportal_get_ni() returned */
+};
+
+/* FIXME: This should probably become more elegant than a global linked list */
+static struct list_head g_uuid_list;
+static spinlock_t       g_uuid_lock;
+
+/* Set up the global uuid list and the spinlock that guards it. */
+void class_init_uuidlist(void)
+{
+        spin_lock_init(&g_uuid_lock);
+        INIT_LIST_HEAD(&g_uuid_list);
+}
+
+/* Release every remaining uuid mapping at module exit.  Each entry is torn
+ * down the same way class_del_uuid() does it: unlinked, its NI reference
+ * (taken in class_add_uuid()) dropped, then its memory freed. */
+void class_exit_uuidlist(void)
+{
+        struct list_head *tmp, *n;
+
+        /* Module going => sole user => don't need to lock g_uuid_list */
+        list_for_each_safe(tmp, n, &g_uuid_list) {
+                struct uuid_nid_data *data =
+                        list_entry(tmp, struct uuid_nid_data, head);
+
+                /* Unlink first so g_uuid_list never points at freed memory. */
+                list_del(&data->head);
+
+                kportal_put_ni(data->nal);
+                PORTAL_FREE(data->uuid, strlen(data->uuid) + 1);
+                PORTAL_FREE(data, sizeof(*data));
+        }
+}
+
+/* Find the mapping registered for @uuid and copy its nid and NI handle into
+ * @peer.  Returns 0 when a mapping was found, -1 otherwise (in which case
+ * @peer is left untouched). */
+int lustre_uuid_to_peer(char *uuid, struct lustre_peer *peer)
+{
+        struct list_head *pos;
+        int rc = -1;
+
+        spin_lock (&g_uuid_lock);
+
+        list_for_each(pos, &g_uuid_list) {
+                struct uuid_nid_data *entry =
+                        list_entry(pos, struct uuid_nid_data, head);
+
+                if (strcmp(entry->uuid, uuid) != 0)
+                        continue;
+
+                peer->peer_nid = entry->nid;
+                peer->peer_ni = entry->ni;
+                rc = 0;
+                break;
+        }
+
+        spin_unlock (&g_uuid_lock);
+        return rc;
+}
+
+/* Register a uuid -> (nid, nal) mapping on the global uuid list.
+ *
+ * @uuid is copied, so the caller keeps ownership of its buffer.  A reference
+ * on the NAL's network interface is taken with kportal_get_ni() and kept for
+ * the lifetime of the entry; it is dropped again on any failure here.
+ *
+ * Returns 0 on success, -EINVAL if @uuid (with its terminator) does not fit
+ * in PAGE_SIZE bytes, -EIO if the NAL has no interface, or -ENOMEM. */
+int class_add_uuid(char *uuid, __u64 nid, __u32 nal)
+{
+        const ptl_handle_ni_t *nip;
+        struct uuid_nid_data *data;
+        int rc;
+        int nob = strnlen (uuid, PAGE_SIZE) + 1;
+
+        if (nob > PAGE_SIZE)
+                return -EINVAL;
+
+        nip = kportal_get_ni (nal);
+        if (nip == NULL) {
+                CERROR("get_ni failed: is the NAL module loaded?\n");
+                return -EIO;
+        }
+
+        rc = -ENOMEM;
+        PORTAL_ALLOC(data, sizeof(*data));
+        if (data == NULL)
+                goto fail_0;
+
+        PORTAL_ALLOC(data->uuid, nob);
+        /* Check the string buffer itself; testing 'data' again would let a
+         * failed allocation fall through to the memcpy() below. */
+        if (data->uuid == NULL)
+                goto fail_1;
+
+        memcpy(data->uuid, uuid, nob);
+        data->nid = nid;
+        data->nal = nal;
+        data->ni  = *nip;
+
+        spin_lock (&g_uuid_lock);
+
+        list_add(&data->head, &g_uuid_list);
+
+        spin_unlock (&g_uuid_lock);
+
+        return 0;
+
+ fail_1:
+        PORTAL_FREE (data, sizeof (*data));
+ fail_0:
+        kportal_put_ni (nal);
+        return (rc);
+}
+
+/* Remove uuid mappings: only the first entry matching @uuid when one is
+ * given, or every entry when @uuid is NULL.
+ *
+ * Victims are moved to a private list while g_uuid_lock is held; their NI
+ * references and memory are released after the lock has been dropped.
+ * Returns 0 if at least one entry was removed, -EINVAL otherwise. */
+int class_del_uuid (char *uuid)
+{
+        struct list_head  deathrow;
+        struct list_head *pos;
+        struct list_head *next;
+        struct uuid_nid_data *entry;
+
+        INIT_LIST_HEAD (&deathrow);
+
+        spin_lock (&g_uuid_lock);
+
+        list_for_each_safe(pos, next, &g_uuid_list) {
+                entry = list_entry(pos, struct uuid_nid_data, head);
+
+                if (uuid != NULL && strcmp(entry->uuid, uuid) != 0)
+                        continue;
+
+                list_del (&entry->head);
+                list_add (&entry->head, &deathrow);
+
+                /* a specific uuid removes a single entry only */
+                if (uuid != NULL)
+                        break;
+        }
+
+        spin_unlock (&g_uuid_lock);
+
+        if (list_empty (&deathrow))
+                return -EINVAL;
+
+        while (!list_empty (&deathrow)) {
+                entry = list_entry(deathrow.next, struct uuid_nid_data, head);
+
+                list_del (&entry->head);
+
+                kportal_put_ni (entry->nal);
+                PORTAL_FREE(entry->uuid, strlen(entry->uuid) + 1);
+                PORTAL_FREE(entry, sizeof(*entry));
+        }
+
+        return 0;
+}
index 4efffa5..1998ba3 100644 (file)
 #define DEBUG_SUBSYSTEM S_CLASS
 
 #define EXPORT_SYMTAB
+#ifndef __KERNEL__
+#include <liblustre.h>
+#endif
+
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <asm/statfs.h>
+#endif
+
+#include <linux/lustre_export.h>
 #include <linux/lustre_net.h>
 #include <linux/obd_support.h>
 #include <linux/obd_class.h>
index d1388d6..125f392 100644 (file)
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/sysctl.h>
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 #include <linux/swapctl.h>
+#endif
 #include <linux/proc_fs.h>
 #include <linux/slab.h>
 #include <linux/stat.h>
index 0e279fb..fed9a8f 100644 (file)
  * Library General Public License.
  * %End-Header%
  */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#ifdef __KERNEL__
 #include <linux/ctype.h>
 #include <linux/kernel.h>
-
-#define DEBUG_SUBSYSTEM S_CLASS
+#else 
+#include <liblustre.h>
+#endif
 
 #include <linux/obd_support.h>
 #include <linux/obd_class.h>
index ad0e0ff..f8ed503 100644 (file)
@@ -4,12 +4,17 @@
 # See the file COPYING in this distribution
 
 DEFS= 
+
+if LIBLUSTRE
+lib_LIBRARIES = libobdecho.a
+libobdecho_a_SOURCES = echo_client.c 
+else
 MODULE = obdecho
 modulefs_DATA = obdecho.o
 EXTRA_PROGRAMS = obdecho
-
 LINX=
 obdecho_SOURCES = echo.c echo_client.c lproc_echo.c $(LINX)
+endif
 
 include $(top_srcdir)/Rules
 
index 281166e..1796957 100644 (file)
@@ -63,6 +63,7 @@ struct xprocfs_io_stat {
         __u64    st_create_reqs;
         __u64    st_destroy_reqs;
         __u64    st_statfs_reqs;
+        __u64    st_sync_reqs;
         __u64    st_open_reqs;
         __u64    st_close_reqs;
         __u64    st_punch_reqs;
@@ -87,7 +88,7 @@ xprocfs_sum_##field (void)                              \
                 stat += xprocfs_iostats[i].field;       \
         return (stat);                                  \
 }
-
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 DECLARE_XPROCFS_SUM_STAT (st_read_bytes)
 DECLARE_XPROCFS_SUM_STAT (st_read_reqs)
 DECLARE_XPROCFS_SUM_STAT (st_write_bytes)
@@ -97,9 +98,11 @@ DECLARE_XPROCFS_SUM_STAT (st_setattr_reqs)
 DECLARE_XPROCFS_SUM_STAT (st_create_reqs)
 DECLARE_XPROCFS_SUM_STAT (st_destroy_reqs)
 DECLARE_XPROCFS_SUM_STAT (st_statfs_reqs)
+DECLARE_XPROCFS_SUM_STAT (st_sync_reqs)
 DECLARE_XPROCFS_SUM_STAT (st_open_reqs)
 DECLARE_XPROCFS_SUM_STAT (st_close_reqs)
 DECLARE_XPROCFS_SUM_STAT (st_punch_reqs)
+#endif
 
 static int
 xprocfs_rd_stat (char *page, char **start, off_t off, int count,
@@ -107,7 +110,7 @@ xprocfs_rd_stat (char *page, char **start, off_t off, int count,
 {
         long long (*fn)(void) = (long long(*)(void))data;
         int         len;
-        
+
         *eof = 1;
         if (off != 0)
                 return (0);
@@ -116,7 +119,7 @@ xprocfs_rd_stat (char *page, char **start, off_t off, int count,
         *start = page;
         return (len);
 }
-        
+
 
 static void
 xprocfs_add_stat(char *name, long long (*fn)(void))
@@ -138,7 +141,7 @@ static void
 xprocfs_init (char *name)
 {
         char  dirname[64];
-        
+
         snprintf (dirname, sizeof (dirname), "sys/%s", name);
 
         xprocfs_dir = proc_mkdir (dirname, NULL);
@@ -147,6 +150,7 @@ xprocfs_init (char *name)
                 return;
         }
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         xprocfs_add_stat ("read_bytes",   xprocfs_sum_st_read_bytes);
         xprocfs_add_stat ("read_reqs",    xprocfs_sum_st_read_reqs);
         xprocfs_add_stat ("write_bytes",  xprocfs_sum_st_write_bytes);
@@ -156,9 +160,11 @@ xprocfs_init (char *name)
         xprocfs_add_stat ("create_reqs",  xprocfs_sum_st_create_reqs);
         xprocfs_add_stat ("destroy_reqs", xprocfs_sum_st_destroy_reqs);
         xprocfs_add_stat ("statfs_reqs",  xprocfs_sum_st_statfs_reqs);
+        xprocfs_add_stat ("sync_reqs",    xprocfs_sum_st_sync_reqs);
         xprocfs_add_stat ("open_reqs",    xprocfs_sum_st_open_reqs);
         xprocfs_add_stat ("close_reqs",   xprocfs_sum_st_close_reqs);
         xprocfs_add_stat ("punch_reqs",   xprocfs_sum_st_punch_reqs);
+#endif
 }
 
 void xprocfs_fini (void)
@@ -175,6 +181,7 @@ void xprocfs_fini (void)
         remove_proc_entry ("create_reqs",  xprocfs_dir);
         remove_proc_entry ("destroy_reqs", xprocfs_dir);
         remove_proc_entry ("statfs_reqs",  xprocfs_dir);
+        remove_proc_entry ("sync_reqs",    xprocfs_dir);
         remove_proc_entry ("open_reqs",    xprocfs_dir);
         remove_proc_entry ("close_reqs",   xprocfs_dir);
         remove_proc_entry ("punch_reqs",   xprocfs_dir);
@@ -193,9 +200,9 @@ static int echo_connect(struct lustre_handle *conn, struct obd_device *obd,
 static int echo_disconnect(struct lustre_handle *conn)
 {
         struct obd_export *exp = class_conn2export(conn);
-        
+
         LASSERT (exp != NULL);
-        
+
         ldlm_cancel_locks_for_export (exp);
         return (class_disconnect (conn));
 }
@@ -287,7 +294,7 @@ static int echo_open(struct lustre_handle *conn, struct obdo *oa,
 
         fh->addr = oa->o_id;
         fh->cookie = ECHO_HANDLE_MAGIC;
-        
+
         oa->o_valid |= OBD_MD_FLHANDLE;
         return 0;
 }
@@ -314,7 +321,7 @@ static int echo_close(struct lustre_handle *conn, struct obdo *oa,
                 CERROR ("invalid file handle on close: "LPX64"\n", fh->cookie);
                 return (-EINVAL);
         }
-        
+
         return 0;
 }
 
@@ -325,7 +332,7 @@ static int echo_getattr(struct lustre_handle *conn, struct obdo *oa,
         obd_id id = oa->o_id;
 
         XPROCFS_BUMP_MYCPU_IOSTAT (st_getattr_reqs, 1);
-        
+
         if (!obd) {
                 CERROR("invalid client "LPX64"\n", conn->addr);
                 RETURN(-EINVAL);
@@ -348,7 +355,7 @@ static int echo_setattr(struct lustre_handle *conn, struct obdo *oa,
         struct obd_device *obd = class_conn2obd(conn);
 
         XPROCFS_BUMP_MYCPU_IOSTAT (st_setattr_reqs, 1);
-        
+
         if (!obd) {
                 CERROR("invalid client "LPX64"\n", conn->addr);
                 RETURN(-EINVAL);
@@ -371,7 +378,8 @@ static int echo_setattr(struct lustre_handle *conn, struct obdo *oa,
 
 int echo_preprw(int cmd, struct lustre_handle *conn, int objcount,
                 struct obd_ioobj *obj, int niocount, struct niobuf_remote *nb,
-                struct niobuf_local *res, void **desc_private, struct obd_trans_info *oti)
+                struct niobuf_local *res, void **desc_private,
+                struct obd_trans_info *oti)
 {
         struct obd_device *obd;
         struct niobuf_local *r = res;
@@ -409,13 +417,15 @@ int echo_preprw(int cmd, struct lustre_handle *conn, int objcount,
 
                         if (isobj0 &&
                             (nb->offset >> PAGE_SHIFT) < ECHO_OBJECT0_NPAGES) {
-                                r->page = echo_object0_pages[nb->offset >> PAGE_SHIFT];
+                                r->page = echo_object0_pages[nb->offset >>
+                                                             PAGE_SHIFT];
                                 /* Take extra ref so __free_pages() can be called OK */
                                 get_page (r->page);
                         } else {
                                 r->page = alloc_pages(gfp_mask, 0);
                                 if (r->page == NULL) {
-                                        CERROR("can't get page %d/%d for id "LPU64"\n",
+                                        CERROR("can't get page %u/%u for id "
+                                               LPU64"\n",
                                                j, obj->ioo_bufcnt, obj->ioo_id);
                                         GOTO(preprw_cleanup, rc = -ENOMEM);
                                 }
@@ -431,12 +441,13 @@ int echo_preprw(int cmd, struct lustre_handle *conn, int objcount,
                                r->page, r->addr, r->offset);
 
                         if (cmd == OBD_BRW_READ) {
-                                XPROCFS_BUMP_MYCPU_IOSTAT (st_read_bytes, r->len);
+                                XPROCFS_BUMP_MYCPU_IOSTAT(st_read_bytes,r->len);
                                 if (verify)
-                                        page_debug_setup(r->addr, r->len, r->offset,
-                                                         obj->ioo_id);
+                                        page_debug_setup(r->addr, r->len,
+                                                         r->offset,obj->ioo_id);
                         } else {
-                                XPROCFS_BUMP_MYCPU_IOSTAT (st_write_bytes, r->len);
+                                XPROCFS_BUMP_MYCPU_IOSTAT(st_write_bytes,
+                                                          r->len);
                                 if (verify)
                                         page_debug_setup(r->addr, r->len,
                                                          0xecc0ecc0ecc0ecc0,
@@ -527,7 +538,7 @@ int echo_commitrw(int cmd, struct lustre_handle *conn, int objcount,
                                 if (vrc != 0 && rc == 0)
                                         rc = vrc;
                         }
-                        
+
                         kunmap(page);
                         /* NB see comment above regarding object0 pages */
                         obd_kmap_put(1);
@@ -619,11 +630,11 @@ extern int echo_client_init(void);
 extern void echo_client_cleanup(void);
 
 static void
-echo_object0_pages_fini (void) 
+echo_object0_pages_fini (void)
 {
         int     i;
-        
-        for (i = 0; i < ECHO_OBJECT0_NPAGES; i++) 
+
+        for (i = 0; i < ECHO_OBJECT0_NPAGES; i++)
                 if (echo_object0_pages[i] != NULL) {
                         __free_pages (echo_object0_pages[i], 0);
                         echo_object0_pages[i] = NULL;
@@ -635,22 +646,23 @@ echo_object0_pages_init (void)
 {
         struct page *pg;
         int          i;
-        
+
         for (i = 0; i < ECHO_OBJECT0_NPAGES; i++) {
-                int gfp_mask = (i < ECHO_OBJECT0_NPAGES/2) ? GFP_KERNEL : GFP_HIGHUSER;
-                
+                int gfp_mask = (i < ECHO_OBJECT0_NPAGES/2) ?
+                        GFP_KERNEL : GFP_HIGHUSER;
+
                 pg = alloc_pages (gfp_mask, 0);
                 if (pg == NULL) {
                         echo_object0_pages_fini ();
                         return (-ENOMEM);
                 }
-                
+
                 memset (kmap (pg), 0, PAGE_SIZE);
                 kunmap (pg);
 
                 echo_object0_pages[i] = pg;
         }
-        
+
         return (0);
 }
 
@@ -668,7 +680,7 @@ static int __init obdecho_init(void)
         rc = echo_object0_pages_init ();
         if (rc != 0)
                 goto failed_0;
-        
+
         rc = class_register_type(&echo_obd_ops, lvars.module_vars,
                                  OBD_ECHO_DEVICENAME);
         if (rc != 0)
@@ -683,7 +695,7 @@ static int __init obdecho_init(void)
         echo_object0_pages_fini ();
  failed_0:
         xprocfs_fini ();
-        
+
         RETURN(rc);
 }
 
index 6c4eb6d..9f7544b 100644 (file)
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#define DEBUG_SUBSYSTEM S_ECHO
+#ifdef __KERNEL__
 #include <linux/version.h>
 #include <linux/module.h>
 #include <linux/fs.h>
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 #include <linux/iobuf.h>
+#endif
 #include <asm/div64.h>
+#else
+#include <liblustre.h>
+#endif
 
-#define DEBUG_SUBSYSTEM S_ECHO
-
+#include <linux/obd.h>
 #include <linux/obd_support.h>
 #include <linux/obd_class.h>
 #include <linux/obd_echo.h>
@@ -120,11 +126,11 @@ echo_copyin_lsm (struct obd_device *obd, struct lov_stripe_md *lsm,
 }
 
 static struct ec_object *
-echo_allocate_object (struct obd_device *obd) 
+echo_allocate_object (struct obd_device *obd)
 {
         struct echo_client_obd *ec = &obd->u.echo_client;
         struct ec_object       *eco;
-        
+
         OBD_ALLOC (eco, sizeof (*eco));
         if (eco == NULL)
                 return (NULL);
@@ -134,7 +140,7 @@ echo_allocate_object (struct obd_device *obd)
                 OBD_FREE (eco, sizeof (*eco));
                 return (NULL);
         }
-        
+
         eco->eco_device = obd;
         eco->eco_deleted = 0;
         eco->eco_refcount = 0;
@@ -145,7 +151,7 @@ echo_allocate_object (struct obd_device *obd)
 }
 
 static void
-echo_free_object (struct ec_object *eco) 
+echo_free_object (struct ec_object *eco)
 {
         struct obd_device      *obd = eco->eco_device;
         struct echo_client_obd *ec = &obd->u.echo_client;
@@ -165,7 +171,7 @@ echo_create_object (struct obd_device *obd, int on_target, struct obdo *oa,
         struct lov_stripe_md   *lsm;
         int                     rc;
         int                     i;
-        
+
         if ((oa->o_valid & OBD_MD_FLID) == 0 && /* no obj id */
             (on_target ||                       /* set_stripe */
              ec->ec_nstripes != 0)) {           /* LOV */
@@ -176,7 +182,7 @@ echo_create_object (struct obd_device *obd, int on_target, struct obdo *oa,
         eco = echo_allocate_object (obd);
         if (eco == NULL)
                 return (-ENOMEM);
-        
+
         lsm = eco->eco_lsm;
 
         if (ulsm != NULL) {
@@ -184,11 +190,11 @@ echo_create_object (struct obd_device *obd, int on_target, struct obdo *oa,
                 if (rc != 0)
                         goto failed;
         }
-        
+
         /* setup object ID here for !on_target and LOV hint */
         if ((oa->o_valid & OBD_MD_FLID) != 0)
                 eco->eco_id = lsm->lsm_object_id = oa->o_id;
-        
+
         /* defaults -> actual values */
         if (lsm->lsm_stripe_offset == 0xffffffff)
                 lsm->lsm_stripe_offset = 0;
@@ -207,33 +213,33 @@ echo_create_object (struct obd_device *obd, int on_target, struct obdo *oa,
                 lsm->lsm_oinfo[i].loi_ost_idx =
                         (lsm->lsm_stripe_offset + i) % ec->ec_nstripes;
         }
-        
+
         if (on_target) {
                 rc = obd_create (&ec->ec_conn, oa, &lsm, NULL);
                 if (rc != 0)
                         goto failed;
-                
+
                 /* See what object ID we were given */
                 LASSERT ((oa->o_valid & OBD_MD_FLID) != 0);
                 eco->eco_id = lsm->lsm_object_id = oa->o_id;
         }
-        
+
         spin_lock (&ec->ec_lock);
 
         eco2 = echo_find_object_locked (obd, oa->o_id);
         if (eco2 != NULL) {                     /* conflict */
                 spin_unlock (&ec->ec_lock);
-                
-                CERROR ("Can't create object id "LPX64": id already exists%s\n", 
+
+                CERROR ("Can't create object id "LPX64": id already exists%s\n",
                         oa->o_id, on_target ? " (undoing create)" : "");
-                
+
                 if (on_target)
                         obd_destroy (&ec->ec_conn, oa, lsm, NULL);
-                
+
                 rc = -EEXIST;
                 goto failed;
         }
-        
+
         list_add (&eco->eco_obj_chain, &ec->ec_objects);
         spin_unlock (&ec->ec_lock);
         CDEBUG (D_INFO,
@@ -251,25 +257,26 @@ echo_create_object (struct obd_device *obd, int on_target, struct obdo *oa,
 }
 
 static int
-echo_get_object (struct ec_object **ecop, struct obd_device *obd, struct obdo *oa)
+echo_get_object (struct ec_object **ecop, struct obd_device *obd,
+                 struct obdo *oa)
 {
         struct echo_client_obd *ec = &obd->u.echo_client;
         struct ec_object       *eco;
         struct ec_object       *eco2;
         int                     rc;
 
-        if ((oa->o_valid & OBD_MD_FLID) == 0) 
+        if ((oa->o_valid & OBD_MD_FLID) == 0)
         {
                 CERROR ("No valid oid\n");
                 return (-EINVAL);
         }
-        
+
         spin_lock (&ec->ec_lock);
         eco = echo_find_object_locked (obd, oa->o_id);
         if (eco != NULL) {
                 if (eco->eco_deleted)           /* being deleted */
                         return (-EAGAIN);       /* (see comment in cleanup) */
-                
+
                 eco->eco_refcount++;
                 spin_unlock (&ec->ec_lock);
                 *ecop = eco;
@@ -328,7 +335,7 @@ echo_get_object (struct ec_object **ecop, struct obd_device *obd, struct obdo *o
         }
 
         spin_unlock (&ec->ec_lock);
-        
+
         echo_free_object (eco);
         return (rc);
 }
@@ -361,12 +368,12 @@ echo_put_object (struct ec_object *eco)
          * sure there will be no more lock callbacks.
          */
         obd_cancel_unused (&ec->ec_conn, eco->eco_lsm, 0);
-        
+
         /* now we can let it go */
         spin_lock (&ec->ec_lock);
         list_del (&eco->eco_obj_chain);
         spin_unlock (&ec->ec_lock);
-        
+
         LASSERT (eco->eco_refcount == 0);
 
         echo_free_object (eco);
@@ -501,7 +508,7 @@ echo_client_kbrw (struct obd_device *obd, int rw,
                                                stripe_off, stripe_id);
                         if (vrc != 0 && rc == 0)
                                 rc = vrc;
-                        
+
                         kunmap(pgp->pg);
                 }
                 __free_pages(pgp->pg, 0);
@@ -512,10 +519,11 @@ echo_client_kbrw (struct obd_device *obd, int rw,
         return (rc);
 }
 
-static int
-echo_client_ubrw (struct obd_device *obd, int rw,
-                  struct obdo *oa, struct lov_stripe_md *lsm,
-                  obd_off offset, obd_size count, char *buffer)
+#ifdef __KERNEL__
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+static int echo_client_ubrw(struct obd_device *obd, int rw,
+                            struct obdo *oa, struct lov_stripe_md *lsm,
+                            obd_off offset, obd_size count, char *buffer)
 {
         struct echo_client_obd *ec = &obd->u.echo_client;
         struct obd_brw_set     *set;
@@ -589,6 +597,16 @@ echo_client_ubrw (struct obd_device *obd, int rw,
         obd_brw_set_free(set);
         return (rc);
 }
+#else
+static int echo_client_ubrw(struct obd_device *obd, int rw,
+                            struct obdo *oa, struct lov_stripe_md *lsm,
+                            obd_off offset, obd_size count, char *buffer)
+{
+        LBUG();
+        return 0;
+}
+#endif
+#endif
 
 static int
 echo_open (struct obd_export *exp, struct obdo *oa)
@@ -599,11 +617,11 @@ echo_open (struct obd_export *exp, struct obdo *oa)
         struct ec_open_object  *ecoo;
         struct ec_object       *eco;
         int                     rc;
-        
+
         rc = echo_get_object (&eco, obd, oa);
         if (rc != 0)
                 return (rc);
-        
+
         rc = -ENOMEM;
         OBD_ALLOC (ecoo, sizeof (*ecoo));
         if (ecoo == NULL)
@@ -612,22 +630,21 @@ echo_open (struct obd_export *exp, struct obdo *oa)
         rc = obd_open (&ec->ec_conn, oa, eco->eco_lsm, NULL);
         if (rc != 0)
                 goto failed_1;
-        
+
         memcpy (&ecoo->ecoo_oa, oa, sizeof (*oa));
         ecoo->ecoo_object = eco;
         /* ecoo takes ref from echo_get_object() above */
 
         spin_lock (&ec->ec_lock);
 
-        list_add (&ecoo->ecoo_exp_chain,
-                  &exp->exp_ec_data.eced_open_head);
-        
+        list_add (&ecoo->ecoo_exp_chain, &exp->exp_ec_data.eced_open_head);
+
         ufh->addr = (__u64)((long) ecoo);
         ufh->cookie = ecoo->ecoo_cookie = ec->ec_unique++;
-        
+
         spin_unlock (&ec->ec_lock);
         return (0);
-        
+
  failed_1:
         OBD_FREE (ecoo, sizeof (*ecoo));
  failed_0:
@@ -645,10 +662,10 @@ echo_close (struct obd_export *exp, struct obdo *oa)
         int                     found = 0;
         struct list_head       *el;
         int                     rc;
-        
+
         if ((oa->o_valid & OBD_MD_FLHANDLE) == 0)
                 return (-EINVAL);
-        
+
         spin_lock (&ec->ec_lock);
 
         list_for_each (el, &exp->exp_ec_data.eced_open_head) {
@@ -662,13 +679,13 @@ echo_close (struct obd_export *exp, struct obdo *oa)
         }
 
         spin_unlock (&ec->ec_lock);
-        
+
         if (!found)
                 return (-EINVAL);
 
         rc = obd_close (&ec->ec_conn, &ecoo->ecoo_oa,
                         ecoo->ecoo_object->eco_lsm, NULL);
-        
+
         echo_put_object (ecoo->ecoo_object);
         OBD_FREE (ecoo, sizeof (*ecoo));
 
@@ -691,25 +708,26 @@ echo_ldlm_callback (struct ldlm_lock *lock, struct ldlm_lock_desc *new,
         /* #ifdef this out if we're not feeling paranoid */
         spin_lock (&ec->ec_lock);
         list_for_each (el, &ec->ec_objects) {
-                found = (eco == list_entry (el, struct ec_object, eco_obj_chain));
+                found = (eco == list_entry(el, struct ec_object,
+                                           eco_obj_chain));
                 if (found)
                         break;
         }
         spin_unlock (&ec->ec_lock);
         LASSERT (found);
-        
+
         switch (flag) {
         case LDLM_CB_BLOCKING:
-                CDEBUG (D_INFO, "blocking callback on "LPX64", handle "LPX64"."LPX64"\n", 
-                        eco->eco_id, lockh.addr, lockh.cookie);
+                CDEBUG (D_INFO, "blocking callback on "LPX64", handle "LPX64"."
+                        LPX64"\n", eco->eco_id, lockh.addr, lockh.cookie);
                 rc = ldlm_cli_cancel (&lockh);
                 if (rc != ELDLM_OK)
                         CERROR ("ldlm_cli_cancel failed: %d\n", rc);
                 break;
 
         case LDLM_CB_CANCELING:
-                CDEBUG (D_INFO, "canceling callback on "LPX64", handle "LPX64"."LPX64"\n", 
-                        eco->eco_id, lockh.addr, lockh.cookie);
+                CDEBUG (D_INFO, "canceling callback on "LPX64", handle "LPX64"."
+                        LPX64"\n", eco->eco_id, lockh.addr, lockh.cookie);
                 break;
 
         default:
@@ -751,11 +769,11 @@ echo_enqueue (struct obd_export *exp, struct obdo *oa,
         ecl->ecl_object = eco;
         ecl->ecl_extent.start = offset;
         ecl->ecl_extent.end = (nob == 0) ? ((obd_off)-1) : (offset + nob - 1);
-        
+
         flags = 0;
-        rc = obd_enqueue (&ec->ec_conn, eco->eco_lsm, NULL,
-                          LDLM_EXTENT, &ecl->ecl_extent, sizeof (ecl->ecl_extent),
-                          mode, &flags, echo_ldlm_callback, eco, sizeof (*eco),
+        rc = obd_enqueue (&ec->ec_conn, eco->eco_lsm, NULL, LDLM_EXTENT,
+                          &ecl->ecl_extent,sizeof(ecl->ecl_extent), mode,
+                          &flags, echo_ldlm_callback, eco, sizeof (*eco),
                           &ecl->ecl_handle);
         if (rc != 0)
                 goto failed_1;
@@ -771,7 +789,7 @@ echo_enqueue (struct obd_export *exp, struct obdo *oa,
 
         ulh->addr = (__u64)((long)ecl);
         ulh->cookie = ecl->ecl_cookie = ec->ec_unique++;
-        
+
         spin_unlock (&ec->ec_lock);
 
         oa->o_valid |= OBD_MD_FLHANDLE;
@@ -797,12 +815,12 @@ echo_cancel (struct obd_export *exp, struct obdo *oa)
 
         if ((oa->o_valid & OBD_MD_FLHANDLE) == 0)
                 return (-EINVAL);
-        
+
         spin_lock (&ec->ec_lock);
-        
+
         list_for_each (el, &exp->exp_ec_data.eced_locks) {
                 ecl = list_entry (el, struct ec_lock, ecl_exp_chain);
-                
+
                 if ((__u64)((long)ecl) == ulh->addr) {
                         found = (ecl->ecl_cookie == ulh->cookie);
                         if (found)
@@ -810,20 +828,20 @@ echo_cancel (struct obd_export *exp, struct obdo *oa)
                         break;
                 }
         }
-        
+
         spin_unlock (&ec->ec_lock);
-        
+
         if (!found)
                 return (-ENOENT);
-        
-        rc = obd_cancel (&ec->ec_conn, 
+
+        rc = obd_cancel (&ec->ec_conn,
                          ecl->ecl_object->eco_lsm,
                          ecl->ecl_mode,
                          &ecl->ecl_handle);
-        
+
         echo_put_object (ecl->ecl_object);
         OBD_FREE (ecl, sizeof (*ecl));
-        
+
         return (rc);
 }
 
@@ -851,7 +869,7 @@ static int echo_iocontrol(unsigned int cmd, struct lustre_handle *obdconn,
         case OBD_IOC_CREATE:                    /* may create echo object */
                 if (!capable (CAP_SYS_ADMIN))
                         GOTO (out, rc = -EPERM);
-                
+
                 rc = echo_create_object (obd, 1, &data->ioc_obdo1,
                                          data->ioc_pbuf1, data->ioc_plen1);
                 GOTO(out, rc);
@@ -859,7 +877,7 @@ static int echo_iocontrol(unsigned int cmd, struct lustre_handle *obdconn,
         case OBD_IOC_DESTROY:
                 if (!capable (CAP_SYS_ADMIN))
                         GOTO (out, rc = -EPERM);
-       
+
                 rc = echo_get_object (&eco, obd, &data->ioc_obdo1);
                 if (rc == 0) {
                         rc = obd_destroy(&ec->ec_conn, &data->ioc_obdo1,
@@ -882,7 +900,7 @@ static int echo_iocontrol(unsigned int cmd, struct lustre_handle *obdconn,
         case OBD_IOC_SETATTR:
                 if (!capable (CAP_SYS_ADMIN))
                         GOTO (out, rc = -EPERM);
-       
+
                 rc = echo_get_object (&eco, obd, &data->ioc_obdo1);
                 if (rc == 0) {
                         rc = obd_setattr(&ec->ec_conn, &data->ioc_obdo1,
@@ -902,7 +920,7 @@ static int echo_iocontrol(unsigned int cmd, struct lustre_handle *obdconn,
         case OBD_IOC_BRW_WRITE:
                 if (!capable (CAP_SYS_ADMIN))
                         GOTO (out, rc = -EPERM);
-       
+
                 rw = OBD_BRW_WRITE;
                 /* fall through */
         case OBD_IOC_BRW_READ:
@@ -914,11 +932,13 @@ static int echo_iocontrol(unsigned int cmd, struct lustre_handle *obdconn,
                                                       data->ioc_offset,
                                                       data->ioc_count);
                         else
+#ifdef __KERNEL__
                                 rc = echo_client_ubrw(obd, rw, &data->ioc_obdo1,
                                                       eco->eco_lsm,
                                                       data->ioc_offset,
                                                       data->ioc_count,
                                                       data->ioc_pbuf2);
+#endif
                         echo_put_object(eco);
                 }
                 GOTO(out, rc);
@@ -944,17 +964,18 @@ static int echo_iocontrol(unsigned int cmd, struct lustre_handle *obdconn,
                         }
                 } else {
                         rc = echo_create_object(obd, 0, &data->ioc_obdo1,
-                                                data->ioc_pbuf1, data->ioc_plen1);
+                                                data->ioc_pbuf1,
+                                                data->ioc_plen1);
                 }
                 GOTO (out, rc);
 
         case ECHO_IOC_ENQUEUE:
                 if (!capable (CAP_SYS_ADMIN))
                         GOTO (out, rc = -EPERM);
-       
-                rc = echo_enqueue (exp, &data->ioc_obdo1, 
+
+                rc = echo_enqueue (exp, &data->ioc_obdo1,
                                    data->ioc_conn1, /* lock mode */
-                                   data->ioc_offset, data->ioc_count); /* extent */
+                                   data->ioc_offset, data->ioc_count);/*extent*/
                 GOTO (out, rc);
 
         case ECHO_IOC_CANCEL:
@@ -1090,13 +1111,13 @@ static int echo_disconnect(struct lustre_handle *conn)
                 ecl = list_entry (exp->exp_ec_data.eced_locks.next,
                                   struct ec_lock, ecl_exp_chain);
                 list_del (&ecl->ecl_exp_chain);
-                
+
                 rc = obd_cancel (&ec->ec_conn, ecl->ecl_object->eco_lsm,
                                  ecl->ecl_mode, &ecl->ecl_handle);
 
                 CERROR ("Cancel lock on object "LPX64" on disconnect (%d)\n",
                         ecl->ecl_object->eco_id, rc);
-                
+
                 echo_put_object (ecl->ecl_object);
                 OBD_FREE (ecl, sizeof (*ecl));
         }
index bb2870a..6a16001 100644 (file)
@@ -33,9 +33,10 @@ int rd_fstype(char* page, char **start, off_t off, int count, int *eof,
               void *data)
 {
         struct obd_device* dev = (struct obd_device*)data;
-        int rc = snprintf(page, count, "%s\n", dev->u.echo.eo_fstype);
+        
+        LASSERT(dev != NULL);
         *eof = 1;
-        return rc;
+        return snprintf(page, count, "%s\n", dev->u.echo.eo_fstype);
 }
 
 struct lprocfs_vars lprocfs_obd_vars[] = {
index c6658d6..7e17804 100644 (file)
@@ -3,7 +3,7 @@
 # This code is issued under the GNU General Public License.
 # See the file COPYING in this distribution
 
-DEFS
+DEFS = $(ENABLE_OST_RECOVERY)
 MODULE = obdfilter
 modulefs_DATA = obdfilter.o
 EXTRA_PROGRAMS = obdfilter
index 591005e..8486c22 100644 (file)
 #include <linux/random.h>
 #include <linux/lustre_fsfilt.h>
 #include <linux/lprocfs_status.h>
-
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+#include <linux/mount.h>
+#endif
 
 static kmem_cache_t *filter_open_cache;
 static kmem_cache_t *filter_dentry_cache;
@@ -64,6 +67,7 @@ struct xprocfs_io_stat {
         __u64    st_create_reqs;
         __u64    st_destroy_reqs;
         __u64    st_statfs_reqs;
+        __u64    st_syncfs_reqs;
         __u64    st_open_reqs;
         __u64    st_close_reqs;
         __u64    st_punch_reqs;
@@ -77,6 +81,7 @@ do {                                                            \
         xprocfs_iostats[smp_processor_id()].field += (count);   \
 } while (0)
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 #define DECLARE_XPROCFS_SUM_STAT(field)                 \
 static long long                                        \
 xprocfs_sum_##field (void)                              \
@@ -98,9 +103,11 @@ DECLARE_XPROCFS_SUM_STAT (st_setattr_reqs)
 DECLARE_XPROCFS_SUM_STAT (st_create_reqs)
 DECLARE_XPROCFS_SUM_STAT (st_destroy_reqs)
 DECLARE_XPROCFS_SUM_STAT (st_statfs_reqs)
+DECLARE_XPROCFS_SUM_STAT (st_syncfs_reqs)
 DECLARE_XPROCFS_SUM_STAT (st_open_reqs)
 DECLARE_XPROCFS_SUM_STAT (st_close_reqs)
 DECLARE_XPROCFS_SUM_STAT (st_punch_reqs)
+#endif
 
 static int
 xprocfs_rd_stat (char *page, char **start, off_t off, int count,
@@ -148,6 +155,7 @@ xprocfs_init (char *name)
                 return;
         }
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         xprocfs_add_stat ("read_bytes",   xprocfs_sum_st_read_bytes);
         xprocfs_add_stat ("read_reqs",    xprocfs_sum_st_read_reqs);
         xprocfs_add_stat ("write_bytes",  xprocfs_sum_st_write_bytes);
@@ -157,9 +165,11 @@ xprocfs_init (char *name)
         xprocfs_add_stat ("create_reqs",  xprocfs_sum_st_create_reqs);
         xprocfs_add_stat ("destroy_reqs", xprocfs_sum_st_destroy_reqs);
         xprocfs_add_stat ("statfs_reqs",  xprocfs_sum_st_statfs_reqs);
+        xprocfs_add_stat ("syncfs_reqs",  xprocfs_sum_st_syncfs_reqs);
         xprocfs_add_stat ("open_reqs",    xprocfs_sum_st_open_reqs);
         xprocfs_add_stat ("close_reqs",   xprocfs_sum_st_close_reqs);
         xprocfs_add_stat ("punch_reqs",   xprocfs_sum_st_punch_reqs);
+#endif
 }
 
 void xprocfs_fini (void)
@@ -176,6 +186,7 @@ void xprocfs_fini (void)
         remove_proc_entry ("create_reqs",  xprocfs_dir);
         remove_proc_entry ("destroy_reqs", xprocfs_dir);
         remove_proc_entry ("statfs_reqs",  xprocfs_dir);
+        remove_proc_entry ("syncfs_reqs",  xprocfs_dir);
         remove_proc_entry ("open_reqs",    xprocfs_dir);
         remove_proc_entry ("close_reqs",   xprocfs_dir);
         remove_proc_entry ("punch_reqs",   xprocfs_dir);
@@ -212,10 +223,12 @@ static void filter_last_rcvd_cb(struct obd_device *obd, __u64 last_rcvd,
 
 void filter_start_transno(struct obd_export *export)
 {
+#ifdef FILTER_TRANSNO_SEM      /* when undefined this function is a no-op */
         struct obd_device * obd = export->exp_obd;
         ENTRY;
 
         down(&obd->u.filter.fo_transno_sem);
+#endif /* FILTER_TRANSNO_SEM; matching up() is in filter_finish_transno() */
 }
 
 /* Assumes caller has already pushed us into the kernel context. */
@@ -231,8 +244,16 @@ int filter_finish_transno(struct obd_export *export, void *handle,
         ssize_t written;
 
         /* Propagate error code. */
-        if (rc)
-                GOTO(out, rc);
+        if (rc) {
+#ifdef FILTER_TRANSNO_SEM
+                up(&filter->fo_transno_sem);
+#endif
+                RETURN(rc);
+        }
+
+        if (!(obd->obd_flags & OBD_REPLAYABLE)) {
+                RETURN(0);
+        }
 
         /* we don't allocate new transnos for replayed requests */
 #if 0
@@ -241,13 +262,20 @@ int filter_finish_transno(struct obd_export *export, void *handle,
                 GOTO(out, rc = 0);
 #endif
 
-        off = FILTER_LR_CLIENT_START + fed->fed_lr_off * FILTER_LR_CLIENT_SIZE;
+        off = fed->fed_lr_off;
 
-        last_rcvd = ++filter->fo_fsd->fsd_last_rcvd;
+#ifndef FILTER_TRANSNO_SEM
+        spin_lock(&filter->fo_translock);
+#endif
+        last_rcvd = le64_to_cpu(filter->fo_fsd->fsd_last_rcvd);
+        filter->fo_fsd->fsd_last_rcvd = cpu_to_le64(last_rcvd + 1);
+#ifndef FILTER_TRANSNO_SEM
+        spin_unlock(&filter->fo_translock);
+#endif
         if (oti)
                 oti->oti_transno = last_rcvd;
         fcd->fcd_last_rcvd = cpu_to_le64(last_rcvd);
-        fcd->fcd_mount_count = cpu_to_le64(filter->fo_fsd->fsd_mount_count);
+        fcd->fcd_mount_count = filter->fo_fsd->fsd_mount_count;
 
         /* get this from oti */
 #if 0
@@ -261,27 +289,31 @@ int filter_finish_transno(struct obd_export *export, void *handle,
         written = lustre_fwrite(filter->fo_rcvd_filp, (char *)fcd, sizeof(*fcd),
                                 &off);
         CDEBUG(D_INODE, "wrote trans #"LPD64" for client %s at #%d: written = "
-               LPSZ"\n", last_rcvd, fcd->fcd_uuid, fed->fed_lr_off, written);
+               LPSZ"\n", last_rcvd, fcd->fcd_uuid, fed->fed_lr_idx, written);
 
+#ifdef FILTER_TRANSNO_SEM
+        up(&filter->fo_transno_sem);
+#endif
         if (written == sizeof(*fcd))
-                GOTO(out, rc = 0);
-        CERROR("error writing to last_rcvd file: rc = %d\n", rc);
+                RETURN(0);
+        CERROR("error writing to last_rcvd file: rc = %d\n", written);
         if (written >= 0)
-                GOTO(out, rc = -EIO);
-
-        rc = 0;
-
-        EXIT;
- out:
+                RETURN(-EIO);
 
-        up(&filter->fo_transno_sem);
-        return rc;
+        RETURN(written);
 }
 
 /* write the pathname into the string */
-static int filter_id(char *buf, obd_id id, obd_mode mode)
+static char *filter_id(char *buf, struct filter_obd *filter, obd_id id,
+                     obd_mode mode)   /* buf: caller-supplied output */
 {
-        return sprintf(buf, "O/%s/"LPU64, obd_mode_to_type(mode), id);
+        if ((mode & S_IFMT) != S_IFREG || filter->fo_subdir_count == 0)
+                sprintf(buf, "O/%s/"LPU64, obd_mode_to_type(mode), id);
+        else    /* regular file and subdirs in use: add a d<N> component */
+                sprintf(buf, "O/%s/d%d/"LPU64, obd_mode_to_type(mode),
+                       (int)id & (filter->fo_subdir_count - 1), id);
+        /* NOTE(review): mask presumes fo_subdir_count is 2^n -- confirm */
+        return buf;     /* same pointer the caller passed in */
 }
 
 static inline void f_dput(struct dentry *dentry)
@@ -312,56 +344,60 @@ struct dentry_operations filter_dops = {
 #define FILTER_LR_MAX_CLIENTS (PAGE_SIZE * 8)
 #define FILTER_LR_MAX_CLIENT_WORDS (FILTER_LR_MAX_CLIENTS/sizeof(unsigned long))
 
-static unsigned long filter_last_rcvd_slots[FILTER_LR_MAX_CLIENT_WORDS];
-
 /* Add client data to the FILTER.  We use a bitmap to locate a free space
- * in the last_rcvd file if cl_off is -1 (i.e. a new client).
+ * in the last_rcvd file if cl_idx is -1 (i.e. a new client).
  * Otherwise, we have just read the data from the last_rcvd file and
  * we know its offset.
  */
 int filter_client_add(struct filter_obd *filter,
-                      struct filter_export_data *fed, int cl_off)
+                      struct filter_export_data *fed, int cl_idx)
 {
-        int new_client = (cl_off == -1);
+        int new_client = (cl_idx == -1);
+
+        LASSERT(filter->fo_last_rcvd_slots != NULL);
 
-        /* the bitmap operations can handle cl_off > sizeof(long) * 8, so
+        /* the bitmap operations can handle cl_idx > sizeof(long) * 8, so
          * there's no need for extra complication here
          */
         if (new_client) {
-                cl_off = find_first_zero_bit(filter_last_rcvd_slots,
+                cl_idx = find_first_zero_bit(filter->fo_last_rcvd_slots,
                                              FILTER_LR_MAX_CLIENTS);
         repeat:
-                if (cl_off >= FILTER_LR_MAX_CLIENTS) {
+                if (cl_idx >= FILTER_LR_MAX_CLIENTS) {
                         CERROR("no client slots - fix FILTER_LR_MAX_CLIENTS\n");
                         return -ENOMEM;
                 }
-                if (test_and_set_bit(cl_off, filter_last_rcvd_slots)) {
+                if (test_and_set_bit(cl_idx, filter->fo_last_rcvd_slots)) {
                         CERROR("FILTER client %d: found bit is set in bitmap\n",
-                               cl_off);
-                        cl_off = find_next_zero_bit(filter_last_rcvd_slots,
+                               cl_idx);
+                        cl_idx = find_next_zero_bit(filter->fo_last_rcvd_slots,
                                                     FILTER_LR_MAX_CLIENTS,
-                                                    cl_off);
+                                                    cl_idx);
                         goto repeat;
                 }
         } else {
-                if (test_and_set_bit(cl_off, filter_last_rcvd_slots)) {
+                if (test_and_set_bit(cl_idx, filter->fo_last_rcvd_slots)) {
                         CERROR("FILTER client %d: bit already set in bitmap!\n",
-                               cl_off);
+                               cl_idx);
                         LBUG();
                 }
         }
 
-        CDEBUG(D_INFO, "client at offset %d with UUID '%s' added\n",
-               cl_off, fed->fed_fcd->fcd_uuid);
+        fed->fed_lr_idx = cl_idx;
+        fed->fed_lr_off = le32_to_cpu(filter->fo_fsd->fsd_client_start) +
+                cl_idx * le16_to_cpu(filter->fo_fsd->fsd_client_size);
 
-        fed->fed_lr_off = cl_off;
+        CDEBUG(D_INFO, "client at index %d (%llu) with UUID '%s' added\n",
+               fed->fed_lr_idx, fed->fed_lr_off, fed->fed_fcd->fcd_uuid);
 
         if (new_client) {
                 struct obd_run_ctxt saved;
-                loff_t off = FILTER_LR_CLIENT_START +
-                        (cl_off * FILTER_LR_CLIENT_SIZE);
+                loff_t off = fed->fed_lr_off;
                 ssize_t written;
 
+                CDEBUG(D_INFO, "writing client fcd at idx %u (%llu) (len %u)\n",
+                       fed->fed_lr_idx,off,(unsigned int)sizeof(*fed->fed_fcd));
+
                 push_ctxt(&saved, &filter->fo_ctxt, NULL);
                 written = lustre_fwrite(filter->fo_rcvd_filp,
                                                 (char *)fed->fed_fcd,
@@ -373,9 +409,6 @@ int filter_client_add(struct filter_obd *filter,
                                 RETURN(written);
                         RETURN(-EIO);
                 }
-                CDEBUG(D_INFO, "wrote client fcd at off %u (len %u)\n",
-                       FILTER_LR_CLIENT_START + (cl_off*FILTER_LR_CLIENT_SIZE),
-                       (unsigned int)sizeof(*fed->fed_fcd));
         }
         return 0;
 }
@@ -392,14 +425,16 @@ int filter_client_free(struct obd_export *exp)
         if (!fed->fed_fcd)
                 RETURN(0);
 
-        off = FILTER_LR_CLIENT_START + (fed->fed_lr_off*FILTER_LR_CLIENT_SIZE);
+        LASSERT(filter->fo_last_rcvd_slots != NULL);
+
+        off = fed->fed_lr_off;
 
-        CDEBUG(D_INFO, "freeing client at offset %u (%lld)with UUID '%s'\n",
-               fed->fed_lr_off, off, fed->fed_fcd->fcd_uuid);
+        CDEBUG(D_INFO, "freeing client at idx %u (%lld) with UUID '%s'\n",
+               fed->fed_lr_idx, fed->fed_lr_off, fed->fed_fcd->fcd_uuid);
 
-        if (!test_and_clear_bit(fed->fed_lr_off, filter_last_rcvd_slots)) {
+        if (!test_and_clear_bit(fed->fed_lr_idx, filter->fo_last_rcvd_slots)) {
                 CERROR("FILTER client %u: bit already clear in bitmap!!\n",
-                       fed->fed_lr_off);
+                       fed->fed_lr_idx);
                 LBUG();
         }
 
@@ -409,21 +444,17 @@ int filter_client_free(struct obd_export *exp)
                                 sizeof(zero_fcd), &off);
 
         /* XXX: this write gets lost sometimes, unless this sync is here. */
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        fsync_dev(filter->fo_rcvd_filp->f_dentry->d_inode->i_rdev);
-#else
-        file_fsync(filter->fo_rcvd_filp,  filter->fo_rcvd_filp->f_dentry, 1);
-#endif
+        file_fsync(filter->fo_rcvd_filp, filter->fo_rcvd_filp->f_dentry, 1);
         pop_ctxt(&saved, &filter->fo_ctxt, NULL);
 
         if (written != sizeof(zero_fcd)) {
-                CERROR("error zeroing out client %s off %d in %s: %d\n",
-                       fed->fed_fcd->fcd_uuid, fed->fed_lr_off, LAST_RCVD,
-                       written);
+                CERROR("error zeroing out client %s idx %u (%llu) in %s: %d\n",
+                       fed->fed_fcd->fcd_uuid, fed->fed_lr_idx, fed->fed_lr_off,
+                       LAST_RCVD, written);
         } else {
                 CDEBUG(D_INFO,
-                       "zeroed disconnecting client %s at off %d ("LPX64")\n",
-                       fed->fed_fcd->fcd_uuid, fed->fed_lr_off, off);
+                       "zeroed disconnecting client %s at idx %u (%llu)\n",
+                       fed->fed_fcd->fcd_uuid, fed->fed_lr_idx,fed->fed_lr_off);
         }
 
         OBD_FREE(fed->fed_fcd, sizeof(*fed->fed_fcd));
@@ -431,49 +462,34 @@ int filter_client_free(struct obd_export *exp)
         return 0;
 }
 
-static void filter_unpack_fsd(struct filter_server_data *fsd)
-{
-        fsd->fsd_last_objid = le64_to_cpu(fsd->fsd_last_objid);
-        fsd->fsd_last_rcvd = le64_to_cpu(fsd->fsd_last_rcvd);
-        fsd->fsd_mount_count = le64_to_cpu(fsd->fsd_mount_count);
-}
-
-static void filter_pack_fsd(struct filter_server_data *disk_fsd,
-                            struct filter_server_data *fsd)
-{
-        memset(disk_fsd, 0, sizeof(*disk_fsd));
-        memcpy(disk_fsd->fsd_uuid, fsd->fsd_uuid, sizeof(fsd->fsd_uuid));
-        disk_fsd->fsd_last_objid = cpu_to_le64(fsd->fsd_last_objid);
-        disk_fsd->fsd_last_rcvd = cpu_to_le64(fsd->fsd_last_rcvd);
-        disk_fsd->fsd_mount_count = cpu_to_le64(fsd->fsd_mount_count);
-}
-
 static int filter_free_server_data(struct filter_obd *filter)
 {
         OBD_FREE(filter->fo_fsd, sizeof(*filter->fo_fsd));
         filter->fo_fsd = NULL;
-
+        OBD_FREE(filter->fo_last_rcvd_slots,   /* client-slot bitmap */
+                 FILTER_LR_MAX_CLIENT_WORDS * sizeof(unsigned long));
+        filter->fo_last_rcvd_slots = NULL;     /* size matches the OBD_ALLOC */
         return 0;
 }
 
 
-/* assumes caller has already in kernel ctxt */
+/* assumes caller is already in kernel ctxt */
 static int filter_update_server_data(struct file *filp,
                                      struct filter_server_data *fsd)
 {
-        struct filter_server_data disk_fsd;
         loff_t off = 0;
         int rc;
 
         CDEBUG(D_INODE, "server uuid      : %s\n", fsd->fsd_uuid);
-        CDEBUG(D_INODE, "server last_objid: "LPU64"\n", fsd->fsd_last_objid);
-        CDEBUG(D_INODE, "server last_rcvd : "LPU64"\n", fsd->fsd_last_rcvd);
-        CDEBUG(D_INODE, "server last_mount: "LPU64"\n", fsd->fsd_mount_count);
-
-        filter_pack_fsd(&disk_fsd, fsd);
-        rc = lustre_fwrite(filp, (char *)&disk_fsd,
-                           sizeof(disk_fsd), &off);
-        if (rc != sizeof(disk_fsd)) {
+        CDEBUG(D_INODE, "server last_objid: "LPU64"\n",
+               le64_to_cpu(fsd->fsd_last_objid));
+        CDEBUG(D_INODE, "server last_rcvd : "LPU64"\n",
+               le64_to_cpu(fsd->fsd_last_rcvd));
+        CDEBUG(D_INODE, "server last_mount: "LPU64"\n",
+               le64_to_cpu(fsd->fsd_mount_count));
+
+        rc = lustre_fwrite(filp, (char *)fsd, sizeof(*fsd), &off);
+        if (rc != sizeof(*fsd)) {
                 CDEBUG(D_INODE, "error writing filter_server_data: rc = %d\n",
                        rc);
                 RETURN(-EIO);
@@ -482,8 +498,7 @@ static int filter_update_server_data(struct file *filp,
 }
 
 /* assumes caller has already in kernel ctxt */
-static int filter_init_server_data(struct obd_device *obd,
-                                   struct file * filp,
+static int filter_init_server_data(struct obd_device *obd, struct file * filp,
                                    __u64 init_lastobjid)
 {
         struct filter_obd *filter = &obd->u.filter;
@@ -491,7 +506,8 @@ static int filter_init_server_data(struct obd_device *obd,
         struct filter_client_data *fcd = NULL;
         struct inode *inode = filp->f_dentry->d_inode;
         unsigned long last_rcvd_size = inode->i_size;
-        int cl_off;
+        __u64 mount_count;
+        int cl_idx;
         loff_t off = 0;
         int rc;
 
@@ -506,125 +522,161 @@ static int filter_init_server_data(struct obd_device *obd,
                 RETURN(-ENOMEM);
         filter->fo_fsd = fsd;
 
+        OBD_ALLOC(filter->fo_last_rcvd_slots, 
+                  FILTER_LR_MAX_CLIENT_WORDS * sizeof(unsigned long));
+        if (filter->fo_last_rcvd_slots == NULL) {
+                OBD_FREE(fsd, sizeof(*fsd));
+                RETURN(-ENOMEM);
+        }
+
         if (last_rcvd_size == 0) {
                 CERROR("%s: initializing new last_rcvd\n", obd->obd_name);
 
                 memcpy(fsd->fsd_uuid, obd->obd_uuid.uuid,sizeof(fsd->fsd_uuid));
-                fsd->fsd_last_objid = init_lastobjid;
+                fsd->fsd_last_objid = cpu_to_le64(init_lastobjid);
                 fsd->fsd_last_rcvd = 0;
-                fsd->fsd_mount_count = 0;
-
+                mount_count = fsd->fsd_mount_count = 0;
+                fsd->fsd_server_size = cpu_to_le32(FILTER_LR_SERVER_SIZE);
+                fsd->fsd_client_start = cpu_to_le32(FILTER_LR_CLIENT_START);
+                fsd->fsd_client_size = cpu_to_le16(FILTER_LR_CLIENT_SIZE);
+                fsd->fsd_subdir_count = cpu_to_le16(FILTER_SUBDIR_COUNT);
+                filter->fo_subdir_count = FILTER_SUBDIR_COUNT;
         } else {
-                ssize_t  retval = lustre_fread(filp, (char *)fsd,
-                                              sizeof(*fsd),
+                ssize_t retval = lustre_fread(filp, (char *)fsd, sizeof(*fsd),
                                               &off);
                 if (retval != sizeof(*fsd)) {
                         CDEBUG(D_INODE,"OBD filter: error reading lastobjid\n");
                         GOTO(out, rc = -EIO);
                 }
-                filter_unpack_fsd(fsd);
+                mount_count = le64_to_cpu(fsd->fsd_mount_count);
+                filter->fo_subdir_count = le16_to_cpu(fsd->fsd_subdir_count);
+        }
+
+        if (fsd->fsd_feature_incompat) {
+                CERROR("unsupported feature %x\n",
+                       le32_to_cpu(fsd->fsd_feature_incompat));
+                RETURN(-EINVAL);
+        }
+        if (fsd->fsd_feature_rocompat) {
+                CERROR("read-only feature %x\n",
+                       le32_to_cpu(fsd->fsd_feature_rocompat));
+                /* Do something like remount filesystem read-only */
+                RETURN(-EINVAL);
         }
 
         CDEBUG(D_INODE, "%s: server last_objid: "LPU64"\n",
-               obd->obd_name, fsd->fsd_last_objid);
+               obd->obd_name, le64_to_cpu(fsd->fsd_last_objid));
         CDEBUG(D_INODE, "%s: server last_rcvd : "LPU64"\n",
-               obd->obd_name, fsd->fsd_last_rcvd);
+               obd->obd_name, le64_to_cpu(fsd->fsd_last_rcvd));
         CDEBUG(D_INODE, "%s: server last_mount: "LPU64"\n",
-               obd->obd_name, fsd->fsd_mount_count);
+               obd->obd_name, mount_count);
+        CDEBUG(D_INODE, "%s: server data size: %u\n",
+               obd->obd_name, le32_to_cpu(fsd->fsd_server_size));
+        CDEBUG(D_INODE, "%s: per-client data start: %u\n",
+               obd->obd_name, le32_to_cpu(fsd->fsd_client_start));
+        CDEBUG(D_INODE, "%s: per-client data size: %u\n",
+               obd->obd_name, le32_to_cpu(fsd->fsd_client_size));
+        CDEBUG(D_INODE, "%s: server subdir_count: %u\n",
+               obd->obd_name, le16_to_cpu(fsd->fsd_subdir_count));
 
         /*
          * When we do a clean FILTER shutdown, we save the last_rcvd into
          * the header.  If we find clients with higher last_rcvd values
          * then those clients may need recovery done.
          */
-        /* off is adjusted by lustre_fread, so we don't adjust it in the loop */
-       for (off = FILTER_LR_CLIENT_START, cl_off = 0; off < last_rcvd_size;
-            cl_off++) {
-                __u64 last_rcvd;
-                int mount_age;
-
-                if (!fcd) {
-                        OBD_ALLOC(fcd, sizeof(*fcd));
-                        if (!fcd)
-                                GOTO(err_fsd, rc = -ENOMEM);
-                }
-
-                rc = lustre_fread(filp, (char *)fcd, sizeof(*fcd), &off);
-                if (rc != sizeof(*fcd)) {
-                        CERROR("error reading FILTER %s offset %d: rc = %d\n",
-                               LAST_RCVD, cl_off, rc);
-                        if (rc > 0) /* XXX fatal error or just abort reading? */
-                                rc = -EIO;
-                        break;
-                }
-
-                if (fcd->fcd_uuid[0] == '\0') {
-                        CDEBUG(D_INFO, "skipping zeroed client at offset %d\n",
-                               cl_off);
-                        continue;
-                }
+        if (obd->obd_flags & OBD_REPLAYABLE) {
+                for (cl_idx = 0; off < last_rcvd_size; cl_idx++) {
+                        __u64 last_rcvd;
+                        int mount_age;
+
+                        if (!fcd) {
+                                OBD_ALLOC(fcd, sizeof(*fcd));
+                                if (!fcd)
+                                        GOTO(err_fsd, rc = -ENOMEM);
+                        }
 
-                last_rcvd = le64_to_cpu(fcd->fcd_last_rcvd);
+                        /* Don't assume off is incremented properly, in case
+                         * sizeof(fsd) isn't the same as fsd->fsd_client_size.
+                         */
+                        off = le32_to_cpu(fsd->fsd_client_start) +
+                                cl_idx * le16_to_cpu(fsd->fsd_client_size);
+                        rc = lustre_fread(filp, (char *)fcd, sizeof(*fcd), &off);
+                        if (rc != sizeof(*fcd)) {
+                                CERROR("error reading FILTER %s offset %d: rc = %d\n",
+                                       LAST_RCVD, cl_idx, rc);
+                                if (rc > 0) /* XXX fatal error or just abort reading? */
+                                        rc = -EIO;
+                                break;
+                        }
 
-                /* These exports are cleaned up by filter_disconnect(), so they
-                 * need to be set up like real exports as filter_connect() does.
-                 */
-                mount_age = fsd->fsd_mount_count -
-                        le64_to_cpu(fcd->fcd_mount_count);
-                if (mount_age < FILTER_MOUNT_RECOV) {
-                        CERROR("RCVRNG CLIENT uuid: %s off: %d lr: "LPU64
-                               "srv lr: "LPU64" mnt: "LPU64" last mount: "LPU64
-                               "\n", fcd->fcd_uuid, cl_off,
-                               last_rcvd, fsd->fsd_last_rcvd,
-                               le64_to_cpu(fcd->fcd_mount_count),
-                               fsd->fsd_mount_count);
-#if 0
-                        /* disabled until OST recovery is actually working */
-                        struct obd_export *exp = class_new_export(obd);
-                        struct filter_export_data *fed;
+                        if (fcd->fcd_uuid[0] == '\0') {
+                                CDEBUG(D_INFO, "skipping zeroed client at offset %d\n",
+                                       cl_idx);
+                                continue;
+                        }
 
-                        if (!exp) {
-                                rc = -ENOMEM;
-                                break;
+                        last_rcvd = le64_to_cpu(fcd->fcd_last_rcvd);
+
+                        /* These exports are cleaned up by filter_disconnect(), so they
+                         * need to be set up like real exports as filter_connect() does.
+                         */
+                        mount_age = mount_count - le64_to_cpu(fcd->fcd_mount_count);
+                        if (mount_age < FILTER_MOUNT_RECOV) {
+                                struct obd_export *exp = class_new_export(obd);
+                                struct filter_export_data *fed;
+                                CERROR("RCVRNG CLIENT uuid: %s idx: %d lr: "LPU64
+                                       " srv lr: "LPU64" mnt: "LPU64" last mount: "
+                                       LPU64"\n", fcd->fcd_uuid, cl_idx,
+                                       last_rcvd, le64_to_cpu(fsd->fsd_last_rcvd),
+                                       le64_to_cpu(fcd->fcd_mount_count), mount_count);
+                                /* disabled until OST recovery is actually working */
+
+                                if (!exp) {
+                                        rc = -ENOMEM;
+                                        break;
+                                }
+                                memcpy(&exp->exp_client_uuid.uuid, fcd->fcd_uuid,
+                                       sizeof exp->exp_client_uuid.uuid);
+                                fed = &exp->exp_filter_data;
+                                fed->fed_fcd = fcd;
+                                filter_client_add(filter, fed, cl_idx);
+                                /* create helper if export init gets more complex */
+                                INIT_LIST_HEAD(&fed->fed_open_head);
+                                spin_lock_init(&fed->fed_lock);
+
+                                fcd = NULL;
+                                obd->obd_recoverable_clients++;
+                        } else {
+                                CDEBUG(D_INFO,
+                                       "discarded client %d UUID '%s' count "LPU64"\n",
+                                       cl_idx, fcd->fcd_uuid,
+                                       le64_to_cpu(fcd->fcd_mount_count));
                         }
 
-                        fed = &exp->exp_filter_data;
-                        fed->fed_fcd = fcd;
-                        filter_client_add(filter, fed, cl_off);
-                        /* create helper if export init gets more complex */
-                        INIT_LIST_HEAD(&fed->fed_open_head);
-                        spin_lock_init(&fed->fed_lock);
+                        CDEBUG(D_OTHER, "client at idx %d has last_rcvd = "LPU64"\n",
+                               cl_idx, last_rcvd);
 
-                        fcd = NULL;
-                        filter->fo_recoverable_clients++;
-#endif
-                } else {
-                        CDEBUG(D_INFO,
-                               "discarded client %d, UUID '%s', count %Ld\n",
-                               cl_off, fcd->fcd_uuid,
-                               (long long)le64_to_cpu(fcd->fcd_mount_count));
+                        if (last_rcvd > le64_to_cpu(filter->fo_fsd->fsd_last_rcvd))
+                                filter->fo_fsd->fsd_last_rcvd = cpu_to_le64(last_rcvd);
                 }
 
-                CDEBUG(D_OTHER, "client at offset %d has last_rcvd = %Lu\n",
-                       cl_off, (unsigned long long)last_rcvd);
+                obd->obd_last_committed = le64_to_cpu(filter->fo_fsd->fsd_last_rcvd);
+                if (obd->obd_recoverable_clients) {
+                        CERROR("RECOVERY: %d recoverable clients, last_rcvd "LPU64"\n",
+                               obd->obd_recoverable_clients,
+                               le64_to_cpu(filter->fo_fsd->fsd_last_rcvd));
+                        obd->obd_next_recovery_transno = obd->obd_last_committed + 1;
+                        obd->obd_flags |= OBD_RECOVERING;
+                }
 
-                if (last_rcvd > filter->fo_fsd->fsd_last_rcvd)
-                        filter->fo_fsd->fsd_last_rcvd = last_rcvd;
-        }
+                if (fcd)
+                        OBD_FREE(fcd, sizeof(*fcd));
 
-        obd->obd_last_committed = filter->fo_fsd->fsd_last_rcvd;
-        if (filter->fo_recoverable_clients) {
-                CERROR("RECOVERY: %d recoverable clients, last_rcvd "LPU64"\n",
-                       filter->fo_recoverable_clients,
-                       filter->fo_fsd->fsd_last_rcvd);
-                filter->fo_next_recovery_transno = obd->obd_last_committed + 1;
-                obd->obd_flags |= OBD_RECOVERING;
+        } else {
+                CERROR("%s: recovery support OFF\n", obd->obd_name);
         }
 
-        if (fcd)
-                OBD_FREE(fcd, sizeof(*fcd));
-
-        fsd->fsd_mount_count++;
+        fsd->fsd_mount_count = cpu_to_le64(mount_count + 1);
 
         /* save it,so mount count and last_recvd is current */
         rc = filter_update_server_data(filp, filter->fo_fsd);
@@ -642,9 +694,10 @@ static int filter_prep(struct obd_device *obd)
 {
         struct obd_run_ctxt saved;
         struct filter_obd *filter = &obd->u.filter;
-        struct dentry *dentry;
+        struct dentry *dentry, *O_dentry;
         struct file *file;
         struct inode *inode;
+        int i;
         int rc = 0;
         int mode = 0;
 
@@ -662,6 +715,7 @@ static int filter_prep(struct obd_device *obd)
          * Create directories and/or get dentries for each object type.
          * This saves us from having to do multiple lookups for each one.
          */
+        O_dentry = filter->fo_dentry_O;
         for (mode = 0; mode < (S_IFMT >> S_SHIFT); mode++) {
                 char *name = obd_type_by_mode[mode];
 
@@ -669,22 +723,22 @@ static int filter_prep(struct obd_device *obd)
                         filter->fo_dentry_O_mode[mode] = NULL;
                         continue;
                 }
-                dentry = simple_mkdir(filter->fo_dentry_O, name, 0700);
+                dentry = simple_mkdir(O_dentry, name, 0700);
                 CDEBUG(D_INODE, "got/created O/%s: %p\n", name, dentry);
                 if (IS_ERR(dentry)) {
                         rc = PTR_ERR(dentry);
                         CERROR("cannot create O/%s: rc = %d\n", name, rc);
-                        GOTO(out_O_mode, rc);
+                        GOTO(err_O_mode, rc);
                 }
                 filter->fo_dentry_O_mode[mode] = dentry;
         }
 
         file = filp_open(LAST_RCVD, O_RDWR | O_CREAT, 0700);
-        if ( !file || IS_ERR(file) ) {
+        if (!file || IS_ERR(file)) {
                 rc = PTR_ERR(file);
                 CERROR("OBD filter: cannot open/create %s: rc = %d\n",
                        LAST_RCVD, rc);
-                GOTO(out_O_mode, rc);
+                GOTO(err_O_mode, rc);
         }
 
         if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
@@ -711,19 +765,50 @@ static int filter_prep(struct obd_device *obd)
         }
         filter->fo_rcvd_filp = file;
 
+        if (filter->fo_subdir_count) {
+                O_dentry = filter->fo_dentry_O_mode[S_IFREG >> S_SHIFT];
+                OBD_ALLOC(filter->fo_dentry_O_sub,
+                          FILTER_SUBDIR_COUNT * sizeof(dentry));
+                if (!filter->fo_dentry_O_sub)
+                        GOTO(err_client, rc = -ENOMEM);
+
+                for (i = 0; i < filter->fo_subdir_count; i++) {
+                        char dir[20];
+                        snprintf(dir, sizeof(dir), "d%u", i);
+
+                        dentry = simple_mkdir(O_dentry, dir, 0700);
+                        CDEBUG(D_INODE, "got/created O/R/%s: %p\n", dir,dentry);
+                        if (IS_ERR(dentry)) {
+                                rc = PTR_ERR(dentry);
+                                CERROR("can't create O/R/%s: rc = %d\n",dir,rc);
+                                GOTO(err_O_sub, rc);
+                        }
+                        filter->fo_dentry_O_sub[i] = dentry;
+                }
+        }
         rc = 0;
  out:
         pop_ctxt(&saved, &filter->fo_ctxt, NULL);
 
         return(rc);
 
+err_O_sub:
+        while (i-- > 0) {
+                struct dentry *dentry = filter->fo_dentry_O_sub[i];
+                if (dentry) {
+                        f_dput(dentry);
+                        filter->fo_dentry_O_sub[i] = NULL;
+                }
+        }
+        OBD_FREE(filter->fo_dentry_O_sub,
+                 filter->fo_subdir_count * sizeof(dentry));
 err_client:
         class_disconnect_all(obd);
 err_filp:
         if (filp_close(file, 0))
                 CERROR("can't close %s after error\n", LAST_RCVD);
         filter->fo_rcvd_filp = NULL;
- out_O_mode:
+err_O_mode:
         while (mode-- > 0) {
                 struct dentry *dentry = filter->fo_dentry_O_mode[mode];
                 if (dentry) {
@@ -752,23 +837,28 @@ static void filter_post(struct obd_device *obd)
         rc = filter_update_server_data(filter->fo_rcvd_filp, filter->fo_fsd);
         if (rc)
                 CERROR("OBD filter: error writing lastobjid: rc = %ld\n", rc);
-        filter_free_server_data(filter);
 
 
         if (filter->fo_rcvd_filp) {
-                /* broken sync at umount bug workaround  */
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-                rc = fsync_dev(filter->fo_rcvd_filp->f_dentry->d_inode->i_rdev);
-#else
                 rc = file_fsync(filter->fo_rcvd_filp,
                                 filter->fo_rcvd_filp->f_dentry, 1);
-#endif
                 filp_close(filter->fo_rcvd_filp, 0);
                 filter->fo_rcvd_filp = NULL;
                 if (rc)
-                        CERROR("last_rcvd file won't closek rc = %ld\n", rc);
+                        CERROR("last_rcvd file won't closed rc = %ld\n", rc);
         }
 
+        if (filter->fo_subdir_count) {
+                int i;
+                for (i = 0; i < filter->fo_subdir_count; i++) {
+                        struct dentry *dentry = filter->fo_dentry_O_sub[i];
+                        f_dput(dentry);
+                        filter->fo_dentry_O_sub[i] = NULL;
+                }
+                OBD_FREE(filter->fo_dentry_O_sub,
+                         filter->fo_subdir_count *
+                         sizeof(*filter->fo_dentry_O_sub));
+        }
         for (mode = 0; mode < (S_IFMT >> S_SHIFT); mode++) {
                 struct dentry *dentry = filter->fo_dentry_O_mode[mode];
                 if (dentry) {
@@ -777,6 +867,7 @@ static void filter_post(struct obd_device *obd)
                 }
         }
         f_dput(filter->fo_dentry_O);
+        filter_free_server_data(filter);
         pop_ctxt(&saved, &filter->fo_ctxt, NULL);
 }
 
@@ -787,7 +878,8 @@ static __u64 filter_next_id(struct obd_device *obd)
         LASSERT(obd->u.filter.fo_fsd != NULL);
 
         spin_lock(&obd->u.filter.fo_objidlock);
-        id = ++obd->u.filter.fo_fsd->fsd_last_objid;
+        id = le64_to_cpu(obd->u.filter.fo_fsd->fsd_last_objid);
+        obd->u.filter.fo_fsd->fsd_last_objid = cpu_to_le64(id + 1);
         spin_unlock(&obd->u.filter.fo_objidlock);
 
         return id;
@@ -839,12 +931,15 @@ static struct dentry *filter_fid2dentry(struct obd_device *obd,
 }
 
 static inline struct dentry *filter_parent(struct obd_device *obd,
-                                           obd_mode mode)
+                                           obd_mode mode, obd_id objid)
 {
         struct filter_obd *filter = &obd->u.filter;
 
         LASSERT((mode & S_IFMT) == S_IFREG);   /* only regular files for now */
-        return filter->fo_dentry_O_mode[(mode & S_IFMT) >> S_SHIFT];
+        if ((mode & S_IFMT) != S_IFREG || filter->fo_subdir_count == 0)
+                return filter->fo_dentry_O_mode[(mode & S_IFMT) >> S_SHIFT];
+
+        return filter->fo_dentry_O_sub[objid & (filter->fo_subdir_count - 1)];
 }
 
 static struct file *filter_obj_open(struct obd_export *export,
@@ -890,9 +985,9 @@ static struct file *filter_obj_open(struct obd_export *export,
                 GOTO(out_ffd, file = ERR_PTR(-ENOMEM));
         }
 
-        filter_id(name, id, type);
         push_ctxt(&saved, &filter->fo_ctxt, NULL);
-        file = filp_open(name, O_RDWR | O_LARGEFILE, 0 /* type? */);
+        file = filp_open(filter_id(name, filter, id, type),
+                         O_RDWR | O_LARGEFILE, type);
         pop_ctxt(&saved, &filter->fo_ctxt, NULL);
 
         if (IS_ERR(file)) {
@@ -909,11 +1004,12 @@ static struct file *filter_obj_open(struct obd_export *export,
                 LASSERT(kmem_cache_validate(filter_dentry_cache, fdd));
                 /* should only happen during client recovery */
                 if (fdd->fdd_flags & FILTER_FLAG_DESTROY)
-                        CDEBUG(D_INODE,"opening destroyed object "LPX64"\n",id);
+                        CDEBUG(D_INODE,"opening destroyed object "LPU64"\n",id);
                 atomic_inc(&fdd->fdd_open_count);
         } else {
                 atomic_set(&fdd->fdd_open_count, 1);
                 fdd->fdd_flags = 0;
+                fdd->fdd_objid = id;
                 /* If this is racy, then we can use {cmp}xchg and atomic_add */
                 dentry->d_fsdata = fdd;
                 spin_unlock(&filter->fo_fddlock);
@@ -921,6 +1017,7 @@ static struct file *filter_obj_open(struct obd_export *export,
 
         get_random_bytes(&ffd->ffd_servercookie, sizeof(ffd->ffd_servercookie));
         ffd->ffd_file = file;
+        LASSERT(file->private_data == NULL);
         file->private_data = ffd;
 
         if (!dentry->d_op)
@@ -932,7 +1029,7 @@ static struct file *filter_obj_open(struct obd_export *export,
         list_add(&ffd->ffd_export_list, &fed->fed_open_head);
         spin_unlock(&fed->fed_lock);
 
-        CDEBUG(D_INODE, "opened objid "LPX64": rc = %p\n", id, file);
+        CDEBUG(D_INODE, "opened objid "LPU64": rc = %p\n", id, file);
         EXIT;
 out:
         return file;
@@ -991,7 +1088,7 @@ static int filter_close_internal(struct obd_export *export,
 
         if (atomic_dec_and_test(&fdd->fdd_open_count) &&
             fdd->fdd_flags & FILTER_FLAG_DESTROY) {
-                struct dentry *dir_dentry = filter_parent(obd, S_IFREG);
+                struct dentry *dir_dentry = filter_parent(obd, S_IFREG, fdd->fdd_objid);
                 struct obd_run_ctxt saved;
                 void *handle;
 
@@ -1029,7 +1126,8 @@ static int filter_close_internal(struct obd_export *export,
 
 /* obd methods */
 /* mount the file system (secretly) */
-static int filter_setup(struct obd_device *obd, obd_count len, void *buf)
+static int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
+                               char *option)
 {
         struct obd_ioctl_data* data = buf;
         struct filter_obd *filter;
@@ -1038,21 +1136,25 @@ static int filter_setup(struct obd_device *obd, obd_count len, void *buf)
         ENTRY;
 
         if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2)
-                RETURN(rc = -EINVAL);
+                RETURN(-EINVAL);
 
         obd->obd_fsops = fsfilt_get_ops(data->ioc_inlbuf2);
         if (IS_ERR(obd->obd_fsops))
-                RETURN(rc = PTR_ERR(obd->obd_fsops));
+                RETURN(PTR_ERR(obd->obd_fsops));
 
-        mnt = do_kern_mount(data->ioc_inlbuf2, 0, data->ioc_inlbuf1, NULL);
+        mnt = do_kern_mount(data->ioc_inlbuf2, 0, data->ioc_inlbuf1, option);
         rc = PTR_ERR(mnt);
-        if (IS_ERR(mnt))
+        if (IS_ERR(mnt)) {
+                CERROR("mount of %s as type %s failed: rc %d\n",
+                       data->ioc_inlbuf2, data->ioc_inlbuf1, rc);
                 GOTO(err_ops, rc);
+        }
 
+#if OST_RECOVERY
         obd->obd_flags |= OBD_REPLAYABLE;
+#endif
 
         filter = &obd->u.filter;;
-        init_MUTEX(&filter->fo_transno_sem);
         filter->fo_vfsmnt = mnt;
         filter->fo_fstype = strdup(data->ioc_inlbuf2);
         filter->fo_sb = mnt->mnt_root->d_inode->i_sb;
@@ -1067,6 +1169,11 @@ static int filter_setup(struct obd_device *obd, obd_count len, void *buf)
         if (rc)
                 GOTO(err_kfree, rc);
 
+#ifdef FILTER_TRANSNO_SEM
+        init_MUTEX(&filter->fo_transno_sem);
+#else
+        spin_lock_init(&filter->fo_translock);
+#endif
         spin_lock_init(&filter->fo_fddlock);
         spin_lock_init(&filter->fo_objidlock);
         INIT_LIST_HEAD(&filter->fo_export_list);
@@ -1094,6 +1201,29 @@ err_ops:
         return rc;
 }
 
+static int filter_setup(struct obd_device *obd, obd_count len, void *buf)
+{
+        return filter_common_setup(obd, len, buf, NULL); /* no mount options */
+}
+
+/* sanobd setup: extN/ext3 must be mounted with "data=writeback"; any other
+ * fs type has no known SAN mount option and is refused with -EINVAL.
+ * Plain "return" is used since this wrapper has no ENTRY to pair RETURN. */
+static int filter_san_setup(struct obd_device *obd, obd_count len, void *buf)
+{
+        struct obd_ioctl_data *data = buf;
+
+        if (!data->ioc_inlbuf2)
+                return -EINVAL;
+
+        if (strcmp(data->ioc_inlbuf2, "extN") &&
+            strcmp(data->ioc_inlbuf2, "ext3")) {
+                CERROR("no SAN mount option known for fs type %s\n",
+                       data->ioc_inlbuf2);
+                return -EINVAL;
+        }
+        return filter_common_setup(obd, len, buf, "data=writeback");
+}
 
 static int filter_cleanup(struct obd_device *obd)
 {
@@ -1178,9 +1308,11 @@ static int filter_connect(struct lustre_handle *conn, struct obd_device *obd,
         INIT_LIST_HEAD(&exp->exp_filter_data.fed_open_head);
         spin_lock_init(&exp->exp_filter_data.fed_lock);
 
-        rc = filter_client_add(filter, fed, -1);
-        if (rc)
-                GOTO(out_fcd, rc);
+        if (obd->obd_flags & OBD_REPLAYABLE) {
+                rc = filter_client_add(filter, fed, -1);
+                if (rc)
+                        GOTO(out_fcd, rc);
+        }
 
         RETURN(rc);
 
@@ -1222,7 +1354,9 @@ static int filter_disconnect(struct lustre_handle *conn)
         spin_unlock(&fed->fed_lock);
 
         ldlm_cancel_locks_for_export(exp);
-        filter_client_free(exp);
+
+        if (exp->exp_obd->obd_flags & OBD_REPLAYABLE) 
+                filter_client_free(exp);
 
         rc = class_disconnect(conn);
 
@@ -1235,7 +1369,7 @@ static void filter_from_inode(struct obdo *oa, struct inode *inode, int valid)
         int type = oa->o_mode & S_IFMT;
         ENTRY;
 
-        CDEBUG(D_INFO, "src inode %lu (%p), dst obdo "LPX64" valid 0x%08x\n",
+        CDEBUG(D_INFO, "src inode %lu (%p), dst obdo "LPU64" valid 0x%08x\n",
                inode->i_ino, inode, oa->o_id, valid);
         /* Don't copy the inode number in place of the object ID */
         obdo_from_inode(oa, inode, valid);
@@ -1289,17 +1423,18 @@ static struct dentry *__filter_oa2dentry(struct lustre_handle *conn,
                         CERROR("invalid client "LPX64"\n", conn->addr);
                         RETURN(ERR_PTR(-EINVAL));
                 }
-                dentry = filter_fid2dentry(obd, filter_parent(obd, oa->o_mode),
+                dentry = filter_fid2dentry(obd, filter_parent(obd, oa->o_mode,
+                                                              oa->o_id),
                                            oa->o_id, locked);
         }
 
         if (IS_ERR(dentry)) {
-                CERROR("%s error looking up object: "LPX64"\n", what, oa->o_id);
+                CERROR("%s error looking up object: "LPU64"\n", what, oa->o_id);
                 RETURN(dentry);
         }
 
         if (!dentry->d_inode) {
-                CERROR("%s on non-existent object: "LPX64"\n", what, oa->o_id);
+                CERROR("%s on non-existent object: "LPU64"\n", what, oa->o_id);
                 f_dput(dentry);
                 LBUG();
                 RETURN(ERR_PTR(-ENOENT));
@@ -1446,7 +1581,7 @@ static int filter_close(struct lustre_handle *conn, struct obdo *oa,
         XPROCFS_BUMP_MYCPU_IOSTAT (st_close_reqs, 1);
 
         if (!(oa->o_valid & OBD_MD_FLHANDLE)) {
-                CERROR("no handle for close of objid "LPX64"\n", oa->o_id);
+                CERROR("no handle for close of objid "LPU64"\n", oa->o_id);
                 RETURN(-EINVAL);
         }
 
@@ -1492,17 +1627,18 @@ static int filter_create(struct lustre_handle *conn, struct obdo *oa,
         oa->o_id = filter_next_id(obd);
 
         push_ctxt(&saved, &filter->fo_ctxt, NULL);
-        dir_dentry = filter_parent(obd, oa->o_mode);
+        dir_dentry = filter_parent(obd, S_IFREG, oa->o_id);
         down(&dir_dentry->d_inode->i_sem);
         new = filter_fid2dentry(obd, dir_dentry, oa->o_id, 0);
         if (IS_ERR(new))
                 GOTO(out, rc = PTR_ERR(new));
 
         if (new->d_inode) {
+                char buf[32];
+
                 /* This would only happen if lastobjid was bad on disk */
-                CERROR("objid O/%*s/"LPU64" already exists\n",
-                       dir_dentry->d_name.len, dir_dentry->d_name.name,
-                       oa->o_id);
+                CERROR("objid %s already exists\n",
+                       filter_id(buf, filter, S_IFREG, oa->o_id));
                 LBUG();
                 GOTO(out, rc = -EEXIST);
         }
@@ -1568,9 +1704,9 @@ static int filter_destroy(struct lustre_handle *conn, struct obdo *oa,
 
         XPROCFS_BUMP_MYCPU_IOSTAT (st_destroy_reqs, 1);
 
-        CDEBUG(D_INODE, "destroying objid "LPX64"\n", oa->o_id);
+        CDEBUG(D_INODE, "destroying objid "LPU64"\n", oa->o_id);
 
-        dir_dentry = filter_parent(obd, oa->o_mode);
+        dir_dentry = filter_parent(obd, oa->o_mode, oa->o_id);
         down(&dir_dentry->d_inode->i_sem);
 
         object_dentry = filter_oa2dentry(conn, oa, 0);
@@ -1591,11 +1727,11 @@ static int filter_destroy(struct lustre_handle *conn, struct obdo *oa,
                         fdd->fdd_flags |= FILTER_FLAG_DESTROY;
                         /* XXX put into PENDING directory in case of crash */
                         CDEBUG(D_INODE,
-                               "defer destroy of %dx open objid "LPX64"\n",
+                               "defer destroy of %dx open objid "LPU64"\n",
                                atomic_read(&fdd->fdd_open_count), oa->o_id);
                 } else
                         CDEBUG(D_INODE,
-                               "repeat destroy of %dx open objid "LPX64"\n",
+                               "repeat destroy of %dx open objid "LPU64"\n",
                                atomic_read(&fdd->fdd_open_count), oa->o_id);
                 GOTO(out_commit, rc = 0);
         }
@@ -1635,7 +1771,7 @@ static int filter_truncate(struct lustre_handle *conn, struct obdo *oa,
         if (end != OBD_OBJECT_EOF)
                 CERROR("PUNCH not supported, only truncate works\n");
 
-        CDEBUG(D_INODE, "calling truncate for object "LPX64", valid = %x, "
+        CDEBUG(D_INODE, "calling truncate for object "LPU64", valid = %x, "
                "o_size = "LPD64"\n", oa->o_id, oa->o_valid, start);
         oa->o_size = start;
         error = filter_setattr(conn, oa, NULL, oti);
@@ -1650,9 +1786,9 @@ static inline void lustre_put_page(struct page *page)
 
 
 static struct page *
-lustre_get_page_read(struct inode *inode, struct niobuf_remote *rnb)
+lustre_get_page_read(struct inode *inode, struct niobuf_local *lnb)
 {
-        unsigned long index = rnb->offset >> PAGE_SHIFT;
+        unsigned long index = lnb->offset >> PAGE_SHIFT;
         struct address_space *mapping = inode->i_mapping;
         struct page *page;
         int rc;
@@ -1661,7 +1797,8 @@ lustre_get_page_read(struct inode *inode, struct niobuf_remote *rnb)
                                (filler_t*)mapping->a_ops->readpage, NULL);
         if (!IS_ERR(page)) {
                 wait_on_page(page);
-                kmap(page);
+                lnb->addr = kmap(page);
+                lnb->page = page;
                 if (!PageUptodate(page)) {
                         CERROR("page index %lu not uptodate\n", index);
                         GOTO(err_page, rc = -EIO);
@@ -1760,12 +1897,10 @@ static int lustre_commit_write(struct niobuf_local *lnb)
 }
 
 struct page *filter_get_page_write(struct inode *inode,
-                                   struct niobuf_remote *rnb,
                                    struct niobuf_local *lnb, int *pglocked)
 {
-        unsigned long index = rnb->offset >> PAGE_SHIFT;
+        unsigned long index = lnb->offset >> PAGE_SHIFT;
         struct address_space *mapping = inode->i_mapping;
-
         struct page *page;
         int rc;
 
@@ -1791,26 +1926,25 @@ struct page *filter_get_page_write(struct inode *inode,
                 addr = __get_free_pages(GFP_KERNEL, 0); /* locked page */
                 if (!addr) {
                         CERROR("no memory for a temp page\n");
-                        LBUG();
                         GOTO(err, rc = -ENOMEM);
                 }
-                /* XXX debugging */
-                memset((void *)addr, 0xBA, PAGE_SIZE);
+                POISON((void *)addr, 0xBA, PAGE_SIZE);
                 page = virt_to_page(addr);
                 kmap(page);
                 page->index = index;
+                lnb->addr = (void *)addr;
+                lnb->page = page;
                 lnb->flags |= N_LOCAL_TEMP_PAGE;
         } else if (!IS_ERR(page)) {
                 (*pglocked)++;
                 kmap(page);
 
                 rc = mapping->a_ops->prepare_write(NULL, page,
-                                                   rnb->offset % PAGE_SIZE,
-                                                   rnb->len);
+                                                   lnb->offset & ~PAGE_MASK,
+                                                   lnb->len);
                 if (rc) {
-                        CERROR("page index %lu, rc = %d\n", index, rc);
                         if (rc != -ENOSPC)
-                                LBUG();
+                                CERROR("page index %lu, rc = %d\n", index, rc);
                         GOTO(err_unlock, rc);
                 }
                 /* XXX not sure if we need this if we are overwriting page */
@@ -1819,7 +1953,10 @@ struct page *filter_get_page_write(struct inode *inode,
                         LBUG();
                         GOTO(err_unlock, rc = -EIO);
                 }
+                lnb->addr = page_address(page);
+                lnb->page = page;
         }
+
         return page;
 
 err_unlock:
@@ -1849,7 +1986,8 @@ static int filter_commit_write(struct niobuf_local *lnb, int err)
                 unsigned blocksize = head->b_size;
 
                 /* debugging: just seeing if this ever happens */
-                CERROR("called filter_commit_write for ino %lu:%lu on err %d\n",
+                CDEBUG(err == -ENOSPC ? D_INODE : D_ERROR,
+                       "called for ino %lu:%lu on err %d\n",
                        lnb->page->mapping->host->i_ino, lnb->page->index, err);
 
                 /* Currently one buffer per page, but in the future... */
@@ -1876,7 +2014,6 @@ static int filter_preprw(int cmd, struct lustre_handle *conn,
         struct obd_ioobj *o;
         struct niobuf_remote *rnb = nb;
         struct niobuf_local *lnb = res;
-        struct dentry *dir_dentry;
         struct fsfilt_objinfo *fso;
         int pglocked = 0;
         int rc = 0;
@@ -1904,7 +2041,6 @@ static int filter_preprw(int cmd, struct lustre_handle *conn,
                 RETURN(-ENOMEM);
 
         push_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL);
-        dir_dentry = filter_parent(obd, S_IFREG);
 
         for (i = 0, o = obj; i < objcount; i++, o++) {
                 struct filter_dentry_data *fdd;
@@ -1912,7 +2048,9 @@ static int filter_preprw(int cmd, struct lustre_handle *conn,
 
                 LASSERT(o->ioo_bufcnt);
 
-                dentry = filter_fid2dentry(obd, dir_dentry, o->ioo_id, 0);
+                dentry = filter_fid2dentry(obd, filter_parent(obd, S_IFREG,
+                                                              o->ioo_id),
+                                           o->ioo_id, 0);
 
                 if (IS_ERR(dentry))
                         GOTO(out_objinfo, rc = PTR_ERR(dentry));
@@ -1923,17 +2061,18 @@ static int filter_preprw(int cmd, struct lustre_handle *conn,
                 if (!dentry->d_inode) {
                         CERROR("trying to BRW to non-existent file "LPU64"\n",
                                o->ioo_id);
+                        f_dput(dentry);
                         GOTO(out_objinfo, rc = -ENOENT);
                 }
 
                 fdd = dentry->d_fsdata;
                 if (!fdd || !atomic_read(&fdd->fdd_open_count))
-                        CDEBUG(D_PAGE, "I/O to unopened object "LPX64"\n",
+                        CDEBUG(D_PAGE, "I/O to unopened object "LPU64"\n",
                                o->ioo_id);
         }
 
         if (cmd & OBD_BRW_WRITE) {
-#warning "FIXME: we need to get inode->i_sem for each object here"
+#warning "FIXME: we need inode->i_sem for each object to protect vs truncate"
                 /* Even worse, we need to get locks on mulitple inodes (in
                  * order) or use the DLM to do the locking for us (and use
                  * the same locking in filter_setattr() for truncate.  The
@@ -1951,8 +2090,13 @@ static int filter_preprw(int cmd, struct lustre_handle *conn,
                 filter_start_transno(export);
                 *desc_private = fsfilt_brw_start(obd, objcount, fso,
                                                  niocount, nb);
-                if (IS_ERR(*desc_private))
-                        GOTO(out_objinfo, rc = PTR_ERR(*desc_private));
+                if (IS_ERR(*desc_private)) {
+                        rc = PTR_ERR(*desc_private);
+                        CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR,
+                               "error starting transaction: rc = %d\n", rc);
+                        *desc_private = NULL;
+                        GOTO(out_objinfo, rc);
+                }
         }
 
         obd_kmap_get(niocount, 1);
@@ -1973,29 +2117,31 @@ static int filter_preprw(int cmd, struct lustre_handle *conn,
                         else
                                 lnb->dentry = dget(dentry);
 
+                        /* lnb->offset is aligned, while rnb->offset isn't,
+                         * and we need to copy the fields to lnb anyways.
+                         */
+                        memcpy(lnb, rnb, sizeof(*rnb));
                         if (cmd & OBD_BRW_WRITE) {
-                                page = filter_get_page_write(inode, rnb, lnb,
+                                page = filter_get_page_write(inode, lnb,
                                                              &pglocked);
 
-                                XPROCFS_BUMP_MYCPU_IOSTAT (st_write_bytes,
-                                                           rnb->len);
+                                XPROCFS_BUMP_MYCPU_IOSTAT(st_write_bytes,
+                                                          lnb->len);
                         } else {
-                                page = lustre_get_page_read(inode, rnb);
+                                page = lustre_get_page_read(inode, lnb);
 
-                                XPROCFS_BUMP_MYCPU_IOSTAT (st_read_bytes,
-                                                           rnb->len);
+                                XPROCFS_BUMP_MYCPU_IOSTAT(st_read_bytes,
+                                                          lnb->len);
                         }
 
                         if (IS_ERR(page)) {
                                 rc = PTR_ERR(page);
+                                CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR,
+                                       "error on page @"LPU64"%u/%u: rc = %d\n",
+                                       lnb->offset, j, o->ioo_bufcnt, rc);
                                 f_dput(dentry);
                                 GOTO(out_pages, rc);
                         }
-
-                        lnb->addr = page_address(page);
-                        lnb->offset = rnb->offset;
-                        lnb->page = page;
-                        lnb->len = rnb->len;
                 }
         }
 
@@ -2008,7 +2154,6 @@ out:
 
 out_pages:
         while (lnb-- > res) {
-                CERROR("%d error cleanup on brw\n", rc);
                 if (cmd & OBD_BRW_WRITE)
                         filter_commit_write(lnb, rc);
                 else
@@ -2016,16 +2161,17 @@ out_pages:
                 f_dput(lnb->dentry);
         }
         obd_kmap_put(niocount);
-        goto out_err; /* dropped the dentry refs already (one per page) */
+        if (cmd & OBD_BRW_WRITE) {
+                filter_finish_transno(export, *desc_private, oti, rc);
+                fsfilt_commit(obd,
+                              filter_parent(obd,S_IFREG,obj->ioo_id)->d_inode,
+                              *desc_private);
+        }
+        goto out; /* dropped the dentry refs already (one per page) */
 
 out_objinfo:
         for (i = 0; i < objcount && fso[i].fso_dentry; i++)
                 f_dput(fso[i].fso_dentry);
-out_err:
-        if (cmd & OBD_BRW_WRITE) {
-                filter_finish_transno(export, *desc_private, oti, rc);
-                fsfilt_commit(obd, dir_dentry->d_inode, *desc_private);
-        }
         goto out;
 }
 
@@ -2072,8 +2218,15 @@ static int filter_write_locked_page(struct niobuf_local *lnb)
         RETURN(rc);
 }
 
-static int filter_sync(struct obd_device *obd)
+static int filter_syncfs(struct lustre_handle *conn)
 { /* returns the result of fsfilt_sync() on the filter's superblock (fo_sb) */
+        struct obd_device *obd;
+        ENTRY;
+
+        obd = class_conn2obd(conn); /* NOTE(review): dereferenced below with no NULL check */
+
+        XPROCFS_BUMP_MYCPU_IOSTAT (st_syncfs_reqs, 1); /* bump st_syncfs_reqs by 1 */
+
         RETURN(fsfilt_sync(obd, obd->u.filter.fo_sb));
 }
 
@@ -2139,8 +2292,9 @@ static int filter_commitrw(int cmd, struct lustre_handle *conn,
         }
 
         if (cmd & OBD_BRW_WRITE) {
+                /* We just want any dentry for the commit, for now */
+                struct dentry *dir_dentry = filter_parent(obd, S_IFREG, 0);
                 int err;
-                struct dentry *dir_dentry = filter_parent(obd, S_IFREG);
 
                 rc = filter_finish_transno(export, desc_private, oti, rc);
                 err = fsfilt_commit(obd, dir_dentry->d_inode, desc_private);
@@ -2148,7 +2302,7 @@ static int filter_commitrw(int cmd, struct lustre_handle *conn,
                         rc = err;
                 if (obd_sync_filter) {
                         /* this can fail with ENOMEM, what should we do then? */
-                        filter_sync(obd);
+                        filter_syncfs(conn);
                 }
                 /* XXX <adilger> LASSERT(last_rcvd == last_committed)*/
         }
@@ -2216,6 +2370,81 @@ out:
         RETURN(ret);
 }
 
+static int filter_san_preprw(int cmd, struct lustre_handle *conn,
+                             int objcount, struct obd_ioobj *obj,
+                             int niocount, struct niobuf_remote *nb)
+{ /* rewrites each rnb->offset in nb to a block number; returns 0 or an error */
+        struct obd_device *obd;
+        struct obd_ioobj *o = obj;
+        struct niobuf_remote *rnb = nb;
+        int rc = 0;
+        int i;
+        ENTRY;
+
+        if ((cmd & OBD_BRW_WRITE) != 0)
+                XPROCFS_BUMP_MYCPU_IOSTAT (st_write_reqs, 1);
+        else
+                XPROCFS_BUMP_MYCPU_IOSTAT (st_read_reqs, 1);
+
+        obd = class_conn2obd(conn);
+        if (!obd) {
+                CDEBUG(D_IOCTL, "invalid client "LPX64"\n", conn->addr);
+                RETURN(-EINVAL);
+        }
+
+        for (i = 0; i < objcount; i++, o++) {
+                struct dentry *dentry;
+                struct inode *inode;
+                int j;
+
+                dentry = filter_fid2dentry(obd, filter_parent(obd, S_IFREG,
+                                                              o->ioo_id),
+                                           o->ioo_id, 0);
+                if (IS_ERR(dentry))
+                        GOTO(out, rc = PTR_ERR(dentry));
+                inode = dentry->d_inode;
+                if (!inode) {
+                        CERROR("trying to BRW to non-existent file "LPU64"\n",
+                               o->ioo_id);
+                        f_dput(dentry);
+                        GOTO(out, rc = -ENOENT);
+                }
+
+                for (j = 0; j < o->ioo_bufcnt; j++, rnb++) {
+                        long block;
+
+                        block = rnb->offset >> PAGE_SHIFT; /* byte offset -> page-sized index */
+
+                        if (cmd == OBD_BRW_READ) { /* NOTE(review): exact match here, bitmask test above */
+                                block = inode->i_mapping->a_ops->bmap(
+                                                inode->i_mapping, block);
+                        } else {
+                                loff_t newsize = rnb->offset + rnb->len;
+                                /* fs_prep_san_write() will also update the
+                                 * inode size for us when:
+                                 * (1) a new block is allocated
+                                 * (2) an existing block's size is extended
+                                 */
+                                /* FIXME We could call fs_prep_san_write()
+                                 * only once to allocate all the blocks.
+                                 * For now it is called once per block, for
+                                 * simplicity. And if an error happens, we
+                                 * probably need to release the previously
+                                 * allocated blocks */
+                                rc = fs_prep_san_write(obd, inode, &block,
+                                                       1, newsize);
+                                if (rc)
+                                        break; /* NOTE(review): exits inner loop only; a later call can overwrite rc */
+                        }
+
+                        rnb->offset = block; /* offset is replaced in place by the block number */
+                }
+                f_dput(dentry);
+        }
+out:
+        RETURN(rc);
+}
+
 static int filter_statfs(struct lustre_handle *conn, struct obd_statfs *osfs)
 {
         struct obd_device *obd;
@@ -2358,6 +2587,7 @@ static struct obd_ops filter_obd_ops = {
         o_connect:      filter_connect,
         o_disconnect:   filter_disconnect,
         o_statfs:       filter_statfs,
+        o_syncfs:       filter_syncfs,
         o_getattr:      filter_getattr,
         o_create:       filter_create,
         o_setattr:      filter_setattr,
@@ -2369,6 +2599,36 @@ static struct obd_ops filter_obd_ops = {
         o_preprw:       filter_preprw,
         o_commitrw:     filter_commitrw
 #if 0
+        o_san_preprw:  filter_san_preprw,
+        o_preallocate: filter_preallocate_inodes,
+        o_migrate:     filter_migrate,
+        o_copy:        filter_copy_data,
+        o_iterate:     filter_iterate
+#endif
+};
+
+static struct obd_ops filter_sanobd_ops = {
+        o_owner:        THIS_MODULE,
+        o_attach:       filter_attach,
+        o_detach:       filter_detach,
+        o_get_info:     filter_get_info,
+        o_setup:        filter_san_setup,
+        o_cleanup:      filter_cleanup,
+        o_connect:      filter_connect,
+        o_disconnect:   filter_disconnect,
+        o_statfs:       filter_statfs,
+        o_getattr:      filter_getattr,
+        o_create:       filter_create,
+        o_setattr:      filter_setattr,
+        o_destroy:      filter_destroy,
+        o_open:         filter_open,
+        o_close:        filter_close,
+        o_brw:          filter_brw,
+        o_punch:        filter_truncate,
+        o_preprw:       filter_preprw,
+        o_commitrw:     filter_commitrw,
+        o_san_preprw:   filter_san_preprw,
+#if 0
         o_preallocate:  filter_preallocate_inodes,
         o_migrate:      filter_migrate,
         o_copy:         filter_copy_data,
@@ -2380,6 +2640,7 @@ static struct obd_ops filter_obd_ops = {
 static int __init obdfilter_init(void)
 {
         struct lprocfs_static_vars lvars;
+        int rc;
 
         printk(KERN_INFO "Lustre Filtering OBD driver; info@clusterfs.com\n");
         filter_open_cache = kmem_cache_create("ll_filter_fdata",
@@ -2392,19 +2653,37 @@ static int __init obdfilter_init(void)
                                         sizeof(struct filter_dentry_data),
                                         0, 0, NULL, NULL);
         if (!filter_dentry_cache) {
-                kmem_cache_destroy(filter_open_cache);
-                RETURN(-ENOMEM);
+                rc = -ENOMEM;
+                goto err1;
         }
 
         xprocfs_init ("filter");
 
         lprocfs_init_vars(&lvars);
-        return class_register_type(&filter_obd_ops, lvars.module_vars,
-                                   OBD_FILTER_DEVICENAME);
+
+        rc = class_register_type(&filter_obd_ops, lvars.module_vars,
+                                 OBD_FILTER_DEVICENAME);
+        if (rc)
+                goto err2;
+
+        rc = class_register_type(&filter_sanobd_ops, lvars.module_vars,
+                                 OBD_FILTER_SAN_DEVICENAME);
+        if (rc)
+                goto err3;
+
+        return 0;
+err3:
+        class_unregister_type(OBD_FILTER_DEVICENAME);
+err2:
+        kmem_cache_destroy(filter_dentry_cache);
+err1:
+        kmem_cache_destroy(filter_open_cache);
+        return rc;
 }
 
 static void __exit obdfilter_exit(void)
 {
+        class_unregister_type(OBD_FILTER_SAN_DEVICENAME);
         class_unregister_type(OBD_FILTER_DEVICENAME);
         if (kmem_cache_destroy(filter_dentry_cache))
                 CERROR("couldn't free obdfilter dentry cache\n");
index ad92f83..c4e0747 100644 (file)
  */
 #define DEBUG_SUBSYSTEM S_CLASS
 
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <asm/statfs.h>
+#endif
 #include <linux/lprocfs_status.h>
 #include <linux/obd.h>
 
@@ -32,6 +36,7 @@ struct lprocfs_vars lprocfs_module_vars[] = { {0} };
 static inline int lprocfs_filter_statfs(void *data, struct statfs *sfs)
 {
         struct obd_device *dev = (struct obd_device *) data;
+        LASSERT(dev != NULL); /* data is dereferenced as an obd_device below */
         return vfs_statfs(dev->u.filter.fo_sb, sfs); /* fills sfs from the filter's superblock */
 }
 
@@ -46,6 +51,7 @@ int rd_fstype(char *page, char **start, off_t off, int count, int *eof,
               void *data)
 {
         struct obd_device *dev = (struct obd_device *)data;
+        LASSERT(dev != NULL);
         return snprintf(page, count, "%s\n", dev->u.filter.fo_fstype);
 }
 
index 2348a5b..19fd65c 100644 (file)
@@ -5,12 +5,18 @@
 
 DEFS=
 
+
+if LIBLUSTRE
+lib_LIBRARIES = libosc.a
+LINX= obd_pack.c client.c
+libosc_a_SOURCES = osc_request.c  $(LINX)
+else
 MODULE = osc
 modulefs_DATA = osc.o
 EXTRA_PROGRAMS = osc
-
 LINX= obd_pack.c client.c
 osc_SOURCES = osc_request.c lproc_osc.c $(LINX)
+endif
 
 obd_pack.c: 
        test -e obd_pack.c || ln -sf $(top_srcdir)/lib/obd_pack.c
index 69af4bc..d5e4ec1 100644 (file)
  */
 #define DEBUG_SUBSYSTEM S_CLASS
 
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <asm/statfs.h>
+#endif
 #include <linux/obd_class.h>
 #include <linux/lprocfs_status.h>
 
index 1abd150..ea205a6 100644 (file)
@@ -29,6 +29,7 @@
 #define EXPORT_SYMTAB
 #define DEBUG_SUBSYSTEM S_OSC
 
+#ifdef __KERNEL__
 #include <linux/version.h>
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/lustre_dlm.h>
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
 #include <linux/workqueue.h>
+#include <linux/smp_lock.h>
+#else
+#include <linux/locks.h>
 #endif
+#else
+#include <liblustre.h>
+#endif
+
 #include <linux/kp30.h>
 #include <linux/lustre_mds.h> /* for mds_objid */
 #include <linux/obd_ost.h>
 #include <portals/lib-types.h> /* for PTL_MD_MAX_IOV */
 #include <linux/lprocfs_status.h>
 
+/* It is important that ood_fh remain the first item in this structure: that
+ * way, we don't have to re-pack the obdo's inline data before we send it to
+ * the server; we can just send the whole struct unaltered. */
+#define OSC_OBDO_DATA_MAGIC 0xD15EA5ED
+struct osc_obdo_data {
+        struct lustre_handle ood_fh; /* copied from obdo_handle(oa) in osc_open() */
+        struct ptlrpc_request *ood_request; /* open request, referenced in osc_open() */
+        __u32 ood_magic; /* OSC_OBDO_DATA_MAGIC; asserted in osc_close() */
+};
+#include <linux/obd_lov.h> /* just for the startup assertion; is that wrong? */
+
+static int send_sync(struct obd_import *imp, struct ll_fid *rootfid,
+                          int level, int msg_flags)
+{ /* queues an OST_SYNCFS request on imp; rootfid is not used in this body */
+        struct ptlrpc_request *req;
+        struct mds_body *body;
+        int rc, size = sizeof(*body);
+        ENTRY;
+
+        req = ptlrpc_prep_req(imp, OST_SYNCFS, 1, &size, NULL);
+        if (!req)
+                GOTO(out, rc = -ENOMEM); /* NOTE(review): out: then passes NULL to ptlrpc_req_finished() */
+
+        body = lustre_msg_buf(req->rq_reqmsg, 0); /* NOTE(review): body is never used after this */
+        req->rq_level = level;
+        req->rq_replen = lustre_msg_size(1, &size);
+
+        req->rq_reqmsg->flags |= msg_flags;
+        rc = ptlrpc_queue_wait(req);
+
+        if (!rc) {
+                CDEBUG(D_NET, "last_committed="LPU64
+                       ", last_xid="LPU64"\n",
+                       req->rq_repmsg->last_committed,
+                       req->rq_repmsg->last_xid);
+        }
+
+        EXIT;
+ out:
+        ptlrpc_req_finished(req);
+        return rc;
+}
+
+static int signal_completed_replay(struct obd_import *imp)
+{ /* calls send_sync() at LUSTRE_CONN_RECOVD level with MSG_LAST_REPLAY set */
+        struct ll_fid fid; /* left uninitialized; send_sync() does not read it */
+
+        return send_sync(imp, &fid, LUSTRE_CONN_RECOVD, MSG_LAST_REPLAY);
+}
+
 static int osc_attach(struct obd_device *dev, obd_count len, void *data)
 {
         struct lprocfs_static_vars lvars;
@@ -123,7 +181,8 @@ static int osc_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp,
         RETURN(lsm_size);
 }
 
-inline void oti_from_request(struct obd_trans_info *oti, struct ptlrpc_request *req)
+inline void oti_from_request(struct obd_trans_info *oti,
+                             struct ptlrpc_request *req)
 {
         if (oti && req->rq_repmsg)
                 oti->oti_transno = NTOH__u64(req->rq_repmsg->transno);
@@ -178,7 +237,7 @@ static int osc_open(struct lustre_handle *conn, struct obdo *oa,
         if (!request)
                 RETURN(-ENOMEM);
 
-#warning FIXME: request->rq_flags |= PTL_RPC_FL_REPLAY;
+        request->rq_flags |= PTL_RPC_FL_REPLAY;
         body = lustre_msg_buf(request->rq_reqmsg, 0);
 #warning FIXME: pack only valid fields instead of memcpy, endianness
         memcpy(&body->oa, oa, sizeof(*oa));
@@ -189,11 +248,28 @@ static int osc_open(struct lustre_handle *conn, struct obdo *oa,
         if (rc)
                 GOTO(out, rc);
 
-        body = lustre_msg_buf(request->rq_repmsg, 0);
-        CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
-        if (oa)
+        if (oa) {
+                struct osc_obdo_data ood;
+                body = lustre_msg_buf(request->rq_repmsg, 0);
                 memcpy(oa, &body->oa, sizeof(*oa));
 
+                /* If the open succeeded, we better have a handle */
+                /* BlueArc OSTs don't send back (o_valid | FLHANDLE).  sigh.
+                 * Temporary workaround until fixed. -phil 24 Feb 03 */
+                //LASSERT(oa->o_valid & OBD_MD_FLHANDLE);
+                oa->o_valid |= OBD_MD_FLHANDLE;
+
+                memcpy(&ood.ood_fh, obdo_handle(oa), sizeof(ood.ood_fh));
+                ood.ood_request = ptlrpc_request_addref(request);
+                ood.ood_magic = OSC_OBDO_DATA_MAGIC;
+
+                /* Save this data in the request; it will be passed back to us
+                 * in future obdos.  This memcpy is guaranteed to be safe,
+                 * because we check at compile-time that sizeof(ood) is smaller
+                 * than oa->o_inline. */
+                memcpy(&oa->o_inline, &ood, sizeof(ood));
+        }
+
         EXIT;
  out:
         ptlrpc_req_finished(request);
@@ -203,13 +279,19 @@ static int osc_open(struct lustre_handle *conn, struct obdo *oa,
 static int osc_close(struct lustre_handle *conn, struct obdo *oa,
                      struct lov_stripe_md *md, struct obd_trans_info *oti)
 {
+        struct obd_import *import = class_conn2cliimp(conn);
         struct ptlrpc_request *request;
         struct ost_body *body;
+        struct osc_obdo_data *ood;
+        unsigned long flags;
         int rc, size = sizeof(*body);
         ENTRY;
 
-        request = ptlrpc_prep_req(class_conn2cliimp(conn), OST_CLOSE, 1, &size,
-                                  NULL);
+        LASSERT(oa != NULL);
+        ood = (struct osc_obdo_data *)&oa->o_inline;
+        LASSERT(ood->ood_magic == OSC_OBDO_DATA_MAGIC);
+
+        request = ptlrpc_prep_req(import, OST_CLOSE, 1, &size, NULL);
         if (!request)
                 RETURN(-ENOMEM);
 
@@ -220,13 +302,30 @@ static int osc_close(struct lustre_handle *conn, struct obdo *oa,
         request->rq_replen = lustre_msg_size(1, &size);
 
         rc = ptlrpc_queue_wait(request);
-        if (rc)
+        if (rc) {
+                /* FIXME: Does this mean that the file is still open locally?
+                 * If not, and I somehow suspect not, we need to cleanup
+                 * below */
                 GOTO(out, rc);
+        }
+
+        spin_lock_irqsave(&import->imp_lock, flags);
+        ood->ood_request->rq_flags &= ~PTL_RPC_FL_REPLAY;
+        /* see comments in llite/file.c:ll_mdc_close() */
+        if (ood->ood_request->rq_transno) {
+                LBUG(); /* this can't happen yet */
+                if (!request->rq_transno) {
+                        request->rq_transno = ood->ood_request->rq_transno;
+                        ptlrpc_retain_replayable_request(request, import);
+                }
+                spin_unlock_irqrestore(&import->imp_lock, flags);
+        } else {
+                spin_unlock_irqrestore(&import->imp_lock, flags);
+                ptlrpc_req_finished(ood->ood_request);
+        }
 
         body = lustre_msg_buf(request->rq_repmsg, 0);
-        CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
-        if (oa)
-                memcpy(oa, &body->oa, sizeof(*oa));
+        memcpy(oa, &body->oa, sizeof(*oa));
 
         EXIT;
  out:
@@ -401,7 +500,6 @@ static void unmap_and_decref_bulk_desc(void *data)
         struct list_head *tmp;
         ENTRY;
 
-        /* This feels wrong to me. */
         list_for_each(tmp, &desc->bd_page_list) {
                 struct ptlrpc_bulk_page *bulk;
                 bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link);
@@ -435,6 +533,27 @@ static void osc_ptl_ev_hdlr(struct ptlrpc_bulk_desc *desc)
         EXIT;
 }
 
+/*
+ * This is called when there was a bulk error return.  However, we don't know
+ * whether the bulk completed or not.  We cancel the portals bulk descriptors,
+ * so that if the OST decides to send them later we don't double free.  Then
+ * remove this descriptor from the set so that the set callback doesn't wait
+ * forever for the last CB_PHASE_FINISH to be called, and finally dump all of
+ * the bulk descriptor references.
+ */
+static void osc_ptl_ev_abort(struct ptlrpc_bulk_desc *desc)
+{
+        ENTRY;
+
+        LASSERT(desc->bd_brw_set != NULL); /* desc must be attached to a brw set */
+
+        ptlrpc_abort_bulk(desc); /* cancel the portals bulk descriptors */
+        obd_brw_set_del(desc); /* remove desc from its set */
+        unmap_and_decref_bulk_desc(desc); /* drop the bulk descriptor references */
+
+        EXIT;
+}
+
 static int osc_brw_read(struct lustre_handle *conn, struct lov_stripe_md *lsm,
                         obd_count page_count, struct brw_page *pga,
                         struct obd_brw_set *set)
@@ -445,12 +564,12 @@ static int osc_brw_read(struct lustre_handle *conn, struct lov_stripe_md *lsm,
         struct ptlrpc_bulk_desc *desc = NULL;
         struct ost_body *body;
         int rc, size[3] = {sizeof(*body)}, mapped = 0;
-        unsigned long flags;
         struct obd_ioobj *iooptr;
         void *nioptr;
         __u32 xid;
         ENTRY;
 
+restart_bulk:
         size[1] = sizeof(struct obd_ioobj);
         size[2] = page_count * sizeof(struct niobuf_remote);
 
@@ -459,6 +578,7 @@ static int osc_brw_read(struct lustre_handle *conn, struct lov_stripe_md *lsm,
                 RETURN(-ENOMEM);
 
         body = lustre_msg_buf(request->rq_reqmsg, 0);
+        body->oa.o_valid = HTON__u32(OBD_MD_FLCKSUM * CHECKSUM_BULK);
 
         desc = ptlrpc_prep_bulk(connection);
         if (!desc)
@@ -472,16 +592,16 @@ static int osc_brw_read(struct lustre_handle *conn, struct lov_stripe_md *lsm,
         ost_pack_ioo(&iooptr, lsm, page_count);
         /* end almost identical to brw_write case */
 
-        spin_lock_irqsave(&imp->imp_lock, flags);
-        xid = ++imp->imp_last_xid;       /* single xid for all pages */
-        spin_unlock_irqrestore(&imp->imp_lock, flags);
+        xid = ptlrpc_next_xid();       /* single xid for all pages */
 
         obd_kmap_get(page_count, 0);
 
         for (mapped = 0; mapped < page_count; mapped++) {
                 struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc);
-                if (bulk == NULL)
-                        GOTO(out_unmap, rc = -ENOMEM);
+                if (bulk == NULL) {
+                        unmap_and_decref_bulk_desc(desc);
+                        GOTO(out_req, rc = -ENOMEM);
+                }
 
                 bulk->bp_xid = xid;           /* single xid for all pages */
 
@@ -496,8 +616,8 @@ static int osc_brw_read(struct lustre_handle *conn, struct lov_stripe_md *lsm,
          * Register the bulk first, because the reply could arrive out of order,
          * and we want to be ready for the bulk data.
          *
-         * One reference is released when brw_finish is complete, the other when
-         * the caller removes us from the "set" list.
+         * One reference is released when osc_ptl_ev_hdlr() is called by
+         * portals, the other when the caller removes us from the "set" list.
          *
          * On error, we never do the brw_finish, so we handle all decrefs.
          */
@@ -506,38 +626,70 @@ static int osc_brw_read(struct lustre_handle *conn, struct lov_stripe_md *lsm,
                        OBD_FAIL_OSC_BRW_READ_BULK);
         } else {
                 rc = ptlrpc_register_bulk_put(desc);
-                if (rc)
-                        GOTO(out_unmap, rc);
+                if (rc) {
+                        unmap_and_decref_bulk_desc(desc);
+                        GOTO(out_req, rc);
+                }
                 obd_brw_set_add(set, desc);
         }
 
+        request->rq_flags |= PTL_RPC_FL_NO_RESEND;
         request->rq_replen = lustre_msg_size(1, size);
         rc = ptlrpc_queue_wait(request);
 
-        /*
-         * XXX: If there is an error during the processing of the callback,
-         *      such as a timeout in a sleep that it performs, brw_finish
-         *      will never get called, and we'll leak the desc, fail to kunmap
-         *      things, cats will live with dogs.  One solution would be to
-         *      export brw_finish as osc_brw_finish, so that the timeout case
-         *      and its kin could call it for proper cleanup.  An alternative
-         *      would be for an error return from the callback to cause us to
-         *      clean up, but that doesn't help the truly async cases (like
-         *      LOV), which will immediately return from their PHASE_START
-         *      callback, before any such cleanup-requiring error condition can
-         *      be detected.
-         */
+        /* XXX bug 937 here */
+        if (rc == -ETIMEDOUT && (request->rq_flags & PTL_RPC_FL_RESEND)) {
+                DEBUG_REQ(D_HA, request,  "BULK TIMEOUT");
+                ptlrpc_req_finished(request);
+                goto restart_bulk;
+        }
+
+        if (rc) {
+                osc_ptl_ev_abort(desc);
+                GOTO(out_req, rc);
+        }
+
+#if CHECKSUM_BULK
+        body = lustre_msg_buf(request->rq_repmsg, 0);
+        if (body->oa.o_valid & NTOH__u32(OBD_MD_FLCKSUM)) {
+                static int cksum_counter;
+                __u64 server_cksum = NTOH__u64(body->oa.o_rdev);
+                __u64 cksum = 0;
+
+                for (mapped = 0; mapped < page_count; mapped++) {
+                        char *ptr = kmap(pga[mapped].pg);
+                        int   off = pga[mapped].off & (PAGE_SIZE - 1);
+                        int   len = pga[mapped].count;
+
+                        LASSERT(off + len <= PAGE_SIZE);
+                        ost_checksum(&cksum, ptr + off, len);
+                        kunmap(pga[mapped].pg);
+                }
+
+                cksum_counter++;
+                if (server_cksum != cksum) {
+                        CERROR("Bad checksum: server "LPX64", client "LPX64
+                               ", server NID "LPX64"\n", server_cksum, cksum,
+                               imp->imp_connection->c_peer.peer_nid);
+                        cksum_counter = 0;
+                } else if ((cksum_counter & (-cksum_counter)) == cksum_counter)
+                        CERROR("Checksum %u from "LPX64" OK: "LPX64"\n",
+                               cksum_counter,
+                               imp->imp_connection->c_peer.peer_nid, cksum);
+        } else {
+                static int cksum_missed;
+                cksum_missed++;
+                if ((cksum_missed & (-cksum_missed)) == cksum_missed)
+                        CERROR("Request checksum %u from "LPX64", no reply\n",
+                               cksum_missed,
+                               imp->imp_connection->c_peer.peer_nid);
+        }
+#endif
+
+        EXIT;
  out_req:
         ptlrpc_req_finished(request);
-        RETURN(rc);
-
-        /* Clean up on error. */
-out_unmap:
-        while (mapped-- > 0)
-                kunmap(pga[mapped].pg);
-        obd_kmap_put(page_count);
-        ptlrpc_bulk_decref(desc);
-        goto out_req;
+        return rc;
 }
 
 static int osc_brw_write(struct lustre_handle *conn, struct lov_stripe_md *lsm,
@@ -550,12 +702,15 @@ static int osc_brw_write(struct lustre_handle *conn, struct lov_stripe_md *lsm,
         struct ptlrpc_bulk_desc *desc = NULL;
         struct ost_body *body;
         int rc, size[3] = {sizeof(*body)}, mapped = 0;
-        unsigned long flags;
         struct obd_ioobj *iooptr;
         void *nioptr;
         __u32 xid;
+#if CHECKSUM_BULK
+        __u64 cksum = 0;
+#endif
         ENTRY;
 
+restart_bulk:
         size[1] = sizeof(struct obd_ioobj);
         size[2] = page_count * sizeof(struct niobuf_remote);
 
@@ -577,26 +732,31 @@ static int osc_brw_write(struct lustre_handle *conn, struct lov_stripe_md *lsm,
         ost_pack_ioo(&iooptr, lsm, page_count);
         /* end almost identical to brw_read case */
 
-        spin_lock_irqsave(&imp->imp_lock, flags);
-        xid = ++imp->imp_last_xid;       /* single xid for all pages */
-        spin_unlock_irqrestore(&imp->imp_lock, flags);
+        xid = ptlrpc_next_xid();       /* single xid for all pages */
 
         obd_kmap_get(page_count, 0);
 
         for (mapped = 0; mapped < page_count; mapped++) {
                 struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc);
-                if (bulk == NULL)
-                        GOTO(out_unmap, rc = -ENOMEM);
+                if (bulk == NULL) {
+                        unmap_and_decref_bulk_desc(desc);
+                        GOTO(out_req, rc = -ENOMEM);
+                }
 
                 bulk->bp_xid = xid;           /* single xid for all pages */
 
                 bulk->bp_buf = kmap(pga[mapped].pg);
                 bulk->bp_page = pga[mapped].pg;
-                bulk->bp_buflen = PAGE_SIZE;
+                bulk->bp_buflen = pga[mapped].count;
                 ost_pack_niobuf(&nioptr, pga[mapped].off, pga[mapped].count,
                                 pga[mapped].flag, bulk->bp_xid);
+                ost_checksum(&cksum, bulk->bp_buf, bulk->bp_buflen);
         }
 
+#if CHECKSUM_BULK
+        body->oa.o_rdev = HTON__u64(cksum);
+        body->oa.o_valid |= HTON__u32(OBD_MD_FLCKSUM);
+#endif
         /*
          * Register the bulk first, because the reply could arrive out of
          * order, and we want to be ready for the bulk data.
@@ -608,47 +768,363 @@ static int osc_brw_write(struct lustre_handle *conn, struct lov_stripe_md *lsm,
          */
         if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_WRITE_BULK)) {
                 CERROR("obd_fail_loc=%x, skipping register_bulk\n",
-                OBD_FAIL_OSC_BRW_WRITE_BULK);
+                       OBD_FAIL_OSC_BRW_WRITE_BULK);
         } else {
                 rc = ptlrpc_register_bulk_get(desc);
-                if (rc)
-                        GOTO(out_unmap, rc);
+                if (rc) {
+                        unmap_and_decref_bulk_desc(desc);
+                        GOTO(out_req, rc);
+                }
                 obd_brw_set_add(set, desc);
         }
 
+        request->rq_flags |= PTL_RPC_FL_NO_RESEND;
         request->rq_replen = lustre_msg_size(1, size);
         rc = ptlrpc_queue_wait(request);
 
-        /*
-         * XXX: If there is an error during the processing of the callback,
-         *      such as a timeout in a sleep that it performs, brw_finish
-         *      will never get called, and we'll leak the desc, fail to kunmap
-         *      things, cats will live with dogs.  One solution would be to
-         *      export brw_finish as osc_brw_finish, so that the timeout case
-         *      and its kin could call it for proper cleanup.  An alternative
-         *      would be for an error return from the callback to cause us to
-         *      clean up, but that doesn't help the truly async cases (like
-         *      LOV), which will immediately return from their PHASE_START
-         *      callback, before any such cleanup-requiring error condition can
-         *      be detected.
-         */
+        /* XXX bug 937 here */
+        if (rc == -ETIMEDOUT && (request->rq_flags & PTL_RPC_FL_RESEND)) {
+                DEBUG_REQ(D_HA, request,  "BULK TIMEOUT");
+                ptlrpc_req_finished(request);
+                goto restart_bulk;
+        }
+
+        if (rc) {
+                osc_ptl_ev_abort(desc);
+                GOTO(out_req, rc);
+        }
+
+        EXIT;
  out_req:
         ptlrpc_req_finished(request);
+        return rc;
+}
+
+#ifndef min_t
+/* Fallback for builds whose headers do not supply min_t(): yields the smaller
+ * of x and y after converting both to `type`.  The arguments and the whole
+ * expansion are parenthesized so the macro can be embedded in a larger
+ * expression (OSC_BRW_MAX_IOV is compared against page_count).  Note that x
+ * and y are evaluated more than once. */
+#define min_t(type, x, y) \
+        ((type)(x) < (type)(y) ? (type)(x) : (type)(y))
+#endif
+
+#warning "FIXME: make values dynamic based on get_info at setup (bug 665)"
+#define OSC_BRW_MAX_SIZE 65536
+#define OSC_BRW_MAX_IOV min_t(int, PTL_MD_MAX_IOV, OSC_BRW_MAX_SIZE/PAGE_SIZE)
+
+/*
+ * Issue a bulk read or write for page_count pages, splitting the work into
+ * requests of at most OSC_BRW_MAX_IOV pages each.
+ *
+ * cmd selects the direction: when the OBD_BRW_WRITE bit is set each chunk is
+ * handed to osc_brw_write() (which also receives oti), otherwise to
+ * osc_brw_read().  Returns 0 once every chunk has been submitted, or the
+ * first non-zero result returned for a chunk; later chunks are not attempted.
+ */
+static int osc_brw(int cmd, struct lustre_handle *conn,
+                   struct lov_stripe_md *md, obd_count page_count,
+                   struct brw_page *pga, struct obd_brw_set *set,
+                   struct obd_trans_info *oti)
+{
+        obd_count chunk;
+        int rc;
+        ENTRY;
+
+        /* Walk the page array one chunk at a time. */
+        for (; page_count != 0; page_count -= chunk, pga += chunk) {
+                chunk = page_count;
+                if (chunk > OSC_BRW_MAX_IOV)
+                        chunk = OSC_BRW_MAX_IOV;
+
+                rc = (cmd & OBD_BRW_WRITE) ?
+                        osc_brw_write(conn, md, chunk, pga, set, oti) :
+                        osc_brw_read(conn, md, chunk, pga, set);
+                if (rc != 0)
+                        RETURN(rc);
+        }
+
+        RETURN(0);
+}
+
+#ifdef __KERNEL__
+/* Note: caller will lock/unlock, and set uptodate on the pages */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+/*
+ * SAN read path for kernels older than 2.5.0.
+ *
+ * Sends an OST_SAN_READ request describing the pages in pga[] and expects a
+ * reply whose second buffer holds one niobuf_remote per page.  The offset
+ * field of each reply entry is used as the block number on the client's SAN
+ * device (cl_sandev); an offset of zero is treated as a hole and the page is
+ * zero-filled.  Every other page is read synchronously through its buffer
+ * head with ll_rw_block().
+ *
+ * Every page must already be locked (see the LASSERT below).  The `set`
+ * argument is not used by this function.
+ *
+ * Returns 0 on success, -ENOMEM if the request cannot be allocated, the
+ * error from ptlrpc_queue_wait(), -EINVAL for a missing or wrongly sized
+ * reply buffer, or -EIO when a buffer is not up to date after the read.
+ *
+ * The pages are kmap()ed before the RPC and are unmapped again on every
+ * path once mapping has started, success included.
+ */
+static int sanosc_brw_read(struct lustre_handle *conn,
+                           struct lov_stripe_md *md,
+                           obd_count page_count,
+                           struct brw_page *pga,
+                           struct obd_brw_set *set)
+{
+        struct ptlrpc_request *request = NULL;
+        struct ost_body *body;
+        struct niobuf_remote *remote, *nio_rep;
+        int rc, j, size[3] = {sizeof(*body)}, mapped = 0;
+        struct obd_ioobj *iooptr;
+        void *nioptr;
+        ENTRY;
+
+        size[1] = sizeof(struct obd_ioobj);
+        size[2] = page_count * sizeof(*remote);
+
+        request = ptlrpc_prep_req(class_conn2cliimp(conn), OST_SAN_READ, 3,
+                                  size, NULL);
+        if (!request)
+                RETURN(-ENOMEM);
+
+        body = lustre_msg_buf(request->rq_reqmsg, 0);
+        iooptr = lustre_msg_buf(request->rq_reqmsg, 1);
+        nioptr = lustre_msg_buf(request->rq_reqmsg, 2);
+        ost_pack_ioo(&iooptr, md, page_count);
+
+        obd_kmap_get(page_count, 0);
+
+        /* map pages, and pack one niobuf per page into the request */
+        for (mapped = 0; mapped < page_count; mapped++) {
+                LASSERT(PageLocked(pga[mapped].pg));
+
+                kmap(pga[mapped].pg);
+                ost_pack_niobuf(&nioptr, pga[mapped].off, pga[mapped].count,
+                                pga[mapped].flag, 0);
+        }
+
+        /* the reply carries the body plus one niobuf_remote per page */
+        size[1] = page_count * sizeof(*remote);
+        request->rq_replen = lustre_msg_size(2, size);
+
+        rc = ptlrpc_queue_wait(request);
+        if (rc)
+                GOTO(out_unmap, rc);
+
+        nioptr = lustre_msg_buf(request->rq_repmsg, 1);
+        if (!nioptr)
+                GOTO(out_unmap, rc = -EINVAL);
+
+        if (request->rq_repmsg->buflens[1] != size[1]) {
+                CERROR("buffer length wrong (%d vs. %d)\n",
+                       request->rq_repmsg->buflens[1], size[1]);
+                GOTO(out_unmap, rc = -EINVAL);
+        }
+
+        /* unpack every reply entry; the unpacker advances nioptr, so it is
+         * reset to the start of the buffer before the entries are used */
+        for (j = 0; j < page_count; j++) {
+                ost_unpack_niobuf(&nioptr, &remote);
+        }
+
+        nioptr = lustre_msg_buf(request->rq_repmsg, 1);
+        nio_rep = (struct niobuf_remote*)nioptr;
+
+        /* actual read */
+        for (j = 0; j < page_count; j++) {
+                struct page *page = pga[j].pg;
+                struct buffer_head *bh;
+                kdev_t dev;
+
+                /* got san device associated */
+                LASSERT(class_conn2obd(conn));
+                dev = class_conn2obd(conn)->u.cli.cl_sandev;
+
+                /* hole */
+                if (!nio_rep[j].offset) {
+                        CDEBUG(D_PAGE, "hole at ino %lu; index %ld\n",
+                                        page->mapping->host->i_ino,
+                                        page->index);
+                        memset(page_address(page), 0, PAGE_SIZE);
+                        continue;
+                }
+
+                if (!page->buffers) {
+                        create_empty_buffers(page, dev, PAGE_SIZE);
+                        bh = page->buffers;
+
+                        clear_bit(BH_New, &bh->b_state);
+                        set_bit(BH_Mapped, &bh->b_state);
+                        bh->b_blocknr = (unsigned long)nio_rep[j].offset;
+
+                        clear_bit(BH_Uptodate, &bh->b_state);
+
+                        ll_rw_block(READ, 1, &bh);
+                } else {
+                        bh = page->buffers;
+
+                        /* if buffer already existed, it must be the
+                         * one we mapped before, check it */
+                        LASSERT(!test_bit(BH_New, &bh->b_state));
+                        LASSERT(test_bit(BH_Mapped, &bh->b_state));
+                        LASSERT(bh->b_blocknr ==
+                                (unsigned long)nio_rep[j].offset);
+
+                        /* wait for its I/O completion */
+                        if (test_bit(BH_Lock, &bh->b_state))
+                                wait_on_buffer(bh);
+
+                        if (!test_bit(BH_Uptodate, &bh->b_state))
+                                ll_rw_block(READ, 1, &bh);
+                }
+
+
+                /* the read is synchronous: wait for it to complete */
+                wait_on_buffer(bh);
+                if (!buffer_uptodate(bh)) {
+                        /* I/O error */
+                        rc = -EIO;
+                        goto out_unmap;
+                }
+        }
+
+out_unmap:
+        /* Reached on success as well as on failure: nothing else undoes the
+         * kmap() calls or the obd_kmap_get() reservation made above, so they
+         * must be released here before the request is dropped. */
+        while (mapped-- > 0)
+                kunmap(pga[mapped].pg);
+
+        obd_kmap_put(page_count);
+
+        ptlrpc_req_finished(request);
         RETURN(rc);
-
-        /* Clean up on error. */
+}
+
+/*
+ * SAN write path for kernels older than 2.5.0.
+ *
+ * Sends an OST_SAN_WRITE request describing the pages in pga[] and expects a
+ * reply whose second buffer holds one niobuf_remote per page.  The offset
+ * field of each reply entry is used as the block number on the client's SAN
+ * device (cl_sandev); each page is then written synchronously through its
+ * buffer head with ll_rw_block().
+ *
+ * Every page must already be locked (see the LASSERT below).  The `set`
+ * argument is not used by this function.
+ *
+ * Returns 0 on success, -ENOMEM if the request cannot be allocated, the
+ * error from ptlrpc_queue_wait(), -EINVAL for a missing or wrongly sized
+ * reply buffer, or -EIO when a buffer is not up to date or is still dirty
+ * after the write.
+ *
+ * The pages are kmap()ed before the RPC and are unmapped again on every
+ * path once mapping has started, success included.
+ */
+static int sanosc_brw_write(struct lustre_handle *conn,
+                            struct lov_stripe_md *md,
+                            obd_count page_count,
+                            struct brw_page *pga,
+                            struct obd_brw_set *set)
+{
+        struct ptlrpc_request *request = NULL;
+        struct ost_body *body;
+        struct niobuf_remote *remote, *nio_rep;
+        int rc, j, size[3] = {sizeof(*body)}, mapped = 0;
+        struct obd_ioobj *iooptr;
+        void *nioptr;
+        ENTRY;
+
+        size[1] = sizeof(struct obd_ioobj);
+        size[2] = page_count * sizeof(*remote);
+
+        request = ptlrpc_prep_req(class_conn2cliimp(conn), OST_SAN_WRITE,
+                                  3, size, NULL);
+        if (!request)
+                RETURN(-ENOMEM);
+
+        body = lustre_msg_buf(request->rq_reqmsg, 0);
+        iooptr = lustre_msg_buf(request->rq_reqmsg, 1);
+        nioptr = lustre_msg_buf(request->rq_reqmsg, 2);
+        ost_pack_ioo(&iooptr, md, page_count);
+
+        /* map pages, and pack request */
+        obd_kmap_get(page_count, 0);
+        for (mapped = 0; mapped < page_count; mapped++) {
+                LASSERT(PageLocked(pga[mapped].pg));
+
+                kmap(pga[mapped].pg);
+                ost_pack_niobuf(&nioptr, pga[mapped].off, pga[mapped].count,
+                                pga[mapped].flag, 0);
+        }
+
+        /* the reply carries the body plus one niobuf_remote per page */
+        size[1] = page_count * sizeof(*remote);
+        request->rq_replen = lustre_msg_size(2, size);
+
+        rc = ptlrpc_queue_wait(request);
+        if (rc)
+                GOTO(out_unmap, rc);
+
+        nioptr = lustre_msg_buf(request->rq_repmsg, 1);
+        if (!nioptr)
+                GOTO(out_unmap, rc = -EINVAL);
+
+        if (request->rq_repmsg->buflens[1] != size[1]) {
+                CERROR("buffer length wrong (%d vs. %d)\n",
+                       request->rq_repmsg->buflens[1], size[1]);
+                GOTO(out_unmap, rc = -EINVAL);
+        }
+
+        /* unpack every reply entry; the unpacker advances nioptr, so it is
+         * reset to the start of the buffer before the entries are used */
+        for (j = 0; j < page_count; j++) {
+                ost_unpack_niobuf(&nioptr, &remote);
+        }
+
+        nioptr = lustre_msg_buf(request->rq_repmsg, 1);
+        nio_rep = (struct niobuf_remote*)nioptr;
+
+        /* actual write */
+        for (j = 0; j < page_count; j++) {
+                struct page *page = pga[j].pg;
+                struct buffer_head *bh;
+                kdev_t dev;
+
+                /* got san device associated */
+                LASSERT(class_conn2obd(conn));
+                dev = class_conn2obd(conn)->u.cli.cl_sandev;
+
+                if (!page->buffers) {
+                        create_empty_buffers(page, dev, PAGE_SIZE);
+                } else {
+                        /* checking */
+                        LASSERT(!test_bit(BH_New, &page->buffers->b_state));
+                        LASSERT(test_bit(BH_Mapped, &page->buffers->b_state));
+                        LASSERT(page->buffers->b_blocknr ==
+                                (unsigned long)nio_rep[j].offset);
+                }
+                bh = page->buffers;
+
+                LASSERT(bh);
+
+                /* if buffer locked, wait for its I/O completion */
+                if (test_bit(BH_Lock, &bh->b_state))
+                        wait_on_buffer(bh);
+
+                clear_bit(BH_New, &bh->b_state);
+                set_bit(BH_Mapped, &bh->b_state);
+
+                /* override the block nr */
+                bh->b_blocknr = (unsigned long)nio_rep[j].offset;
+
+                /* we are about to write it, so set it
+                 * uptodate/dirty
+                 * page lock should guarantee no race condition here */
+                set_bit(BH_Uptodate, &bh->b_state);
+                set_bit(BH_Dirty, &bh->b_state);
+
+                ll_rw_block(WRITE, 1, &bh);
+
+                /* the write is synchronous: wait for it to complete */
+                wait_on_buffer(bh);
+                if (!buffer_uptodate(bh) || test_bit(BH_Dirty, &bh->b_state)) {
+                        /* I/O error */
+                        rc = -EIO;
+                        goto out_unmap;
+                }
+        }
+
 out_unmap:
+        /* Reached on success as well as on failure: nothing else undoes the
+         * kmap() calls or the obd_kmap_get() reservation made above, so they
+         * must be released here before the request is dropped. */
         while (mapped-- > 0)
                 kunmap(pga[mapped].pg);
+
         obd_kmap_put(page_count);
-        ptlrpc_bulk_decref(desc);
-        goto out_req;
+
+        ptlrpc_req_finished(request);
+        RETURN(rc);
 }
+#else
+/* Placeholder used when LINUX_VERSION_CODE is not below 2.5.0 (the #else
+ * branch of the version check above): the SAN read path is not implemented
+ * here, so this calls LBUG() and then returns 0 without using any of its
+ * arguments. */
+static int sanosc_brw_read(struct lustre_handle *conn,
+                           struct lov_stripe_md *md,
+                           obd_count page_count,
+                           struct brw_page *pga,
+                           struct obd_brw_set *set)
+{
+        LBUG();
+        return 0;
+}
 
-static int osc_brw(int cmd, struct lustre_handle *conn,
-                   struct lov_stripe_md *md, obd_count page_count,
-                   struct brw_page *pga, struct obd_brw_set *set, 
-                   struct obd_trans_info *oti)
+/* Placeholder used when LINUX_VERSION_CODE is not below 2.5.0 (the #else
+ * branch of the version check above): the SAN write path is not implemented
+ * here, so this calls LBUG() and then returns 0 without using any of its
+ * arguments. */
+static int sanosc_brw_write(struct lustre_handle *conn,
+                            struct lov_stripe_md *md,
+                            obd_count page_count,
+                            struct brw_page *pga,
+                            struct obd_brw_set *set)
+{
+        LBUG();
+        return 0;
+}
+#endif
+
+static int sanosc_brw(int cmd, struct lustre_handle *conn,
+                      struct lov_stripe_md *md, obd_count page_count,
+                      struct brw_page *pga, struct obd_brw_set *set,
+                      struct obd_trans_info *oti)
 {
         ENTRY;
 
@@ -656,15 +1132,16 @@ static int osc_brw(int cmd, struct lustre_handle *conn,
                 obd_count pages_per_brw;
                 int rc;
 
-                if (page_count > PTL_MD_MAX_IOV)
-                        pages_per_brw = PTL_MD_MAX_IOV;
+                if (page_count > OSC_BRW_MAX_IOV)
+                        pages_per_brw = OSC_BRW_MAX_IOV;
                 else
                         pages_per_brw = page_count;
 
                 if (cmd & OBD_BRW_WRITE)
-                        rc = osc_brw_write(conn, md, pages_per_brw, pga, set, oti);
+                        rc = sanosc_brw_write(conn, md, pages_per_brw,
+                                              pga, set);
                 else
-                        rc = osc_brw_read(conn, md, pages_per_brw, pga, set);
+                        rc = sanosc_brw_read(conn, md, pages_per_brw, pga, set);
 
                 if (rc != 0)
                         RETURN(rc);
@@ -674,6 +1151,7 @@ static int osc_brw(int cmd, struct lustre_handle *conn,
         }
         RETURN(0);
 }
+#endif
 
 static int osc_enqueue(struct lustre_handle *connh, struct lov_stripe_md *lsm,
                        struct lustre_handle *parent_lock,
@@ -906,7 +1384,7 @@ static int osc_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len,
                         GOTO(out, err = -EINVAL);
                 }
 
-                if (data->ioc_inllen2 < sizeof(uuid.uuid)) {
+                if (data->ioc_inllen2 < sizeof(uuid)) {
                         OBD_FREE(buf, len);
                         GOTO(out, err = -EINVAL);
                 }
@@ -918,10 +1396,9 @@ static int osc_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len,
                 desc->ld_default_stripe_size = 0;
                 desc->ld_default_stripe_offset = 0;
                 desc->ld_pattern = 0;
-                memcpy(desc->ld_uuid.uuid,  obddev->obd_uuid.uuid, sizeof(uuid.uuid));
+                memcpy(&desc->ld_uuid, &obddev->obd_uuid, sizeof(uuid));
 
-                memcpy(data->ioc_inlbuf2,  obddev->obd_uuid.uuid, 
-                       sizeof(uuid.uuid));
+                memcpy(data->ioc_inlbuf2, &obddev->obd_uuid, sizeof(uuid));
 
                 err = copy_to_user((void *)uarg, buf, len);
                 if (err)
@@ -967,15 +1444,15 @@ static void set_osc_active(struct obd_import *imp, int active)
 
                 fakeconn.addr = (__u64)(unsigned long)exp;
                 fakeconn.cookie = exp->exp_cookie;
-                ioc_data.ioc_inlbuf1 = &imp->imp_obd->u.cli.cl_target_uuid;
+                ioc_data.ioc_inlbuf1 =
+                        (char *)&imp->imp_obd->u.cli.cl_target_uuid;
                 ioc_data.ioc_offset = active;
                 rc = obd_iocontrol(IOC_LOV_SET_OSC_ACTIVE, &fakeconn,
                                    sizeof ioc_data, &ioc_data, NULL);
-                if (rc) {
-                        CERROR("disabling %s on LOV %p/%s: %d\n",
+                if (rc)
+                        CERROR("error disabling %s on LOV %p/%s: %d\n",
                                imp->imp_obd->u.cli.cl_target_uuid.uuid,
                                notify_obd, notify_obd->obd_uuid.uuid, rc);
-                }
         } else {
                 CDEBUG(D_HA, "No exports for obd %p/%s, can't notify about "
                        "%p\n", notify_obd, notify_obd->obd_uuid.uuid,
@@ -987,30 +1464,86 @@ static int osc_recover(struct obd_import *imp, int phase)
 {
         int rc;
         unsigned long flags;
+        int msg_flags;
         struct ptlrpc_request *req;
+        struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
         ENTRY;
 
+        CDEBUG(D_HA, "%s: entering phase: %d\n",
+               imp->imp_obd->obd_name, phase);
         switch(phase) {
 
             case PTLRPC_RECOVD_PHASE_PREPARE: {
-                struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
-                ldlm_namespace_cleanup(ns, 1 /* no network ops */);
-                ptlrpc_abort_inflight(imp, 0);
-                set_osc_active(imp, 0 /* inactive */);
+                if (imp->imp_flags & IMP_REPLAYABLE) {
+                        CDEBUG(D_HA, "failover OST\n");
+                        /* If we're a failover OSC/OST, just cancel unused
+                         * locks to simplify lock replay.
+                         */
+                        ldlm_cli_cancel_unused(ns, NULL, LDLM_FL_LOCAL_ONLY);
+                } else {
+                        CDEBUG(D_HA, "non-failover OST\n");
+                        /* Non-failover OSTs (LLNL scenario) disable the OSC
+                         * and invalidate local state.
+                         */
+                        ldlm_namespace_cleanup(ns, 1 /* no network ops */);
+                        ptlrpc_abort_inflight(imp, 0);
+                        set_osc_active(imp, 0 /* inactive */);
+                }
                 RETURN(0);
             }
 
-            case PTLRPC_RECOVD_PHASE_RECOVER:
+        case PTLRPC_RECOVD_PHASE_RECOVER: {
+        reconnect:
                 imp->imp_flags &= ~IMP_INVALID;
                 rc = ptlrpc_reconnect_import(imp, OST_CONNECT, &req);
-                ptlrpc_req_finished(req);
+
+                msg_flags = req->rq_repmsg
+                        ? lustre_msg_get_op_flags(req->rq_repmsg)
+                        : 0;
+
+                if (rc == -EBUSY && (msg_flags & MSG_CONNECT_RECOVERING))
+                        CERROR("reconnect denied by recovery; should retry\n");
+
                 if (rc) {
+                        if (phase != PTLRPC_RECOVD_PHASE_NOTCONN) {
+                                CERROR("can't reconnect, invalidating\n");
+                                ldlm_namespace_cleanup(ns, 1);
+                                ptlrpc_abort_inflight(imp, 0);
+                        }
                         imp->imp_flags |= IMP_INVALID;
+                        ptlrpc_req_finished(req);
                         RETURN(rc);
                 }
 
+                if (msg_flags & MSG_CONNECT_RECOVERING) {
+                        /* Replay if they want it. */
+                        DEBUG_REQ(D_HA, req, "OST wants replay");
+                        rc = ptlrpc_replay(imp);
+                        if (rc)
+                                GOTO(check_rc, rc);
+
+                        rc = ldlm_replay_locks(imp);
+                        if (rc)
+                                GOTO(check_rc, rc);
+
+                        rc = signal_completed_replay(imp);
+                        if (rc)
+                                GOTO(check_rc, rc);
+                } else if (msg_flags & MSG_CONNECT_RECONNECT) {
+                        DEBUG_REQ(D_HA, req, "reconnecting to MDS\n");
+                        /* Nothing else to do here. */
+                } else {
+                        DEBUG_REQ(D_HA, req, "evicted: invalidating\n");
+                        /* Otherwise, clean everything up. */
+                        ldlm_namespace_cleanup(ns, 1);
+                        ptlrpc_abort_inflight(imp, 0);
+                }
+
+                ptlrpc_req_finished(req);
+
                 spin_lock_irqsave(&imp->imp_lock, flags);
                 imp->imp_level = LUSTRE_CONN_FULL;
+                imp->imp_flags &= ~IMP_INVALID;
                 spin_unlock_irqrestore(&imp->imp_lock, flags);
 
                 /* Is this the right place?  Should we do this in _PREPARE
@@ -1018,9 +1551,21 @@ static int osc_recover(struct obd_import *imp, int phase)
                  */
                 ptlrpc_wake_delayed(imp);
 
+                rc = ptlrpc_resend(imp);
+                if (rc)
+                        GOTO(check_rc, rc);
+
                 set_osc_active(imp, 1 /* active */);
                 RETURN(0);
 
+        check_rc:
+                /* If we get disconnected in the middle, recovery has probably
+                 * failed.  Reconnect and find out.
+                 */
+                if (rc == -ENOTCONN)
+                        goto reconnect;
+                RETURN(rc);
+        }
             case PTLRPC_RECOVD_PHASE_NOTCONN:
                 osc_recover(imp, PTLRPC_RECOVD_PHASE_PREPARE);
                 RETURN(osc_recover(imp, PTLRPC_RECOVD_PHASE_RECOVER));
@@ -1064,23 +1609,67 @@ struct obd_ops osc_obd_ops = {
         o_iocontrol:    osc_iocontrol
 };
 
-static int __init osc_init(void)
+/* Method table for the SAN variant of the OSC; osc_init() registers it under
+ * LUSTRE_SANOSC_NAME.  Most entries are the osc_* / client_obd_* handlers;
+ * the SAN-specific entries are o_setup (client_sanobd_setup) and o_brw
+ * (sanosc_brw), and both are only filled in when __KERNEL__ is defined. */
+struct obd_ops sanosc_obd_ops = {
+        o_owner:        THIS_MODULE,
+        o_attach:       osc_attach,
+        o_detach:       osc_detach,
+        o_cleanup:      client_obd_cleanup,
+        o_connect:      osc_connect,
+        o_disconnect:   client_obd_disconnect,
+        o_statfs:       osc_statfs,
+        o_packmd:       osc_packmd,
+        o_unpackmd:     osc_unpackmd,
+        o_create:       osc_create,
+        o_destroy:      osc_destroy,
+        o_getattr:      osc_getattr,
+        o_setattr:      osc_setattr,
+        o_open:         osc_open,
+        o_close:        osc_close,
+#ifdef __KERNEL__
+        o_setup:        client_sanobd_setup,
+        o_brw:          sanosc_brw,
+#endif
+        o_punch:        osc_punch,
+        o_enqueue:      osc_enqueue,
+        o_cancel:       osc_cancel,
+        o_cancel_unused: osc_cancel_unused,
+        o_iocontrol:    osc_iocontrol,
+};
+
+/* Module initialisation: register the two obd types implemented by this
+ * file, osc_obd_ops as LUSTRE_OSC_NAME and sanosc_obd_ops as
+ * LUSTRE_SANOSC_NAME, both with the same lprocfs module variables.  If the
+ * second registration fails the first one is undone.  Returns 0 on success
+ * or the non-zero result of the failing class_register_type() call. */
+int __init osc_init(void)
 {
         struct lprocfs_static_vars lvars;
+        int rc;
+        ENTRY;
+
+        /* struct osc_obdo_data must not exceed FD_OSTDATA_SIZE */
+        LASSERT(sizeof(struct osc_obdo_data) <= FD_OSTDATA_SIZE);
 
         lprocfs_init_vars(&lvars);
-        RETURN(class_register_type(&osc_obd_ops, lvars.module_vars,
-                                   LUSTRE_OSC_NAME));
+
+        rc = class_register_type(&osc_obd_ops, lvars.module_vars,
+                                 LUSTRE_OSC_NAME);
+        if (rc)
+                RETURN(rc);
+
+        rc = class_register_type(&sanosc_obd_ops, lvars.module_vars,
+                                 LUSTRE_SANOSC_NAME);
+        if (rc)
+                class_unregister_type(LUSTRE_OSC_NAME);
+
+        RETURN(rc);
 }
 
+/* Module unload: unregister both obd types, in the reverse of the order in
+ * which osc_init() registers them. */
 static void __exit osc_exit(void)
 {
+        class_unregister_type(LUSTRE_SANOSC_NAME);
         class_unregister_type(LUSTRE_OSC_NAME);
 }
 
+#ifdef __KERNEL__
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
 MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)");
 MODULE_LICENSE("GPL");
 
 module_init(osc_init);
 module_exit(osc_exit);
+#endif
index d595757..18a1b85 100644 (file)
 #include <linux/init.h>
 #include <linux/lprocfs_status.h>
 
+/* Store the transaction number held in oti into the reply message of req,
+ * converted with HTON__u64.  Nothing is written when oti is NULL or when the
+ * request has no reply message. */
+inline void oti_to_request(struct obd_trans_info *oti, struct ptlrpc_request *req)
+{
+        if (oti == NULL || req->rq_repmsg == NULL) {
+                EXIT;
+                return;
+        }
+
+        req->rq_repmsg->transno = HTON__u64(oti->oti_transno);
+        EXIT;
+}
 
 static int ost_destroy(struct ptlrpc_request *req, struct obd_trans_info *oti)
 {
@@ -104,6 +110,27 @@ static int ost_statfs(struct ptlrpc_request *req)
         RETURN(0);
 }
 
+/* Handle a syncfs request: pack the reply message (lustre_pack_msg() is
+ * called with a buffer count of 0), then call obd_syncfs() on the handle
+ * found at the start of the request message.  Returns 0 on success.  A
+ * packing failure is returned as-is; an obd_syncfs() failure is logged,
+ * stored in req->rq_status and returned. */
+static int ost_syncfs(struct ptlrpc_request *req)
+{
+        struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
+        struct obd_statfs *osfs;        /* only used for sizeof below */
+        int size = sizeof(*osfs);
+        int rc;
+        ENTRY;
+
+        rc = lustre_pack_msg(0, &size, NULL, &req->rq_replen, &req->rq_repmsg);
+        if (rc != 0)
+                RETURN(rc);
+
+        rc = obd_syncfs(conn);
+        if (rc == 0)
+                RETURN(0);
+
+        CERROR("ost: syncfs failed: rc %d\n", rc);
+        req->rq_status = rc;
+        RETURN(rc);
+}
+
 static int ost_open(struct ptlrpc_request *req, struct obd_trans_info *oti)
 {
         struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
@@ -232,6 +259,9 @@ static int ost_brw_read(struct ptlrpc_request *req)
         void *desc_priv = NULL;
         int cmd, i, j, objcount, niocount, size = sizeof(*body);
         int rc = 0;
+#if CHECKSUM_BULK
+        __u64 cksum = 0;
+#endif
         ENTRY;
 
         body = lustre_msg_buf(req->rq_reqmsg, 0);
@@ -245,6 +275,13 @@ static int ost_brw_read(struct ptlrpc_request *req)
         if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_READ_BULK))
                 GOTO(out, req->rq_status = -EIO);
 
+        /* Hmm, we don't return anything in this reply buffer?
+         * We should be returning per-page status codes and also
+         * per-object size, blocks count, mtime, ctime.  (bug 593) */
+        rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg);
+        if (rc)
+                GOTO(out, req->rq_status = rc);
+
         for (i = 0; i < objcount; i++) {
                 ost_unpack_ioo(&tmp1, &ioo);
                 if (tmp2 + ioo->ioo_bufcnt > end2) {
@@ -284,6 +321,8 @@ static int ost_brw_read(struct ptlrpc_request *req)
                 bulk->bp_xid = remote_nb[i].xid;
                 bulk->bp_buf = local_nb[i].addr;
                 bulk->bp_buflen = remote_nb[i].len;
+                if (body->oa.o_valid & NTOH__u32(OBD_MD_FLCKSUM))
+                        ost_checksum(&cksum, bulk->bp_buf, bulk->bp_buflen);
         }
 
         rc = ptlrpc_bulk_put(desc);
@@ -306,16 +345,17 @@ out_bulk:
 out_local:
         OBD_FREE(local_nb, sizeof(*local_nb) * niocount);
 out:
-        if (!rc)
-                /* Hmm, we don't return anything in this reply buffer?
-                 * We should be returning per-page status codes and also
-                 * per-object size, blocks count, mtime, ctime.  (bug 593) */
-                rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen,
-                                     &req->rq_repmsg);
         if (rc)
                 ptlrpc_error(req->rq_svc, req);
-        else
+        else {
+#if CHECKSUM_BULK
+                body = lustre_msg_buf(req->rq_repmsg, 0);
+                body->oa.o_rdev = HTON__u64(cksum);
+                body->oa.o_valid |= HTON__u32(OBD_MD_FLCKSUM);
+#endif
                 ptlrpc_reply(req->rq_svc, req);
+        }
+
         RETURN(rc);
 }
 
@@ -358,7 +398,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                 }
         }
 
-        OBD_ALLOC(local_nb, sizeof(*local_nb)* niocount);
+        OBD_ALLOC(local_nb, sizeof(*local_nb) * niocount);
         if (local_nb == NULL)
                 GOTO(out, rc = -ENOMEM);
 
@@ -369,7 +409,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                                     remote_nb, local_nb, &desc_priv, oti);
 
         if (req->rq_status)
-                GOTO(out, rc = 0);
+                GOTO(out_local, rc = 0);
 
         desc = ptlrpc_prep_bulk(req->rq_connection);
         if (desc == NULL)
@@ -403,6 +443,38 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                 GOTO(out_bulk, rc);
         }
 
+#if CHECKSUM_BULK
+        if ((body->oa.o_valid & NTOH__u32(OBD_MD_FLCKSUM))) {
+                static int cksum_counter;
+                __u64 client_cksum = NTOH__u64(body->oa.o_rdev);
+                __u64 cksum = 0;
+
+                for (i = 0; i < niocount; i++) {
+                        char *ptr = kmap(local_nb[i].page);
+                        int   off = local_nb[i].offset & (PAGE_SIZE - 1);
+                        int   len = local_nb[i].len;
+
+                        LASSERT(off + len <= PAGE_SIZE);
+                        ost_checksum(&cksum, ptr + off, len);
+                        kunmap(local_nb[i].page);
+                }
+
+                if (client_cksum != cksum) {
+                        CERROR("Bad checksum: client "LPX64", server "LPX64
+                               ", client NID "LPX64"\n", client_cksum, cksum,
+                               req->rq_connection->c_peer.peer_nid);
+                        cksum_counter = 1;
+                } else {
+                        cksum_counter++;
+                        if ((cksum_counter & (-cksum_counter)) == cksum_counter)
+                                CERROR("Checksum %d from "LPX64": "LPX64" OK\n",
+                                        cksum_counter,
+                                        req->rq_connection->c_peer.peer_nid,
+                                        cksum);
+                }
+        }
+#endif
+
         req->rq_status = obd_commitrw(cmd, conn, objcount, ioo, niocount,
                                       local_nb, desc_priv, oti);
 
@@ -419,22 +491,117 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                                      &req->rq_repmsg);
         if (rc)
                 ptlrpc_error(req->rq_svc, req);
-        else
+        else {
+                oti_to_request(oti, req);
                 rc = ptlrpc_reply(req->rq_svc, req);
+        }
         RETURN(rc);
 }
 
-inline void oti_to_request(struct obd_trans_info *oti, struct ptlrpc_request *req)
+static int ost_san_brw(struct ptlrpc_request *req, int alloc)
 {
-        if (oti && req->rq_repmsg)
-                req->rq_repmsg->transno = HTON__u64(oti->oti_transno);
-        EXIT;
+        struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
+        struct niobuf_remote *remote_nb, *res_nb;
+        struct obd_ioobj *ioo;
+        struct ost_body *body;
+        int cmd, rc, i, j, objcount, niocount, size[2] = {sizeof(*body)};
+        void *tmp1, *tmp2, *end2;
+        ENTRY;
+
+        body = lustre_msg_buf(req->rq_reqmsg, 0);
+        tmp1 = lustre_msg_buf(req->rq_reqmsg, 1);
+        tmp2 = lustre_msg_buf(req->rq_reqmsg, 2);
+        end2 = (char *)tmp2 + req->rq_reqmsg->buflens[2];
+        objcount = req->rq_reqmsg->buflens[1] / sizeof(*ioo);
+        niocount = req->rq_reqmsg->buflens[2] / sizeof(*remote_nb);
+        
+        cmd = alloc ? OBD_BRW_WRITE : OBD_BRW_READ;
+
+        for (i = 0; i < objcount; i++) {
+                ost_unpack_ioo((void *)&tmp1, &ioo);
+                if (tmp2 + ioo->ioo_bufcnt > end2) {
+                        rc = -EFAULT;
+                        break;
+                }
+                for (j = 0; j < ioo->ioo_bufcnt; j++)
+                        ost_unpack_niobuf((void *)&tmp2, &remote_nb);
+        }
+
+        size[1] = niocount * sizeof(*remote_nb);
+        rc = lustre_pack_msg(2, size, NULL, &req->rq_replen, &req->rq_repmsg);
+        if (rc)
+                GOTO(out, rc);
+
+        /* The unpackers move tmp1 and tmp2, so reset them before using */
+        tmp1 = lustre_msg_buf(req->rq_reqmsg, 1);
+        tmp2 = lustre_msg_buf(req->rq_reqmsg, 2);
+
+        req->rq_status = obd_san_preprw(cmd, conn, objcount, tmp1,
+                                        niocount, tmp2);
+
+        if (req->rq_status) {
+                rc = 0;
+                goto out;
+        }
+
+        remote_nb = lustre_msg_buf(req->rq_repmsg, 1);
+        res_nb = lustre_msg_buf(req->rq_reqmsg, 2);
+        for (i = 0; i < niocount; i++) {
+                /* this advances remote_nb */
+                ost_pack_niobuf((void **)&remote_nb,
+                                res_nb[i].offset,
+                                res_nb[i].len, /* 0 */
+                                res_nb[i].flags, /* 0 */
+                                res_nb[i].xid
+                                );
+        }
+
+        rc = 0;
+
+out:
+        if (rc) {
+                OBD_FREE(req->rq_repmsg, req->rq_replen);
+                req->rq_repmsg = NULL;
+                ptlrpc_error(req->rq_svc, req);
+        } else
+                ptlrpc_reply(req->rq_svc, req);
+
+        return rc;
+}
+
+static int filter_recovery_request(struct ptlrpc_request *req,
+                                   struct obd_device *obd, int *process)
+{
+        switch (req->rq_reqmsg->opc) {
+        case OST_CONNECT: /* This will never get here, but for completeness. */
+        case OST_DISCONNECT:
+               *process = 1;
+               RETURN(0);
+
+        case OST_CLOSE:
+        case OST_CREATE:
+        case OST_DESTROY:
+        case OST_OPEN:
+        case OST_PUNCH:
+        case OST_SETATTR: 
+        case OST_SYNCFS:
+        case OST_WRITE:
+        case LDLM_ENQUEUE:
+                *process = target_queue_recovery_request(req, obd);
+                RETURN(0);
+
+        default:
+                DEBUG_REQ(D_ERROR, req, "not permitted during recovery");
+                *process = 0;
+                /* XXX what should we set rq_status to here? */
+                RETURN(ptlrpc_error(req->rq_svc, req));
+        }
 }
 
 static int ost_handle(struct ptlrpc_request *req)
 {
         struct obd_trans_info trans_info = { 0, }, *oti = &trans_info;
-        int rc;
+        int should_process, rc;
         ENTRY;
 
         rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen);
@@ -443,12 +610,44 @@ static int ost_handle(struct ptlrpc_request *req)
                 GOTO(out, rc);
         }
 
-        if (req->rq_reqmsg->opc != OST_CONNECT && req->rq_export == NULL) {
-                CERROR("lustre_ost: operation %d on unconnected OST\n",
-                       req->rq_reqmsg->opc);
-                req->rq_status = -ENOTCONN;
-                GOTO(out, rc = -ENOTCONN);
-        }
+        if (req->rq_reqmsg->opc != OST_CONNECT) {
+                struct obd_device *obd;
+
+                if (req->rq_export == NULL) {
+                        CERROR("lustre_ost: operation %d on unconnected OST\n",
+                               req->rq_reqmsg->opc);
+                        req->rq_status = -ENOTCONN;
+                        GOTO(out, rc = -ENOTCONN);
+                }
+
+                obd = req->rq_export->exp_obd;
+
+                spin_lock_bh(&obd->obd_processing_task_lock);
+                if (obd->obd_flags & OBD_ABORT_RECOVERY)
+                        target_abort_recovery(obd);
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+
+                if (obd->obd_flags & OBD_RECOVERING) {
+                        rc = filter_recovery_request(req, obd, &should_process);
+                        if (rc || !should_process)
+                                RETURN(rc);
+                } else if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) {
+#if 0
+/* need to store this reply somewhere... */
+                        if (req->rq_xid == med->med_last_xid) {
+                                DEBUG_REQ(D_HA, req, "resending reply");
+                                OBD_ALLOC(req->rq_repmsg, med->med_last_replen);
+                                req->rq_replen = med->med_last_replen;
+                                memcpy(req->rq_repmsg, med->med_last_reply,
+                                       req->rq_replen);
+                                ptlrpc_reply(req->rq_svc, req);
+                                return 0;
+                        }
+                        DEBUG_REQ(D_HA, req, "no reply for resend, continuing");
+#endif
+                }
+
+        } 
 
         if (strcmp(req->rq_obd->obd_type->typ_name, "ost") != 0)
                 GOTO(out, rc = -EINVAL);
@@ -457,7 +656,7 @@ static int ost_handle(struct ptlrpc_request *req)
         case OST_CONNECT:
                 CDEBUG(D_INODE, "connect\n");
                 OBD_FAIL_RETURN(OBD_FAIL_OST_CONNECT_NET, 0);
-                rc = target_handle_connect(req);
+                rc = target_handle_connect(req, ost_handle);
                 break;
         case OST_DISCONNECT:
                 CDEBUG(D_INODE, "disconnect\n");
@@ -506,6 +705,18 @@ static int ost_handle(struct ptlrpc_request *req)
                 rc = ost_brw_read(req);
                 /* ost_brw sends its own replies */
                 RETURN(rc);
+        case OST_SAN_READ:
+                CDEBUG(D_INODE, "san read\n");
+                OBD_FAIL_RETURN(OBD_FAIL_OST_BRW_NET, 0);
+                rc = ost_san_brw(req, 0);
+                /* ost_san_brw sends its own replies */
+                RETURN(rc);
+        case OST_SAN_WRITE:
+                CDEBUG(D_INODE, "san write\n");
+                OBD_FAIL_RETURN(OBD_FAIL_OST_BRW_NET, 0);
+                rc = ost_san_brw(req, 1);
+                /* ost_san_brw sends its own replies */
+                RETURN(rc);
         case OST_PUNCH:
                 CDEBUG(D_INODE, "punch\n");
                 OBD_FAIL_RETURN(OBD_FAIL_OST_PUNCH_NET, 0);
@@ -516,6 +727,11 @@ static int ost_handle(struct ptlrpc_request *req)
                 OBD_FAIL_RETURN(OBD_FAIL_OST_STATFS_NET, 0);
                 rc = ost_statfs(req);
                 break;
+        case OST_SYNCFS:
+                CDEBUG(D_INODE, "sync\n");
+                OBD_FAIL_RETURN(OBD_FAIL_OST_SYNCFS_NET, 0);
+                rc = ost_syncfs(req);
+                break;
         case LDLM_ENQUEUE:
                 CDEBUG(D_INODE, "enqueue\n");
                 OBD_FAIL_RETURN(OBD_FAIL_LDLM_ENQUEUE, 0);
@@ -561,7 +777,17 @@ static int ost_handle(struct ptlrpc_request *req)
         }
 
 out:
-        //req->rq_status = rc;
+        if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LAST_REPLAY) {
+                struct obd_device *obd = req->rq_export->exp_obd;
+
+                if (obd && (obd->obd_flags & OBD_RECOVERING)) {
+                        DEBUG_REQ(D_HA, req, "LAST_REPLAY, queuing reply");
+                        return target_queue_final_reply(req, rc);
+                }
+                /* Lost a race with recovery; let the error path DTRT. */
+                rc = req->rq_status = -ENOTCONN;
+        }
+
         if (rc) {
                 CERROR("ost: processing error (opcode=%d): %d\n",
                        req->rq_reqmsg->opc, rc);
@@ -583,7 +809,6 @@ out:
 static int ost_setup(struct obd_device *obddev, obd_count len, void *buf)
 {
         struct ost_obd *ost = &obddev->u.ost;
-        struct obd_uuid self = { "self" };
         int err;
         int i;
         ENTRY;
@@ -591,7 +816,7 @@ static int ost_setup(struct obd_device *obddev, obd_count len, void *buf)
         ost->ost_service = ptlrpc_init_svc(OST_NEVENTS, OST_NBUFS,
                                            OST_BUFSIZE, OST_MAXREQSIZE,
                                            OST_REQUEST_PORTAL, OSC_REPLY_PORTAL,
-                                           &self, ost_handle, "ost");
+                                           ost_handle, "ost");
         if (!ost->ost_service) {
                 CERROR("failed to start service\n");
                 GOTO(error_disc, err = -ENOMEM);
@@ -639,9 +864,8 @@ int ost_detach(struct obd_device *dev)
         return lprocfs_obd_detach(dev);
 }
 
-/* This is so similar to mds_connect that it makes my heart weep: we should
- * shuffle the UUID into obd_export proper and make this all happen in
- * target_handle_connect.
+/* I don't think this function is ever used, since nothing 
+ * connects directly to this module.
  */
 static int ost_connect(struct lustre_handle *conn,
                        struct obd_device *obd, struct obd_uuid *cluuid,
@@ -649,41 +873,18 @@ static int ost_connect(struct lustre_handle *conn,
                        ptlrpc_recovery_cb_t recover)
 {
         struct obd_export *exp;
-        struct ost_export_data *oed;
-        struct list_head *p;
         int rc;
         ENTRY;
 
         if (!conn || !obd || !cluuid)
                 RETURN(-EINVAL);
 
-        /* lctl gets a backstage, all-access pass. */
-        if (!strcmp(cluuid->uuid, "OBD_CLASS_UUID"))
-                goto dont_check_exports;
-
-        spin_lock(&obd->obd_dev_lock);
-        list_for_each(p, &obd->obd_exports) {
-                exp = list_entry(p, struct obd_export, exp_obd_chain);
-                oed = &exp->exp_ost_data;
-                if (!memcmp(cluuid->uuid, oed->oed_uuid.uuid, 
-                            sizeof(oed->oed_uuid.uuid))) {
-                        spin_unlock(&obd->obd_dev_lock);
-                        LASSERT(exp->exp_obd == obd);
-
-                        RETURN(target_handle_reconnect(conn, exp, cluuid));
-                }
-        }
-
- dont_check_exports:
         rc = class_connect(conn, obd, cluuid);
         if (rc)
                 RETURN(rc);
         exp = class_conn2export(conn);
         LASSERT(exp);
 
-        oed = &exp->exp_ost_data;
-        memcpy(oed->oed_uuid.uuid, cluuid->uuid, sizeof(oed->oed_uuid.uuid));
-
         RETURN(0);
 }
 
index a6580e0..67d0b85 100644 (file)
@@ -67,7 +67,6 @@ static int ptlbd_cl_setup(struct obd_device *obddev, obd_count len, void *buf)
          * from client_obd_connect.. *shrug*
          */
         INIT_LIST_HEAD(&imp->imp_chain);
-        imp->imp_last_xid = 0;
         imp->imp_max_transno = 0;
         imp->imp_peer_committed_transno = 0;
         imp->imp_level = LUSTRE_CONN_FULL;
@@ -107,9 +106,7 @@ static int ptlbd_cl_connect(struct lustre_handle *conn, struct obd_device *obd,
                 RETURN(rc);
 
         INIT_LIST_HEAD(&imp->imp_chain);
-        imp->imp_last_xid = 0;
         imp->imp_max_transno = 0;
-        imp->imp_peer_last_xid = 0;
         imp->imp_peer_committed_transno = 0;
         imp->imp_level = LUSTRE_CONN_FULL;
 
index 62c0236..4daee83 100644 (file)
@@ -42,7 +42,6 @@ int ptlbd_send_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd,
         struct ptlrpc_request *req;
         struct ptlrpc_bulk_desc *desc;
         struct buffer_head *bh;
-        unsigned long flags;
         unsigned int page_count;
         int rc, rep_size, size[2];
         __u32 xid;
@@ -76,9 +75,7 @@ int ptlbd_send_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd,
         desc->bd_portal = PTLBD_BULK_PORTAL;
         desc->bd_ptl_ev_hdlr = NULL;
 
-        spin_lock_irqsave(&imp->imp_lock, flags);
-        xid = ++imp->imp_last_xid;
-        spin_unlock_irqrestore(&imp->imp_lock, flags);
+        xid = ptlrpc_next_xid();
 
         for ( niob = niobs, bh = first_bh ; bh ; bh = bh->b_next, niob++ ) {
                 struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc);
index 78d01a6..793354d 100644 (file)
@@ -36,7 +36,6 @@ static int ptlbd_sv_already_setup = 1;
 
 static int ptlbd_sv_setup(struct obd_device *obddev, obd_count len, void *buf)
 {
-        struct obd_uuid self_uuid = { "self" };
         struct ptlbd_obd *ptlbd = &obddev->u.ptlbd;
         int rc;
         ENTRY;
@@ -49,7 +48,7 @@ static int ptlbd_sv_setup(struct obd_device *obddev, obd_count len, void *buf)
         ptlbd->ptlbd_service =
                 ptlrpc_init_svc(PTLBD_NEVENTS, PTLBD_NBUFS, PTLBD_BUFSIZE,
                                 PTLBD_MAXREQSIZE, PTLBD_REQUEST_PORTAL,
-                                PTLBD_REPLY_PORTAL, &self_uuid,
+                                PTLBD_REPLY_PORTAL,
                                 ptlbd_parse_req, "ptlbd_sv");
 
         if (ptlbd->ptlbd_service == NULL) 
index dd3f9d8..446f110 100644 (file)
@@ -5,10 +5,15 @@
 
 DEFS=
 
+if LIBLUSTRE
+lib_LIBRARIES = libptlrpc.a
+libptlrpc_a_SOURCES = client.c niobuf.c pack_generic.c recovd.c recover.c connection.c rpc.c events.c  # lproc_ptlrpc.c service.c
+else
 MODULE = ptlrpc
 modulefs_DATA = ptlrpc.o
 EXTRA_PROGRAMS = ptlrpc
 
 ptlrpc_SOURCES = recovd.c recover.c connection.c rpc.c events.c service.c client.c niobuf.c pack_generic.c lproc_ptlrpc.c
+endif
 
 include $(top_srcdir)/Rules
index 48e11b5..7d80d5f 100644 (file)
  */
 
 #define DEBUG_SUBSYSTEM S_RPC
+#ifndef __KERNEL__
+#include <errno.h>
+#include <signal.h>
+#include <liblustre.h>
+#endif
 
 #include <linux/obd_support.h>
 #include <linux/obd_class.h>
@@ -44,10 +49,10 @@ struct obd_uuid *ptlrpc_req_to_uuid(struct ptlrpc_request *req)
 struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid)
 {
         struct ptlrpc_connection *c;
-        struct lustre_peer peer;
+        struct ptlrpc_peer peer;
         int err;
 
-        err = kportal_uuid_to_peer(uuid->uuid, &peer);
+        err = ptlrpc_uuid_to_peer(uuid, &peer);
         if (err != 0) {
                 CERROR("cannot find peer %s!\n", uuid->uuid);
                 return NULL;
@@ -67,16 +72,16 @@ struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid)
 
 void ptlrpc_readdress_connection(struct ptlrpc_connection *conn,struct obd_uuid *uuid)
 {
-        struct lustre_peer peer;
+        struct ptlrpc_peer peer;
         int err;
 
-        err = kportal_uuid_to_peer(uuid->uuid, &peer);
+        err = ptlrpc_uuid_to_peer (uuid, &peer);
         if (err != 0) {
                 CERROR("cannot find peer %s!\n", uuid->uuid);
                 return;
         }
 
-        memcpy(&conn->c_peer, &peer, sizeof(peer));
+        memcpy (&conn->c_peer, &peer, sizeof (peer));
         return;
 }
 
@@ -183,11 +188,19 @@ static int ll_sync_brw_timeout(void *data)
 
                 LASSERT(desc->bd_connection);
 
-                /* If PtlMDUnlink succeeds, then it hasn't completed yet.  If it
-                 * fails, the bulk finished _just_ in time (after the timeout
-                 * fired but before we got this far) and we'll let it live.
+                /* If PtlMDUnlink succeeds, then bulk I/O on the MD hasn't
+                 * even started yet.  XXX where do we kunmap the thing?
+                 *
+                 * If it fails with PTL_MD_BUSY, then the network is still
+                 * reading/writing the buffers and we must wait for it to
+                 * complete (which it will within finite time, most
+                 * probably with failure; we really need portals error
+                 * events to detect that).
+                 *
+                 * Otherwise (PTL_INV_MD) it completed after the bd_flags
+                 * test above!
                  */
-                if (PtlMDUnlink(desc->bd_md_h) != 0) {
+                if (PtlMDUnlink(desc->bd_md_h) != PTL_OK) {
                         CERROR("Near-miss on OST %s -- need to adjust "
                                "obd_timeout?\n",
                                desc->bd_connection->c_remote_uuid.uuid);
@@ -311,13 +324,25 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
                 return;
         }
 
+        /* We must take it off the imp_replay_list first.  Otherwise, we'll set
+         * request->rq_reqmsg to NULL while osc_close is dereferencing it. */
+        if (request->rq_import) {
+                unsigned long flags = 0;
+                if (!locked)
+                        spin_lock_irqsave(&request->rq_import->imp_lock, flags);
+                list_del_init(&request->rq_list);
+                if (!locked)
+                        spin_unlock_irqrestore(&request->rq_import->imp_lock,
+                                               flags);
+        }
+
         if (atomic_read(&request->rq_refcount) != 0) {
                 CERROR("freeing request %p (%d->%s:%d) with refcount %d\n",
                        request, request->rq_reqmsg->opc,
                        request->rq_connection->c_remote_uuid.uuid,
                        request->rq_import->imp_client->cli_request_portal,
                        atomic_read (&request->rq_refcount));
-                /* LBUG(); */
+                LBUG();
         }
 
         if (request->rq_repmsg != NULL) {
@@ -330,16 +355,6 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
                 request->rq_reqmsg = NULL;
         }
 
-        if (request->rq_import) {
-                unsigned long flags = 0;
-                if (!locked)
-                        spin_lock_irqsave(&request->rq_import->imp_lock, flags);
-                list_del_init(&request->rq_list);
-                if (!locked)
-                        spin_unlock_irqrestore(&request->rq_import->imp_lock,
-                                               flags);
-        }
-
         ptlrpc_put_connection(request->rq_connection);
         OBD_FREE(request, sizeof(*request));
         EXIT;
@@ -392,7 +407,6 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req)
         }
 
         if (req->rq_flags & PTL_RPC_FL_RESEND) {
-                ENTRY;
                 DEBUG_REQ(D_ERROR, req, "RESEND:");
                 GOTO(out, rc = 1);
         }
@@ -442,7 +456,7 @@ static void ptlrpc_cleanup_request_buf(struct ptlrpc_request *request)
 }
 
 /* Abort this request and cleanup any resources associated with it. */
-static int ptlrpc_abort(struct ptlrpc_request *request)
+int ptlrpc_abort(struct ptlrpc_request *request)
 {
         /* First remove the ME for the reply; in theory, this means
          * that we can tear down the buffer safely. */
@@ -469,8 +483,8 @@ void ptlrpc_free_committed(struct obd_import *imp)
         LASSERT(spin_is_locked(&imp->imp_lock));
 #endif
 
-        CDEBUG(D_HA, "committing for last_committed "LPU64"\n",
-               imp->imp_peer_committed_transno);
+        CDEBUG(D_HA, "%s: committing for last_committed "LPU64"\n",
+               imp->imp_obd->obd_name, imp->imp_peer_committed_transno);
 
         list_for_each_safe(tmp, saved, &imp->imp_replay_list) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
@@ -524,17 +538,14 @@ void ptlrpc_cleanup_client(struct obd_import *imp)
 
 void ptlrpc_continue_req(struct ptlrpc_request *req)
 {
-        ENTRY;
         DEBUG_REQ(D_HA, req, "continuing delayed request");
         req->rq_reqmsg->addr = req->rq_import->imp_handle.addr;
         req->rq_reqmsg->cookie = req->rq_import->imp_handle.cookie;
         wake_up(&req->rq_wait_for_rep);
-        EXIT;
 }
 
 void ptlrpc_resend_req(struct ptlrpc_request *req)
 {
-        ENTRY;
         DEBUG_REQ(D_HA, req, "resending");
         req->rq_reqmsg->addr = req->rq_import->imp_handle.addr;
         req->rq_reqmsg->cookie = req->rq_import->imp_handle.cookie;
@@ -543,18 +554,15 @@ void ptlrpc_resend_req(struct ptlrpc_request *req)
         req->rq_flags |= PTL_RPC_FL_RESEND;
         req->rq_flags &= ~PTL_RPC_FL_TIMEOUT;
         wake_up(&req->rq_wait_for_rep);
-        EXIT;
 }
 
 void ptlrpc_restart_req(struct ptlrpc_request *req)
 {
-        ENTRY;
         DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request");
         req->rq_status = -ERESTARTSYS;
         req->rq_flags |= PTL_RPC_FL_RESTART;
         req->rq_flags &= ~PTL_RPC_FL_TIMEOUT;
         wake_up(&req->rq_wait_for_rep);
-        EXIT;
 }
 
 static int expired_request(void *data)
@@ -659,15 +667,14 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
 
         init_waitqueue_head(&req->rq_wait_for_rep);
 
-        spin_lock_irqsave(&imp->imp_lock, flags);
-        req->rq_xid = HTON__u32(++imp->imp_last_xid);
-        spin_unlock_irqrestore(&imp->imp_lock, flags);
+        req->rq_xid = HTON__u32(ptlrpc_next_xid());
 
         /* for distributed debugging */
         req->rq_reqmsg->status = HTON__u32(current->pid);
-        CDEBUG(D_RPCTRACE, "Sending RPC pid:xid:nid:opc %d:"LPU64":%x:%d\n",
-               NTOH__u32(req->rq_reqmsg->status), req->rq_xid,
-               conn->c_peer.peer_nid, NTOH__u32(req->rq_reqmsg->opc));
+        CDEBUG(D_RPCTRACE, "Sending RPC pid:xid:nid:opc %d:"LPU64":%s:"LPX64
+               ":%d\n", NTOH__u32(req->rq_reqmsg->status), req->rq_xid,
+               conn->c_peer.peer_ni->pni_name, conn->c_peer.peer_nid,
+               NTOH__u32(req->rq_reqmsg->opc));
 
         spin_lock_irqsave(&imp->imp_lock, flags);
 
@@ -726,7 +733,17 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
                 lwi = LWI_TIMEOUT_INTR(obd_timeout * HZ, expired_request,
                                        interrupted_request, req);
         }
+#ifdef __KERNEL__
         l_wait_event(req->rq_wait_for_rep, ptlrpc_check_reply(req), &lwi);
+#else 
+        { 
+                extern int reply_in_callback(ptl_event_t *ev);
+                ptl_event_t reply_ev;
+                PtlEQWait(req->rq_connection->c_peer.peer_ni->pni_reply_in_eq_h, &reply_ev);
+                reply_in_callback(&reply_ev); 
+        }
+#endif 
+
         DEBUG_REQ(D_NET, req, "-- done sleeping");
 
         spin_lock_irqsave(&imp->imp_lock, flags);
@@ -741,6 +758,11 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
         /* Don't resend if we were interrupted. */
         if ((req->rq_flags & (PTL_RPC_FL_RESEND | PTL_RPC_FL_INTR)) ==
             PTL_RPC_FL_RESEND) {
+                if (req->rq_flags & PTL_RPC_FL_NO_RESEND) {
+                        ptlrpc_abort(req); /* clean up reply buffers */
+                        req->rq_flags &= ~PTL_RPC_FL_NO_RESEND;
+                        GOTO(out, rc = -ETIMEDOUT);
+                }
                 req->rq_flags &= ~PTL_RPC_FL_RESEND;
                 lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
                 DEBUG_REQ(D_HA, req, "resending: ");
@@ -900,9 +922,11 @@ void ptlrpc_abort_inflight(struct obd_import *imp, int dying_import)
          * ptlrpc_queue_wait must (and does) hold imp_lock while testing this
          * flag and then putting requests on sending_list or delayed_list.
          */
-        spin_lock_irqsave(&imp->imp_lock, flags);
-        imp->imp_flags |= IMP_INVALID;
-        spin_unlock_irqrestore(&imp->imp_lock, flags);
+        if ((imp->imp_flags & IMP_REPLAYABLE) == 0) {
+                spin_lock_irqsave(&imp->imp_lock, flags);
+                imp->imp_flags |= IMP_INVALID;
+                spin_unlock_irqrestore(&imp->imp_lock, flags);
+        }
 
         list_for_each_safe(tmp, n, &imp->imp_sending_list) {
                 struct ptlrpc_request *req =
index b2d204d..8f2cc2d 100644 (file)
  */
 
 #define DEBUG_SUBSYSTEM S_RPC
-
+#ifdef __KERNEL__
 #include <linux/obd_support.h>
 #include <linux/obd_class.h>
 #include <linux/lustre_net.h>
+#else
+#include <liblustre.h>
+#endif
 
 static spinlock_t conn_lock;
 static struct list_head conn_list;
@@ -32,32 +35,34 @@ static struct list_head conn_unused_list;
 
 /* If UUID is NULL, c->c_remote_uuid must be all zeroes
  * If UUID is non-NULL, c->c_remote_uuid must match. */
-static int match_connection_uuid(struct ptlrpc_connection *c, struct obd_uuid *uuid)
+static int match_connection_uuid(struct ptlrpc_connection *c,
+                                 struct obd_uuid *uuid)
 {
         struct obd_uuid zero_uuid;
         memset(&zero_uuid, 0, sizeof(zero_uuid));
 
         if (uuid)
-                return memcmp(c->c_remote_uuid.uuid, uuid->uuid, 
+                return memcmp(c->c_remote_uuid.uuid, uuid->uuid,
                               sizeof(uuid->uuid));
 
         return memcmp(c->c_remote_uuid.uuid, &zero_uuid, sizeof(zero_uuid));
 }
 
-struct ptlrpc_connection *ptlrpc_get_connection(struct lustre_peer *peer,
+struct ptlrpc_connection *ptlrpc_get_connection(struct ptlrpc_peer *peer,
                                                 struct obd_uuid *uuid)
 {
         struct list_head *tmp, *pos;
         struct ptlrpc_connection *c;
         ENTRY;
 
-        CDEBUG(D_INFO, "peer is %08x %08lx %08lx\n",
-               peer->peer_nid, peer->peer_ni.nal_idx, peer->peer_ni.handle_idx);
+        CDEBUG(D_INFO, "peer is "LPX64" on %s\n",
+               peer->peer_nid, peer->peer_ni->pni_name);
 
         spin_lock(&conn_lock);
         list_for_each(tmp, &conn_list) {
                 c = list_entry(tmp, struct ptlrpc_connection, c_link);
-                if (memcmp(peer, &c->c_peer, sizeof(*peer)) == 0 &&
+                if (peer->peer_nid == c->c_peer.peer_nid &&
+                    peer->peer_ni == c->c_peer.peer_ni &&
                     !match_connection_uuid(c, uuid)) {
                         ptlrpc_connection_addref(c);
                         GOTO(out, c);
@@ -66,7 +71,8 @@ struct ptlrpc_connection *ptlrpc_get_connection(struct lustre_peer *peer,
 
         list_for_each_safe(tmp, pos, &conn_unused_list) {
                 c = list_entry(tmp, struct ptlrpc_connection, c_link);
-                if (memcmp(peer, &c->c_peer, sizeof(*peer)) == 0 &&
+                if (peer->peer_nid == c->c_peer.peer_nid &&
+                    peer->peer_ni == c->c_peer.peer_ni &&
                     !match_connection_uuid(c, uuid)) {
                         ptlrpc_connection_addref(c);
                         list_del(&c->c_link);
@@ -93,10 +99,11 @@ struct ptlrpc_connection *ptlrpc_get_connection(struct lustre_peer *peer,
         INIT_LIST_HEAD(&c->c_recovd_data.rd_managed_chain);
         INIT_LIST_HEAD(&c->c_delayed_head);
         atomic_set(&c->c_refcount, 0);
-        ptlrpc_connection_addref(c);
+        memcpy(&c->c_peer, peer, sizeof(c->c_peer));
         spin_lock_init(&c->c_lock);
 
-        memcpy(&c->c_peer, peer, sizeof(c->c_peer));
+        ptlrpc_connection_addref(c);
+
         list_add(&c->c_link, &conn_list);
 
         EXIT;
@@ -115,8 +122,10 @@ int ptlrpc_put_connection(struct ptlrpc_connection *c)
                 RETURN(0);
         }
 
-        CDEBUG(D_INFO, "connection=%p refcount %d\n",
-               c, atomic_read(&c->c_refcount) - 1);
+        CDEBUG (D_INFO, "connection=%p refcount %d to "LPX64" on %s\n",
+                c, atomic_read(&c->c_refcount), c->c_peer.peer_nid,
+                c->c_peer.peer_ni->pni_name);
+
         if (atomic_dec_and_test(&c->c_refcount)) {
                 recovd_conn_unmanage(c);
                 spin_lock(&conn_lock);
@@ -135,9 +144,10 @@ int ptlrpc_put_connection(struct ptlrpc_connection *c)
 struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *c)
 {
         ENTRY;
-        CDEBUG(D_INFO, "connection=%p refcount %d\n",
-               c, atomic_read(&c->c_refcount) + 1);
         atomic_inc(&c->c_refcount);
+        CDEBUG (D_INFO, "connection=%p refcount %d to "LPX64" on %s\n",
+                c, atomic_read(&c->c_refcount), c->c_peer.peer_nid,
+                c->c_peer.peer_ni->pni_name);
         RETURN(c);
 }
 
@@ -161,9 +171,9 @@ void ptlrpc_cleanup_connection(void)
         }
         list_for_each_safe(tmp, pos, &conn_list) {
                 c = list_entry(tmp, struct ptlrpc_connection, c_link);
-                CERROR("Connection %p/%s has refcount %d (nid=%lu)\n",
+                CERROR("Connection %p/%s has refcount %d (nid="LPX64" on %s)\n",
                        c, c->c_remote_uuid.uuid, atomic_read(&c->c_refcount),
-                       (unsigned long)c->c_peer.peer_nid);
+                       c->c_peer.peer_nid, c->c_peer.peer_ni->pni_name);
                 list_del(&c->c_link);
                 OBD_FREE(c, sizeof(*c));
         }
index e7a1e08..4a6eb67 100644 (file)
 
 #define DEBUG_SUBSYSTEM S_RPC
 
+#ifdef __KERNEL__
 #include <linux/module.h>
-#include <linux/obd_support.h>
+#else
+#include <liblustre.h>
+#endif
+#include <linux/obd_class.h>
 #include <linux/lustre_net.h>
 
-ptl_handle_eq_t request_out_eq, reply_in_eq, reply_out_eq, 
-        bulk_put_source_eq, bulk_put_sink_eq, 
-        bulk_get_source_eq, bulk_get_sink_eq;
-static const ptl_handle_ni_t *socknal_nip = NULL, *toenal_nip = NULL, 
-        *qswnal_nip = NULL, *gmnal_nip = NULL;
+struct ptlrpc_ni  ptlrpc_interfaces[NAL_MAX_NR];
+int               ptlrpc_ninterfaces;
 
 /*
  *  Free the packet when it has gone out
@@ -67,8 +68,17 @@ static int reply_out_callback(ptl_event_t *ev)
 
         if (ev->type == PTL_EVENT_SENT) {
                 OBD_FREE(ev->mem_desc.start, ev->mem_desc.length);
+        } else if (ev->type == PTL_EVENT_ACK) {
+                struct ptlrpc_request *req = ev->mem_desc.user_ptr;
+                if (req->rq_flags & PTL_RPC_FL_WANT_ACK) {
+                        req->rq_flags &= ~PTL_RPC_FL_WANT_ACK;
+                        wake_up(&req->rq_wait_for_rep);
+                } else {
+                        DEBUG_REQ(D_ERROR, req,
+                                  "ack received for reply, not wanted");
+                }
         } else {
-                // XXX make sure we understand all events, including ACK's
+                // XXX make sure we understand all events
                 CERROR("Unknown event %d\n", ev->type);
                 LBUG();
         }
@@ -79,7 +89,7 @@ static int reply_out_callback(ptl_event_t *ev)
 /*
  * Wake up the thread waiting for the reply once it comes in.
  */
-static int reply_in_callback(ptl_event_t *ev)
+int reply_in_callback(ptl_event_t *ev)
 {
         struct ptlrpc_request *req = ev->mem_desc.user_ptr;
         ENTRY;
@@ -114,13 +124,14 @@ static int reply_in_callback(ptl_event_t *ev)
 int request_in_callback(ptl_event_t *ev)
 {
         struct ptlrpc_request_buffer_desc *rqbd = ev->mem_desc.user_ptr;
-        struct ptlrpc_service *service = rqbd->rqbd_service;
+        struct ptlrpc_srv_ni  *srv_ni = rqbd->rqbd_srv_ni;
+        struct ptlrpc_service *service = srv_ni->sni_service;
 
         /* requests always contiguous */
         LASSERT((ev->mem_desc.options & PTL_MD_IOV) == 0);
         /* we only enable puts */
         LASSERT(ev->type == PTL_EVENT_PUT);
-        LASSERT(atomic_read(&service->srv_nrqbds_receiving) > 0);
+        LASSERT(atomic_read(&srv_ni->sni_nrqbds_receiving) > 0);
         LASSERT(atomic_read(&rqbd->rqbd_refcount) > 0);
 
         if (ev->rlength != ev->mlength)
@@ -138,7 +149,7 @@ int request_in_callback(ptl_event_t *ev)
 
                 /* we're off the air */
                 /* we'll probably start dropping packets in portals soon */
-                if (atomic_dec_and_test(&service->srv_nrqbds_receiving))
+                if (atomic_dec_and_test(&srv_ni->sni_nrqbds_receiving))
                         CERROR("All request buffers busy\n");
         } else {
                 /* +1 ref for service thread */
@@ -336,74 +347,153 @@ static int bulk_get_sink_callback(ptl_event_t *ev)
         RETURN(0);
 }
 
-int ptlrpc_init_portals(void)
+/*
+ * Translate 'uuid' into a ptlrpc peer.
+ *
+ * The uuid is first resolved with lustre_uuid_to_peer(); the NI handle it
+ * yields is then matched against each initialised entry of
+ * ptlrpc_interfaces[].  On a match, 'peer' receives the resolved nid and a
+ * pointer to the matching interface.
+ *
+ * Returns 0 on success, the lustre_uuid_to_peer() error code if the lookup
+ * fails, or -ENOENT if no initialised interface carries that NI handle.
+ * 'peer' is only written on success.
+ */
+int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, struct ptlrpc_peer *peer) 
 {
-        int rc;
-        ptl_handle_ni_t ni;
-
-        /* Use the qswnal if it's there */
-        if ((qswnal_nip = inter_module_get("kqswnal_ni")) != NULL)
-                ni = *qswnal_nip;
-        else if ((gmnal_nip = inter_module_get("kgmnal_ni")) != NULL)
-                ni = *gmnal_nip;
-        else if ((socknal_nip = inter_module_get("ksocknal_ni")) != NULL)
-                ni = *socknal_nip;
-        else if ((toenal_nip = inter_module_get("ktoenal_ni")) != NULL)
-                ni = *toenal_nip;
-        else {
-                CERROR("get_ni failed: is a NAL module loaded?\n");
-                return -EIO;
+        struct ptlrpc_ni   *pni;
+        struct lustre_peer  lpeer;
+        int                 i;
+        int                 rc = lustre_uuid_to_peer (uuid->uuid, &lpeer);
+        
+        /* NOTE(review): RETURN is used here although no ENTRY is visible in
+         * this function, and the other exits use plain return -- confirm
+         * the mix is intended. */
+        if (rc != 0)
+                RETURN (rc);
+        
+        for (i = 0; i < ptlrpc_ninterfaces; i++) {
+                pni = &ptlrpc_interfaces[i];
+
+                /* NI handles are compared bytewise over the whole struct */
+                if (!memcmp (&lpeer.peer_ni, &pni->pni_ni_h,
+                             sizeof (lpeer.peer_ni))) {
+                        peer->peer_nid = lpeer.peer_nid;
+                        peer->peer_ni = pni;
+                        return (0);
+                }
         }
+        
+        CERROR ("Can't find ptlrpc interface for "LPX64" ni handle %08lx %08lx\n",
+                lpeer.peer_nid, lpeer.peer_ni.nal_idx, lpeer.peer_ni.handle_idx);
+        return (-ENOENT);
+}
 
-        rc = PtlEQAlloc(ni, 1024, request_out_callback, &request_out_eq);
-        if (rc != PTL_OK)
-                CERROR("PtlEQAlloc failed: %d\n", rc);
+/*
+ * Tear down one ptlrpc network interface: free all seven of its event
+ * queues and drop the inter-module reference held under pni->pni_name.
+ *
+ * Every EQ handle is passed to PtlEQFree() unconditionally and the return
+ * codes are ignored; ptlrpc_ni_init() relies on this when it calls here
+ * with handles it has only invalidated, not allocated.
+ */
+void ptlrpc_ni_fini (struct ptlrpc_ni *pni) 
+{
+        PtlEQFree(pni->pni_request_out_eq_h);
+        PtlEQFree(pni->pni_reply_out_eq_h);
+        PtlEQFree(pni->pni_reply_in_eq_h);
+        PtlEQFree(pni->pni_bulk_put_source_eq_h);
+        PtlEQFree(pni->pni_bulk_put_sink_eq_h);
+        PtlEQFree(pni->pni_bulk_get_source_eq_h);
+        PtlEQFree(pni->pni_bulk_get_sink_eq_h);
+        
+        /* balances the inter_module_get() done in ptlrpc_ni_init() */
+        inter_module_put(pni->pni_name);
+}
 
-        rc = PtlEQAlloc(ni, 1024, reply_out_callback, &reply_out_eq);
-        if (rc != PTL_OK)
-                CERROR("PtlEQAlloc failed: %d\n", rc);
+/*
+ * Initialise 'pni' for the network interface registered under 'name'.
+ *
+ * Looks the NI handle up with inter_module_get(), records the name and
+ * handle in 'pni', marks every event queue handle invalid, and then
+ * allocates one 1024-entry event queue per callback (request out, reply
+ * out, reply in, and the four bulk put/get source/sink queues).
+ *
+ * 'name' is stored by pointer, not copied.
+ *
+ * Returns 0 on success, -ENOENT if no interface is registered under 'name'
+ * (nothing to undo in that case), or -ENOMEM if any PtlEQAlloc() fails --
+ * the Portals error code itself is discarded.  On the -ENOMEM path the
+ * interface is fully torn down again via ptlrpc_ni_fini().
+ */
+int ptlrpc_ni_init (char *name, struct ptlrpc_ni *pni) 
+{
+        int              rc;
+        ptl_handle_ni_t *nip;
 
-        rc = PtlEQAlloc(ni, 1024, reply_in_callback, &reply_in_eq);
+        nip = (ptl_handle_ni_t *)inter_module_get (name);
+        if (nip == NULL) {
+                CDEBUG (D_NET, "Network interface %s not loaded\n", name);
+                return (-ENOENT);
+        }
+        
+        CDEBUG (D_NET, "init %s: nal_idx %ld\n", name, nip->nal_idx);
+                
+        pni->pni_name = name;
+        pni->pni_ni_h = *nip;
+
+        /* Invalidate every EQ handle up front so the failure path can run
+         * the complete teardown no matter how far allocation got. */
+        ptl_set_inv_handle (&pni->pni_request_out_eq_h);
+        ptl_set_inv_handle (&pni->pni_reply_out_eq_h);
+        ptl_set_inv_handle (&pni->pni_reply_in_eq_h);
+        ptl_set_inv_handle (&pni->pni_bulk_put_source_eq_h);
+        ptl_set_inv_handle (&pni->pni_bulk_put_sink_eq_h);
+        ptl_set_inv_handle (&pni->pni_bulk_get_source_eq_h);
+        ptl_set_inv_handle (&pni->pni_bulk_get_sink_eq_h);
+        
+        /* NB We never actually PtlEQGet() out of these event queues since
+         * we're only interested in the event callback, so we can just let
+         * them wrap.  Their sizes aren't a big deal, apart from providing
+         * a little history for debugging... */
+        
+        rc = PtlEQAlloc(pni->pni_ni_h, 1024, request_out_callback, 
+                        &pni->pni_request_out_eq_h);
         if (rc != PTL_OK)
-                CERROR("PtlEQAlloc failed: %d\n", rc);
-
-        rc = PtlEQAlloc(ni, 1024, bulk_put_source_callback, 
-                        &bulk_put_source_eq);
+                GOTO (fail, rc = -ENOMEM);
+                
+        rc = PtlEQAlloc(pni->pni_ni_h, 1024, reply_out_callback, 
+                        &pni->pni_reply_out_eq_h);
         if (rc != PTL_OK)
-                CERROR("PtlEQAlloc failed: %d\n", rc);
-
-        rc = PtlEQAlloc(ni, 1024, bulk_put_sink_callback, &bulk_put_sink_eq);
+                GOTO (fail, rc = -ENOMEM);
+        
+        rc = PtlEQAlloc(pni->pni_ni_h, 1024, reply_in_callback,
+                        &pni->pni_reply_in_eq_h);
         if (rc != PTL_OK)
-                CERROR("PtlEQAlloc failed: %d\n", rc);
-
-        rc = PtlEQAlloc(ni, 1024, bulk_get_source_callback, 
-                        &bulk_get_source_eq);
+                GOTO (fail, rc = -ENOMEM);
+                
+        rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_put_source_callback,
+                        &pni->pni_bulk_put_source_eq_h);
         if (rc != PTL_OK)
-                CERROR("PtlEQAlloc failed: %d\n", rc);
-
-        rc = PtlEQAlloc(ni, 1024, bulk_get_sink_callback, &bulk_get_sink_eq);
+                GOTO (fail, rc = -ENOMEM);
+                
+        rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_put_sink_callback,
+                        &pni->pni_bulk_put_sink_eq_h);
         if (rc != PTL_OK)
-                CERROR("PtlEQAlloc failed: %d\n", rc);
+                GOTO (fail, rc = -ENOMEM);
+                
+        rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_get_source_callback,
+                        &pni->pni_bulk_get_source_eq_h);
+        if (rc != PTL_OK)
+                GOTO (fail, rc = -ENOMEM);
+                
+        rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_get_sink_callback,
+                        &pni->pni_bulk_get_sink_eq_h);
+        if (rc != PTL_OK)
+                GOTO (fail, rc = -ENOMEM);
+        
+        return (0);
+ fail: 
+        CERROR ("Failed to initialise network interface %s: %d\n",
+                name, rc);
+
+        /* OK to do complete teardown since we invalidated the handles above... */
+        ptlrpc_ni_fini (pni);
+        return (rc);
+}
 
-        return rc;
+/*
+ * Bring up every known portals network interface.
+ *
+ * Each name in ni_names[] is offered to ptlrpc_ni_init(); interfaces that
+ * initialise successfully are packed contiguously into ptlrpc_interfaces[]
+ * and counted in ptlrpc_ninterfaces, while failures are simply skipped.
+ * Must be called with ptlrpc_ninterfaces == 0 (asserted).
+ *
+ * Returns 0 if at least one interface came up, -EIO if none did.
+ */
+int ptlrpc_init_portals(void)
+{
+        /* Add new portals network interface names here.
+         * Order is irrelevant! */
+        char *ni_names[] = { "kqswnal_ni",
+                             "kgmnal_ni",
+                             "ksocknal_ni",
+                             "ktoenal_ni",
+                             "tcpnal_ni",
+                             NULL };
+        int   rc;
+        int   i;
+        
+        LASSERT (ptlrpc_ninterfaces == 0);
+
+        for (i = 0; ni_names[i] != NULL; i++) {
+                /* there must still be a free slot in ptlrpc_interfaces[] */
+                LASSERT (ptlrpc_ninterfaces < 
+                         sizeof (ptlrpc_interfaces)/sizeof (ptlrpc_interfaces[0]));
+                
+                /* a failed init leaves the slot to be reused by the next name */
+                rc = ptlrpc_ni_init (ni_names[i],
+                                     &ptlrpc_interfaces[ptlrpc_ninterfaces]);
+                if (rc == 0)
+                        ptlrpc_ninterfaces++;
+        }
+        
+        if (ptlrpc_ninterfaces == 0) {
+                CERROR("network initialisation failed: is a NAL module loaded?\n");
+                return -EIO;
+        }
+        return 0;
 }
 
+/*
+ * Shut down every initialised network interface, last one first, via
+ * ptlrpc_ni_fini().  ptlrpc_ninterfaces is 0 when this returns.
+ */
 void ptlrpc_exit_portals(void)
 {
-        PtlEQFree(request_out_eq);
-        PtlEQFree(reply_out_eq);
-        PtlEQFree(reply_in_eq);
-        PtlEQFree(bulk_put_source_eq);
-        PtlEQFree(bulk_put_sink_eq);
-        PtlEQFree(bulk_get_source_eq);
-        PtlEQFree(bulk_get_sink_eq);
-
-        if (qswnal_nip != NULL)
-                inter_module_put("kqswnal_ni");
-        if (socknal_nip != NULL)
-                inter_module_put("ksocknal_ni");
-        if (gmnal_nip != NULL)
-                inter_module_put("kgmnal_ni");
-        if (toenal_nip != NULL)
-                inter_module_put("ktoenal_ni");
+        /* the pre-decrement both selects the slot and shrinks the count */
+        while (ptlrpc_ninterfaces > 0)
+                ptlrpc_ni_fini (&ptlrpc_interfaces[--ptlrpc_ninterfaces]);
 }
index ef3a215..62a76c4 100644 (file)
  */
 
 #define DEBUG_SUBSYSTEM S_RPC
-
+#ifndef __KERNEL__
+#include <liblustre.h>
+#include <portals/lib-types.h>
+#endif
 #include <linux/obd_support.h>
 #include <linux/lustre_net.h>
 #include <linux/lustre_lib.h>
 #include <linux/obd.h>
 
-extern ptl_handle_eq_t request_out_eq, reply_in_eq, reply_out_eq,
-        bulk_put_source_eq, bulk_put_sink_eq, 
-        bulk_get_source_eq, bulk_get_sink_eq;
-
 static int ptl_send_buf(struct ptlrpc_request *request,
                         struct ptlrpc_connection *conn, int portal)
 {
         int rc;
         ptl_process_id_t remote_id;
         ptl_handle_md_t md_h;
+        ptl_ack_req_t ack_req;
 
         LASSERT(conn);
+        CDEBUG (D_INFO, "conn=%p ni %s nid "LPX64" on %s\n", 
+                conn, conn->c_peer.peer_ni->pni_name,
+                conn->c_peer.peer_nid, conn->c_peer.peer_ni->pni_name);
 
         request->rq_req_md.user_ptr = request;
 
@@ -47,24 +50,35 @@ static int ptl_send_buf(struct ptlrpc_request *request,
                 request->rq_reqmsg->type = HTON__u32(request->rq_type);
                 request->rq_req_md.start = request->rq_reqmsg;
                 request->rq_req_md.length = request->rq_reqlen;
-                request->rq_req_md.eventq = request_out_eq;
+                request->rq_req_md.eventq = conn->c_peer.peer_ni->pni_request_out_eq_h;
                 break;
         case PTL_RPC_MSG_ERR:
         case PTL_RPC_MSG_REPLY:
                 request->rq_repmsg->type = HTON__u32(request->rq_type);
                 request->rq_req_md.start = request->rq_repmsg;
                 request->rq_req_md.length = request->rq_replen;
-                request->rq_req_md.eventq = reply_out_eq;
+                request->rq_req_md.eventq = conn->c_peer.peer_ni->pni_reply_out_eq_h;
                 break;
         default:
                 LBUG();
                 return -1; /* notreached */
         }
-        request->rq_req_md.threshold = 1;
+        if (request->rq_flags & PTL_RPC_FL_WANT_ACK) {
+                request->rq_req_md.threshold = 2; /* SENT and ACK */
+                ack_req = PTL_ACK_REQ;
+        } else {
+                request->rq_req_md.threshold = 1;
+                ack_req = PTL_NOACK_REQ;
+        }
         request->rq_req_md.options = PTL_MD_OP_PUT;
         request->rq_req_md.user_ptr = request;
 
-        rc = PtlMDBind(conn->c_peer.peer_ni, request->rq_req_md, &md_h);
+        if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_ACK | OBD_FAIL_ONCE)) {
+                request->rq_req_md.options |= PTL_MD_ACK_DISABLE;
+                obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED;
+        }
+
+        rc = PtlMDBind(conn->c_peer.peer_ni->pni_ni_h, request->rq_req_md, &md_h);
         if (rc != 0) {
                 CERROR("PtlMDBind failed: %d\n", rc);
                 LBUG();
@@ -79,8 +93,7 @@ static int ptl_send_buf(struct ptlrpc_request *request,
 
         if (!portal)
                 LBUG();
-        rc = PtlPut(md_h, PTL_NOACK_REQ, remote_id, portal, 0, request->rq_xid,
-                    0, 0);
+        rc = PtlPut(md_h, ack_req, remote_id, portal, 0, request->rq_xid, 0, 0);
         if (rc != PTL_OK) {
                 CERROR("PtlPut("LPU64", %d, "LPD64") failed: %d\n",
                        remote_id.nid, portal, request->rq_xid, rc);
@@ -117,6 +130,7 @@ ptlrpc_put_bulk_iov (struct ptlrpc_bulk_desc *desc, struct iovec *iov)
 int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *desc)
 {
         int rc;
+        struct ptlrpc_peer *peer;
         struct list_head *tmp, *next;
         ptl_process_id_t remote_id;
         __u32 xid = 0;
@@ -127,10 +141,12 @@ int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *desc)
         if (iov == NULL)
                 RETURN (-ENOMEM);
 
+        peer = &desc->bd_connection->c_peer;
+
         desc->bd_md.start = iov;
         desc->bd_md.niov = 0;
         desc->bd_md.length = 0;
-        desc->bd_md.eventq = bulk_put_source_eq;
+        desc->bd_md.eventq = peer->peer_ni->pni_bulk_put_source_eq_h;
         desc->bd_md.threshold = 2; /* SENT and ACK */
         desc->bd_md.options = PTL_MD_OP_PUT | PTL_MD_IOV;
         desc->bd_md.user_ptr = desc;
@@ -164,7 +180,7 @@ int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *desc)
         LASSERT(desc->bd_md.niov == desc->bd_page_count);
         LASSERT(desc->bd_md.niov != 0);
 
-        rc = PtlMDBind(desc->bd_connection->c_peer.peer_ni, desc->bd_md,
+        rc = PtlMDBind(peer->peer_ni->pni_ni_h, desc->bd_md,
                        &desc->bd_md_h);
 
         ptlrpc_put_bulk_iov (desc, iov); /*move down to reduce latency to send*/
@@ -175,12 +191,14 @@ int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *desc)
                 RETURN(rc);
         }
 
-        remote_id.nid = desc->bd_connection->c_peer.peer_nid;
+        remote_id.nid = peer->peer_nid;
         remote_id.pid = 0;
 
-        CDEBUG(D_NET, "Sending %u pages %u bytes to portal %d nid "LPX64" pid "
-               "%d xid %d\n", desc->bd_md.niov, desc->bd_md.length,
-               desc->bd_portal, remote_id.nid, remote_id.pid, xid);
+        CDEBUG(D_NET, "Sending %u pages %u bytes to portal %d on %s "
+               "nid "LPX64" pid %d xid %d\n", 
+               desc->bd_md.niov, desc->bd_md.length,
+               desc->bd_portal, peer->peer_ni->pni_name,
+               remote_id.nid, remote_id.pid, xid);
 
         rc = PtlPut(desc->bd_md_h, PTL_ACK_REQ, remote_id,
                     desc->bd_portal, 0, xid, 0, 0);
@@ -198,6 +216,7 @@ int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *desc)
 int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *desc)
 {
         int rc;
+        struct ptlrpc_peer *peer;
         struct list_head *tmp, *next;
         ptl_process_id_t remote_id;
         __u32 xid = 0;
@@ -208,10 +227,12 @@ int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *desc)
         if (iov == NULL)
                 RETURN (-ENOMEM);
 
+        peer = &desc->bd_connection->c_peer;
+
         desc->bd_md.start = iov;
         desc->bd_md.niov = 0;
         desc->bd_md.length = 0;
-        desc->bd_md.eventq = bulk_get_sink_eq;
+        desc->bd_md.eventq = peer->peer_ni->pni_bulk_get_sink_eq_h;
         desc->bd_md.threshold = 2; /* SENT and REPLY */
         desc->bd_md.options = PTL_MD_OP_GET | PTL_MD_IOV;
         desc->bd_md.user_ptr = desc;
@@ -231,10 +252,10 @@ int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *desc)
                 iov[desc->bd_md.niov].iov_base = bulk->bp_buf;
                 iov[desc->bd_md.niov].iov_len = bulk->bp_buflen;
                 if (iov[desc->bd_md.niov].iov_len <= 0) {
-                        CERROR("bad bp_buflen[%d] @ %p: %d\n", desc->bd_md.niov,
-                               bulk->bp_buf, bulk->bp_buflen);
-                        CERROR("desc: xid %u, pages %d, ptl %d, ref %d\n",
-                               xid, desc->bd_page_count, desc->bd_portal,
+                        CERROR("bad bulk %p bp_buflen[%d] @ %p: %d\n", bulk,
+                               desc->bd_md.niov, bulk->bp_buf, bulk->bp_buflen);
+                        CERROR("desc %p: xid %u, pages %d, ptl %d, ref %d\n",
+                               desc, xid, desc->bd_page_count, desc->bd_portal,
                                atomic_read(&desc->bd_refcount));
                         LBUG();
                 }
@@ -245,7 +266,7 @@ int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *desc)
         LASSERT(desc->bd_md.niov == desc->bd_page_count);
         LASSERT(desc->bd_md.niov != 0);
 
-        rc = PtlMDBind(desc->bd_connection->c_peer.peer_ni, desc->bd_md,
+        rc = PtlMDBind(peer->peer_ni->pni_ni_h, desc->bd_md,
                        &desc->bd_md_h);
 
         ptlrpc_put_bulk_iov (desc, iov); /*move down to reduce latency to send*/
@@ -259,9 +280,11 @@ int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *desc)
         remote_id.nid = desc->bd_connection->c_peer.peer_nid;
         remote_id.pid = 0;
 
-        CDEBUG(D_NET, "Sending %u pages %u bytes to portal %d nid "LPX64" pid "
-               "%d xid %d\n", desc->bd_md.niov, desc->bd_md.length,
-               desc->bd_portal, remote_id.nid, remote_id.pid, xid);
+        CDEBUG(D_NET, "Sending %u pages %u bytes to portal %d on %s "
+               "nid "LPX64" pid %d xid %d\n", 
+               desc->bd_md.niov, desc->bd_md.length,
+               desc->bd_portal, peer->peer_ni->pni_name,
+               remote_id.nid, remote_id.pid, xid);
 
         rc = PtlGet(desc->bd_md_h, remote_id, desc->bd_portal, 0, xid, 0);
         if (rc != PTL_OK) {
@@ -277,6 +300,7 @@ int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *desc)
 
 static int ptlrpc_register_bulk_shared(struct ptlrpc_bulk_desc *desc)
 {
+        struct ptlrpc_peer *peer;
         struct list_head *tmp, *next;
         int rc;
         __u32 xid = 0;
@@ -294,6 +318,8 @@ static int ptlrpc_register_bulk_shared(struct ptlrpc_bulk_desc *desc)
         if (iov == NULL)
                 return (-ENOMEM);
 
+        peer = &desc->bd_connection->c_peer;
+        
         desc->bd_md.start = iov;
         desc->bd_md.niov = 0;
         desc->bd_md.length = 0;
@@ -322,7 +348,7 @@ static int ptlrpc_register_bulk_shared(struct ptlrpc_bulk_desc *desc)
         source_id.nid = desc->bd_connection->c_peer.peer_nid;
         source_id.pid = PTL_PID_ANY;
 
-        rc = PtlMEAttach(desc->bd_connection->c_peer.peer_ni,
+        rc = PtlMEAttach(peer->peer_ni->pni_ni_h,
                          desc->bd_portal, source_id, xid, 0,
                          PTL_UNLINK, PTL_INS_AFTER, &desc->bd_me_h);
 
@@ -343,8 +369,8 @@ static int ptlrpc_register_bulk_shared(struct ptlrpc_bulk_desc *desc)
         ptlrpc_put_bulk_iov (desc, iov);
 
         CDEBUG(D_NET, "Setup bulk sink buffers: %u pages %u bytes, xid %u, "
-               "portal %u\n", desc->bd_md.niov, desc->bd_md.length,
-               xid, desc->bd_portal);
+               "portal %u on %s\n", desc->bd_md.niov, desc->bd_md.length,
+               xid, desc->bd_portal, peer->peer_ni->pni_name);
 
         RETURN(0);
 
@@ -358,7 +384,8 @@ static int ptlrpc_register_bulk_shared(struct ptlrpc_bulk_desc *desc)
+/*
+ * Register 'desc' for a bulk GET: sets the MD options to
+ * PTL_MD_OP_GET | PTL_MD_IOV, selects the bulk-get-source event queue of
+ * the interface the descriptor's connection uses, and then delegates to
+ * ptlrpc_register_bulk_shared(), whose return value is passed through.
+ */
 int ptlrpc_register_bulk_get(struct ptlrpc_bulk_desc *desc)
 {
         desc->bd_md.options = PTL_MD_OP_GET | PTL_MD_IOV;
-        desc->bd_md.eventq = bulk_get_source_eq;
+        desc->bd_md.eventq = 
+                desc->bd_connection->c_peer.peer_ni->pni_bulk_get_source_eq_h;
 
         return ptlrpc_register_bulk_shared(desc);
 }
@@ -366,7 +393,8 @@ int ptlrpc_register_bulk_get(struct ptlrpc_bulk_desc *desc)
+/*
+ * Register 'desc' for a bulk PUT: sets the MD options to
+ * PTL_MD_OP_PUT | PTL_MD_IOV, selects the bulk-put-sink event queue of
+ * the interface the descriptor's connection uses, and then delegates to
+ * ptlrpc_register_bulk_shared(), whose return value is passed through.
+ */
 int ptlrpc_register_bulk_put(struct ptlrpc_bulk_desc *desc)
 {
         desc->bd_md.options = PTL_MD_OP_PUT | PTL_MD_IOV;
-        desc->bd_md.eventq = bulk_put_sink_eq;
+        desc->bd_md.eventq = 
+                desc->bd_connection->c_peer.peer_ni->pni_bulk_put_sink_eq_h;
 
         return ptlrpc_register_bulk_shared(desc);
 }
@@ -391,6 +419,13 @@ void obd_brw_set_add(struct obd_brw_set *set, struct ptlrpc_bulk_desc *desc)
         list_add(&desc->bd_set_chain, &set->brw_desc_head);
 }
 
+/*
+ * Detach 'desc' from the brw set it belongs to: decrement the set's
+ * brw_refcount, unlink the descriptor from the set's chain (leaving its
+ * list head re-initialised), and drop one reference on the descriptor.
+ *
+ * desc->bd_brw_set is dereferenced without a check.
+ * NOTE(review): presumably the inverse of obd_brw_set_add() -- confirm
+ * that the refcount and reference dropped here are the ones taken there.
+ */
+void obd_brw_set_del(struct ptlrpc_bulk_desc *desc)
+{
+        atomic_dec(&desc->bd_brw_set->brw_refcount);
+        list_del_init(&desc->bd_set_chain);
+        ptlrpc_bulk_decref(desc);
+}
+
 struct obd_brw_set *obd_brw_set_new(void)
 {
         struct obd_brw_set *set;
@@ -411,11 +446,6 @@ void obd_brw_set_free(struct obd_brw_set *set)
         struct list_head *tmp, *next;
         ENTRY;
 
-        if (!list_empty(&set->brw_desc_head)) {
-                EXIT;
-                return;
-        }
-
         list_for_each_safe(tmp, next, &set->brw_desc_head) {
                 struct ptlrpc_bulk_desc *desc =
                         list_entry(tmp, struct ptlrpc_bulk_desc, bd_set_chain);
@@ -502,7 +532,7 @@ int ptl_send_rpc(struct ptlrpc_request *request)
                         }
                 }
 
-                rc = PtlMEAttach(request->rq_connection->c_peer.peer_ni,
+                rc = PtlMEAttach(request->rq_connection->c_peer.peer_ni->pni_ni_h,
                              request->rq_reply_portal,/* XXX FIXME bug 625069 */
                                  source_id, request->rq_xid, 0, PTL_UNLINK,
                                  PTL_INS_AFTER, &request->rq_reply_me_h);
@@ -517,7 +547,8 @@ int ptl_send_rpc(struct ptlrpc_request *request)
                 request->rq_reply_md.threshold = 1;
                 request->rq_reply_md.options = PTL_MD_OP_PUT;
                 request->rq_reply_md.user_ptr = request;
-                request->rq_reply_md.eventq = reply_in_eq;
+                request->rq_reply_md.eventq =
+                        request->rq_connection->c_peer.peer_ni->pni_reply_in_eq_h;
 
                 rc = PtlMDAttach(request->rq_reply_me_h, request->rq_reply_md,
                                  PTL_UNLINK, NULL);
@@ -528,14 +559,16 @@ int ptl_send_rpc(struct ptlrpc_request *request)
                 }
 
                 CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64
-                       ", portal %u\n",
+                       ", portal %u on %s\n",
                        request->rq_replen, request->rq_xid,
-                       request->rq_reply_portal);
+                       request->rq_reply_portal,
+                       request->rq_connection->c_peer.peer_ni->pni_name);
         }
 
         /* Clear any flags that may be present from previous sends,
-         * except for REPLAY. */
-        request->rq_flags &= PTL_RPC_FL_REPLAY;
+         * except for REPLAY, NO_RESEND and WANT_ACK. */
+        request->rq_flags &= (PTL_RPC_FL_REPLAY | PTL_RPC_FL_NO_RESEND |
+                              PTL_RPC_FL_WANT_ACK);
         rc = ptl_send_buf(request, request->rq_connection,
                           request->rq_request_portal);
         RETURN(rc);
@@ -551,7 +584,8 @@ int ptl_send_rpc(struct ptlrpc_request *request)
 
 void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd)
 {
-        struct ptlrpc_service *service = rqbd->rqbd_service;
+        struct ptlrpc_srv_ni *srv_ni = rqbd->rqbd_srv_ni;
+        struct ptlrpc_service *service = srv_ni->sni_service;
         static ptl_process_id_t match_id = {PTL_NID_ANY, PTL_PID_ANY};
         int rc;
         ptl_md_t dummy;
@@ -559,8 +593,13 @@ void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd)
 
         LASSERT(atomic_read(&rqbd->rqbd_refcount) == 0);
 
+        CDEBUG(D_NET, "PtlMEAttach: portal %d on %s h %lx.%lx\n",
+               service->srv_req_portal, srv_ni->sni_ni->pni_name,
+               srv_ni->sni_ni->pni_ni_h.nal_idx,
+               srv_ni->sni_ni->pni_ni_h.handle_idx);
+
         /* Attach the leading ME on which we build the ring */
-        rc = PtlMEAttach(service->srv_self.peer_ni, service->srv_req_portal,
+        rc = PtlMEAttach(srv_ni->sni_ni->pni_ni_h, service->srv_req_portal,
                          match_id, 0, ~0,
                          PTL_UNLINK, PTL_INS_AFTER, &rqbd->rqbd_me_h);
         if (rc != PTL_OK) {
@@ -574,9 +613,9 @@ void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd)
         dummy.threshold  = PTL_MD_THRESH_INF;
         dummy.options    = PTL_MD_OP_PUT | PTL_MD_MAX_SIZE | PTL_MD_AUTO_UNLINK;
         dummy.user_ptr   = rqbd;
-        dummy.eventq     = service->srv_eq_h;
+        dummy.eventq     = srv_ni->sni_eq_h;
 
-        atomic_inc(&service->srv_nrqbds_receiving);
+        atomic_inc(&srv_ni->sni_nrqbds_receiving);
         atomic_set(&rqbd->rqbd_refcount, 1);   /* 1 ref for portals */
 
         rc = PtlMDAttach(rqbd->rqbd_me_h, dummy, PTL_UNLINK, &md_h);
@@ -586,6 +625,6 @@ void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd)
 #warning proper cleanup required
                 PtlMEUnlink (rqbd->rqbd_me_h);
                 atomic_set(&rqbd->rqbd_refcount, 0);
-                atomic_dec(&service->srv_nrqbds_receiving);
+                atomic_dec(&srv_ni->sni_nrqbds_receiving);
         }
 }
index 10e8200..12be831 100644 (file)
@@ -23,6 +23,9 @@
  */
 
 #define DEBUG_SUBSYSTEM S_RPC
+#ifndef __KERNEL__
+#include <liblustre.h>
+#endif
 
 #include <linux/obd_support.h>
 #include <linux/lustre_net.h>
index 279c903..21cb3fe 100644 (file)
  */
 
 #define DEBUG_SUBSYSTEM S_RPC
-
+#ifndef __KERNEL__
+#include <liblustre.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#else 
 #include <linux/lustre_lite.h>
+#endif
+
 #include <linux/lustre_ha.h>
 #include <linux/obd_support.h>
 
@@ -122,9 +128,9 @@ void recovd_conn_fail(struct ptlrpc_connection *conn)
                 return;
         }
 
-        CERROR("connection %p to %s (%08x %08lx %08lx) failed\n", conn,
+        CERROR("connection %p to %s nid "LPX64" on %s failed\n", conn,
                conn->c_remote_uuid.uuid, conn->c_peer.peer_nid,
-               conn->c_peer.peer_ni.nal_idx, conn->c_peer.peer_ni.handle_idx);
+               conn->c_peer.peer_ni->pni_name);
         list_del(&rd->rd_managed_chain);
         list_add_tail(&rd->rd_managed_chain, &recovd->recovd_troubled_items);
         if (rd->rd_phase != RD_IDLE) {
@@ -272,6 +278,7 @@ static int recovd_handle_event(struct recovd_obd *recovd)
         RETURN(0);
 }
 
+#ifdef __KERNEL__
 static int recovd_main(void *arg)
 {
         struct recovd_obd *recovd = (struct recovd_obd *)arg;
@@ -316,7 +323,7 @@ static int recovd_main(void *arg)
 
 int recovd_setup(struct recovd_obd *recovd)
 {
-        int rc;
+        int rc = 0; /* initialize for Liblustre */
 
         ENTRY;
 
@@ -342,6 +349,12 @@ int recovd_setup(struct recovd_obd *recovd)
 
         RETURN(0);
 }
+#else 
+/* Variant built when __KERNEL__ is not defined: performs no setup,
+ * ignores 'recovd' and always returns 0. */
+int recovd_setup(struct recovd_obd *recovd)
+{
+        return 0;
+}
+#endif
 
 int recovd_cleanup(struct recovd_obd *recovd)
 {
index 1c99fed..a1464a3 100644 (file)
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#define DEBUG_SUBSYSTEM S_RPC
+#ifdef __KERNEL__
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/kmod.h>
-
-#define DEBUG_SUBSYSTEM S_RPC
+#else 
+#include <liblustre.h>
+#endif
 
 #include <linux/lustre_ha.h>
 #include <linux/lustre_net.h>
@@ -230,11 +233,6 @@ int ptlrpc_resend(struct obd_import *imp)
         ENTRY;
 
         spin_lock_irqsave(&imp->imp_lock, flags);
-        list_for_each(tmp, &imp->imp_sending_list) {
-                req = list_entry(tmp, struct ptlrpc_request, rq_list);
-                DEBUG_REQ(D_HA, req, "SENDING: ");
-        }
-
         list_for_each_safe(tmp, pos, &imp->imp_sending_list) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
 
@@ -243,12 +241,10 @@ int ptlrpc_resend(struct obd_import *imp)
                         break;
 
                     case RESTART:
-                        DEBUG_REQ(D_HA, req, "RESTART:");
                         ptlrpc_restart_req(req);
                         break;
 
                     case RESEND_IGNORE:
-                        DEBUG_REQ(D_HA, req, "RESEND_IGNORE:");
                         rc = ptlrpc_replay_req(req);
                         if (rc) {
                                 DEBUG_REQ(D_ERROR, req, "error %d resending:",
@@ -258,7 +254,6 @@ int ptlrpc_resend(struct obd_import *imp)
                         break;
 
                     case RESEND:
-                        DEBUG_REQ(D_HA, req, "RESEND:");
                         ptlrpc_resend_req(req);
                         break;
 
index 95fe7ec..0f13acf 100644 (file)
 #define EXPORT_SYMTAB
 #define DEBUG_SUBSYSTEM S_RPC
 
-#include <linux/module.h>
+#ifdef __KERNEL__
+# include <linux/module.h>
+# include <linux/init.h>
+#else
+# include <liblustre.h>
+#endif
+#include <linux/obd.h>
 #include <linux/obd_support.h>
 #include <linux/obd_class.h>
 #include <linux/lustre_lib.h>
 #include <linux/lustre_ha.h>
 #include <linux/lustre_net.h>
-#include <linux/init.h>
 #include <linux/lprocfs_status.h>
 
 extern int ptlrpc_init_portals(void);
 extern void ptlrpc_exit_portals(void);
 
+static __u32 ptlrpc_last_xid = 0;
+static spinlock_t ptlrpc_last_xid_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Hand out the next xid.  ptlrpc_last_xid is pre-incremented under
+ * ptlrpc_last_xid_lock, so the first value returned is 1 and concurrent
+ * callers each receive a distinct value.
+ */
+__u32 ptlrpc_next_xid(void)
+{
+        __u32 tmp;
+        spin_lock(&ptlrpc_last_xid_lock);
+        tmp = ++ptlrpc_last_xid;
+        spin_unlock(&ptlrpc_last_xid_lock);
+        return tmp;
+}
 
 int connmgr_setup(struct obd_device *obddev, obd_count len, void *buf)
 {
@@ -96,7 +112,8 @@ int connmgr_iocontrol(unsigned int cmd, struct lustre_handle *hdl, int len,
                         LASSERT(conn->c_recovd_data.rd_recovd == recovd);
 
 #warning check buffer overflow in next line
-                        if (!strcmp(conn->c_remote_uuid.uuid, data->ioc_inlbuf1))
+                        if (!strcmp(conn->c_remote_uuid.uuid,
+                                    data->ioc_inlbuf1))
                                 break;
                         conn = NULL;
                 }
@@ -154,9 +171,11 @@ static int connmgr_connect(struct lustre_handle *conn, struct obd_device *src,
+/*
+ * Attach method for the connection manager device: initialises the lprocfs
+ * variable tables and attaches lvars.obd_vars to 'dev'.  'len' and 'data'
+ * are unused.  Returns the lprocfs_obd_attach() result.
+ */
 int connmgr_attach(struct obd_device *dev, obd_count len, void *data)
 {
         struct lprocfs_static_vars lvars;
+        int rc = 0;
 
         lprocfs_init_vars(&lvars);
-        return lprocfs_obd_attach(dev, lvars.obd_vars);
+        rc = lprocfs_obd_attach(dev, lvars.obd_vars);
+        return rc;
 }
 
 int conmgr_detach(struct obd_device *dev)
@@ -176,7 +195,9 @@ static struct obd_ops recovd_obd_ops = {
         o_disconnect:   class_disconnect
 };
 
-static int __init ptlrpc_init(void)
+
+
+__init int ptlrpc_init(void)
 {
         struct lprocfs_static_vars lvars;
         int rc;
@@ -204,6 +225,9 @@ static void __exit ptlrpc_exit(void)
         ptlrpc_cleanup_connection();
 }
 
+/* rpc.c */
+EXPORT_SYMBOL(ptlrpc_next_xid);
+
 /* recovd.c */
 EXPORT_SYMBOL(ptlrpc_recovd);
 EXPORT_SYMBOL(recovd_conn_fail);
@@ -234,6 +258,7 @@ EXPORT_SYMBOL(ptlrpc_link_svc_me);
 EXPORT_SYMBOL(obd_brw_set_free);
 EXPORT_SYMBOL(obd_brw_set_new);
 EXPORT_SYMBOL(obd_brw_set_add);
+EXPORT_SYMBOL(obd_brw_set_del);
 
 /* client.c */
 EXPORT_SYMBOL(ptlrpc_init_client);
@@ -246,6 +271,7 @@ EXPORT_SYMBOL(ptlrpc_replay_req);
 EXPORT_SYMBOL(ptlrpc_restart_req);
 EXPORT_SYMBOL(ptlrpc_prep_req);
 EXPORT_SYMBOL(ptlrpc_free_req);
+EXPORT_SYMBOL(ptlrpc_abort);
 EXPORT_SYMBOL(ptlrpc_req_finished);
 EXPORT_SYMBOL(ptlrpc_request_addref);
 EXPORT_SYMBOL(ptlrpc_prep_bulk);
@@ -275,9 +301,11 @@ EXPORT_SYMBOL(ptlrpc_replay);
 EXPORT_SYMBOL(ptlrpc_resend);
 EXPORT_SYMBOL(ptlrpc_wake_delayed);
 
+#ifdef __KERNEL__
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
 MODULE_DESCRIPTION("Lustre Request Processor");
 MODULE_LICENSE("GPL");
 
 module_init(ptlrpc_init);
 module_exit(ptlrpc_exit);
+#endif
index 0ea29b3..112d01d 100644 (file)
  */
 
 #define DEBUG_SUBSYSTEM S_RPC
-
+#ifndef __KERNEL__
+#include <liblustre.h>
+#include <linux/kp30.h>
+#endif
 #include <linux/obd_support.h>
 #include <linux/obd_class.h>
 #include <linux/lustre_net.h>
@@ -31,6 +34,9 @@ extern int request_in_callback(ptl_event_t *ev);
 static int ptlrpc_check_event(struct ptlrpc_service *svc,
                               struct ptlrpc_thread *thread, ptl_event_t *event)
 {
+        struct ptlrpc_srv_ni *srv_ni;
+        int i;
+        int idx;
         int rc;
         ENTRY;
 
@@ -40,22 +46,32 @@ static int ptlrpc_check_event(struct ptlrpc_service *svc,
                 GOTO(out, rc = 1);
 
         LASSERT ((thread->t_flags & SVC_EVENT) == 0);
-        LASSERT (ptl_is_valid_handle (&svc->srv_eq_h));
+        LASSERT (ptlrpc_ninterfaces > 0);
 
-        rc = PtlEQGet(svc->srv_eq_h, event);
-        switch (rc)
-        {
-        case PTL_OK:
-                thread->t_flags |= SVC_EVENT;
-                GOTO(out, rc = 1);
+        for (i = 0; i < ptlrpc_ninterfaces; i++) {
+                idx = (svc->srv_interface_rover + i) % ptlrpc_ninterfaces;
+                srv_ni = &svc->srv_interfaces[idx];
 
-        case PTL_EQ_EMPTY:
-                GOTO(out, rc = 0);
+                LASSERT (ptl_is_valid_handle (&srv_ni->sni_eq_h));
 
-        default:
-                CERROR("BUG: PtlEQGet returned %d\n", rc);
-                LBUG();
+                rc = PtlEQGet(srv_ni->sni_eq_h, event);
+                switch (rc)
+                {
+                case PTL_OK:
+                        /* next time start with the next interface */
+                        svc->srv_interface_rover = (idx+1) % ptlrpc_ninterfaces;
+                        thread->t_flags |= SVC_EVENT;
+                        GOTO(out, rc = 1);
+
+                case PTL_EQ_EMPTY:
+                        continue;
+
+                default:
+                        CERROR("BUG: PtlEQGet returned %d\n", rc);
+                        LBUG();
+                }
         }
+        rc = 0;
  out:
         spin_unlock(&svc->srv_lock);
         return rc;
@@ -65,15 +81,22 @@ struct ptlrpc_service *
 ptlrpc_init_svc(__u32 nevents, __u32 nbufs,
                 __u32 bufsize, __u32 max_req_size,
                 int req_portal, int rep_portal,
-                struct obd_uuid *uuid, svc_handler_t handler, char *name)
+                svc_handler_t handler, char *name)
 {
-        int err;
-        int rc, i;
+        int ssize;
+        int rc;
+        int i;
+        int j;
         struct ptlrpc_service *service;
+        struct ptlrpc_srv_ni  *srv_ni;
         ENTRY;
 
-        OBD_ALLOC(service, sizeof(*service));
-        if (!service)
+        LASSERT (ptlrpc_ninterfaces > 0);
+
+        ssize = offsetof (struct ptlrpc_service,
+                          srv_interfaces[ptlrpc_ninterfaces]);
+        OBD_ALLOC(service, ssize);
+        if (service == NULL)
                 RETURN(NULL);
 
         service->srv_name = name;
@@ -83,54 +106,73 @@ ptlrpc_init_svc(__u32 nevents, __u32 nbufs,
 
         service->srv_max_req_size = max_req_size;
         service->srv_buf_size = bufsize;
-        INIT_LIST_HEAD(&service->srv_rqbds);
-        service->srv_nrqbds = 0;
-        atomic_set(&service->srv_nrqbds_receiving, 0);
 
         service->srv_rep_portal = rep_portal;
         service->srv_req_portal = req_portal;
         service->srv_handler = handler;
-
-        err = kportal_uuid_to_peer(uuid->uuid, &service->srv_self);
-        if (err) {
-                CERROR("%s: cannot get peer for uuid '%s'\n", name, 
-                       uuid->uuid);
-                OBD_FREE(service, sizeof(*service));
-                RETURN(NULL);
-        }
-
-        rc = PtlEQAlloc(service->srv_self.peer_ni, nevents,
-                        request_in_callback, &(service->srv_eq_h));
-
-        if (rc != PTL_OK) {
-                CERROR("%s: PtlEQAlloc failed: %d\n", name, rc);
-                OBD_FREE(service, sizeof(*service));
-                RETURN(NULL);
+        service->srv_interface_rover = 0;
+
+        /* First initialise enough for early teardown */
+        for (i = 0; i < ptlrpc_ninterfaces; i++) {
+                srv_ni = &service->srv_interfaces[i];
+
+                srv_ni->sni_service = service;
+                srv_ni->sni_ni = &ptlrpc_interfaces[i];
+                ptl_set_inv_handle (&srv_ni->sni_eq_h);
+                INIT_LIST_HEAD(&srv_ni->sni_rqbds);
+                srv_ni->sni_nrqbds = 0;
+                atomic_set(&srv_ni->sni_nrqbds_receiving, 0);
         }
 
-        for (i = 0; i < nbufs; i++) {
-                struct ptlrpc_request_buffer_desc *rqbd;
-
-                OBD_ALLOC(rqbd, sizeof(*rqbd));
-                if (rqbd == NULL)
-                        GOTO(failed, NULL);
-
-                rqbd->rqbd_service = service;
-                ptl_set_inv_handle(&rqbd->rqbd_me_h);
-                atomic_set(&rqbd->rqbd_refcount, 0);
-                OBD_ALLOC(rqbd->rqbd_buffer, service->srv_buf_size);
-                if (rqbd->rqbd_buffer == NULL) {
-                        OBD_FREE(rqbd, sizeof(*rqbd));
-                        GOTO(failed, NULL);
+        /* Now allocate the event queue and request buffers, assuming all
+         * interfaces require the same level of buffering. */
+        for (i = 0; i < ptlrpc_ninterfaces; i++) {
+                srv_ni = &service->srv_interfaces[i];
+                CDEBUG (D_NET, "%s: initialising interface %s\n", name,
+                        srv_ni->sni_ni->pni_name);
+
+                rc = PtlEQAlloc(srv_ni->sni_ni->pni_ni_h, nevents,
+                                request_in_callback, &(srv_ni->sni_eq_h));
+                if (rc != PTL_OK) {
+                        CERROR("%s.%d: PtlEQAlloc on %s failed: %d\n",
+                               name, i, srv_ni->sni_ni->pni_name, rc);
+                        GOTO (failed, NULL);
                 }
-                list_add(&rqbd->rqbd_list, &service->srv_rqbds);
-                service->srv_nrqbds++;
 
-                ptlrpc_link_svc_me(rqbd);
+                for (j = 0; j < nbufs; j++) {
+                        struct ptlrpc_request_buffer_desc *rqbd;
+
+                        OBD_ALLOC(rqbd, sizeof(*rqbd));
+                        if (rqbd == NULL) {
+                                CERROR ("%s.%d: Can't allocate request "
+                                        "descriptor %d on %s\n",
+                                        name, i, srv_ni->sni_nrqbds,
+                                        srv_ni->sni_ni->pni_name);
+                                GOTO(failed, NULL);
+                        }
+
+                        rqbd->rqbd_srv_ni = srv_ni;
+                        ptl_set_inv_handle(&rqbd->rqbd_me_h);
+                        atomic_set(&rqbd->rqbd_refcount, 0);
+
+                        OBD_ALLOC(rqbd->rqbd_buffer, service->srv_buf_size);
+                        if (rqbd->rqbd_buffer == NULL) {
+                                CERROR ("%s.%d: Can't allocate request "
+                                        "buffer %d on %s\n",
+                                        name, i, srv_ni->sni_nrqbds,
+                                        srv_ni->sni_ni->pni_name);
+                                OBD_FREE(rqbd, sizeof(*rqbd));
+                                GOTO(failed, NULL);
+                        }
+                        list_add(&rqbd->rqbd_list, &srv_ni->sni_rqbds);
+                        srv_ni->sni_nrqbds++;
+
+                        ptlrpc_link_svc_me(rqbd);
+                }
         }
 
-        CDEBUG(D_NET, "Starting service listening on portal %d (eq: %lu)\n",
-               service->srv_req_portal, service->srv_eq_h.handle_idx);
+        CDEBUG(D_NET, "%s: Started on %d interfaces, listening on portal %d\n",
+               service->srv_name, ptlrpc_ninterfaces, service->srv_req_portal);
 
         RETURN(service);
 failed:
@@ -151,11 +193,12 @@ static int handle_incoming_request(struct obd_device *obddev,
 
         LASSERT (atomic_read (&rqbd->rqbd_refcount) > 0);
         LASSERT ((event->mem_desc.options & PTL_MD_IOV) == 0);
-        LASSERT (rqbd->rqbd_service == svc);
+        LASSERT (rqbd->rqbd_srv_ni->sni_service == svc);
         LASSERT (rqbd->rqbd_buffer == event->mem_desc.start);
         LASSERT (event->offset + event->mlength <= svc->srv_buf_size);
 
         memset(request, 0, sizeof(*request));
+        INIT_LIST_HEAD(&request->rq_list);
         request->rq_svc = svc;
         request->rq_obd = obddev;
         request->rq_xid = event->match_bits;
@@ -172,11 +215,10 @@ static int handle_incoming_request(struct obd_device *obddev,
                 goto out;
         }
 
-        CDEBUG(D_RPCTRACE, "Handling RPC pid:xid:nid:opc %d:"LPU64":"LPX64":%d\n",
-               NTOH__u32(request->rq_reqmsg->status),
-               request->rq_xid,
-               event->initiator.nid,
-               NTOH__u32(request->rq_reqmsg->opc));
+        CDEBUG(D_RPCTRACE, "Handling RPC ni:pid:xid:nid:opc %d:%d:"LPU64":"
+               LPX64":%d\n", (int)(rqbd->rqbd_srv_ni - svc->srv_interfaces),
+               NTOH__u32(request->rq_reqmsg->status), request->rq_xid,
+               event->initiator.nid, NTOH__u32(request->rq_reqmsg->opc));
 
         if (NTOH__u32(request->rq_reqmsg->type) != PTL_RPC_MSG_REQUEST) {
                 CERROR("wrong packet type received (type=%u)\n",
@@ -204,9 +246,7 @@ static int handle_incoming_request(struct obd_device *obddev,
                event->mem_desc.start, event->offset);
 
         request->rq_peer.peer_nid = event->initiator.nid;
-        /* FIXME: this NI should be the incoming NI.
-         * We don't know how to find that from here. */
-        request->rq_peer.peer_ni = svc->srv_self.peer_ni;
+        request->rq_peer.peer_ni = rqbd->rqbd_srv_ni->sni_ni;
 
         request->rq_export = class_conn2export((struct lustre_handle *)
                                                request->rq_reqmsg);
@@ -271,7 +311,9 @@ static int ptlrpc_main(void *arg)
 #endif
 
 #ifdef __arch_um__
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         sprintf(current->comm, "%s|%d", data->name,current->thread.extern_pid);
+#endif
 #else
         strcpy(current->comm, data->name);
 #endif
@@ -399,7 +441,9 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
 
 int ptlrpc_unregister_service(struct ptlrpc_service *service)
 {
+        int i;
         int rc;
+        struct ptlrpc_srv_ni *srv_ni;
 
         LASSERT (list_empty (&service->srv_threads));
 
@@ -408,39 +452,50 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service)
          * freeing them.
          */
 
-        while (!list_empty (&service->srv_rqbds)) {
-                struct ptlrpc_request_buffer_desc *rqbd =
-                        list_entry (service->srv_rqbds.next,
-                                    struct ptlrpc_request_buffer_desc,
-                                    rqbd_list);
-
-                list_del (&rqbd->rqbd_list);
-
-                LASSERT (atomic_read (&rqbd->rqbd_refcount) > 0);
-                /* refcount could be anything; it's possible for the
-                 * buffers to continued to get filled after all the server
-                 * threads exited.  But we know they _have_ exited.
-                 */
-
-                (void) PtlMEUnlink(rqbd->rqbd_me_h);
-                /* The callback handler could have unlinked this ME already
-                 * (we're racing with her) but it's safe to ensure it _has_
-                 * been unlinked.
-                 */
-
-                OBD_FREE (rqbd->rqbd_buffer, service->srv_buf_size);
-                OBD_FREE (rqbd, sizeof (*rqbd));
-                service->srv_nrqbds--;
-        }
+        for (i = 0; i < ptlrpc_ninterfaces; i++) {
+                srv_ni = &service->srv_interfaces[i];
+                CDEBUG (D_NET, "%s: tearing down interface %s\n",
+                        service->srv_name, srv_ni->sni_ni->pni_name);
+
+                while (!list_empty (&srv_ni->sni_rqbds)) {
+                        struct ptlrpc_request_buffer_desc *rqbd =
+                                list_entry (srv_ni->sni_rqbds.next,
+                                            struct ptlrpc_request_buffer_desc,
+                                            rqbd_list);
+
+                        list_del (&rqbd->rqbd_list);
+
+                        LASSERT (atomic_read (&rqbd->rqbd_refcount) > 0);
+                        /* refcount could be anything; it's possible for
+                         * the buffers to continue to get filled after all
+                         * the server threads exited.  But we know they
+                         * _have_ exited.
+                         */
+
+                        (void) PtlMEUnlink(rqbd->rqbd_me_h);
+                        /* The callback handler could have unlinked this ME
+                         * already (we're racing with her) but it's safe to
+                         * ensure it _has_ been unlinked.
+                         */
+
+                        OBD_FREE (rqbd->rqbd_buffer, service->srv_buf_size);
+                        OBD_FREE (rqbd, sizeof (*rqbd));
+                        srv_ni->sni_nrqbds--;
+                }
 
-        LASSERT (service->srv_nrqbds == 0);
+                LASSERT (srv_ni->sni_nrqbds == 0);
 
-        rc = PtlEQFree(service->srv_eq_h);
-        if (rc)
-                CERROR("PtlEQFree failed: %d\n", rc);
+                if (ptl_is_valid_handle (&srv_ni->sni_eq_h)) {
+                        rc = PtlEQFree(srv_ni->sni_eq_h);
+                        if (rc)
+                                CERROR("%s.%d: PtlEQFree failed on %s: %d\n",
+                                       service->srv_name, i,
+                                       srv_ni->sni_ni->pni_name, rc);
+                }
+        }
 
-        OBD_FREE(service, sizeof(*service));
-        if (rc)
-                LBUG();
-        return rc;
+        OBD_FREE(service,
+                 offsetof (struct ptlrpc_service,
+                           srv_interfaces[ptlrpc_ninterfaces]));
+        return 0;
 }
index fe700f1..6b40c41 100644 (file)
@@ -4,7 +4,7 @@
 %define linuxdir @LINUX@
 %define portalsdir @PORTALS@
 %define portalslibdir @PORTALSLIB@
-Release: 0301070810ltutor3
+Release: 0302240920chaos
 
 Summary: Lustre Lite File System
 Name: lustre-lite
@@ -51,20 +51,40 @@ Requires: openldap-servers, openldap-clients, python-ldap, 4Suite
 %description -n lustre-ldap
 Configures openldap server for LDAP Lustre config database
 
+
+%package -n liblustre
+Summary: Lustre Lib
+Group: Development/Kernel
+
+%description -n liblustre
+Lustre lib binary package.
+
+
 %prep
 %setup -qn lustre-%{version}
+%setup -c -n lustre-%{version}-lib
 
 %build
 rm -rf $RPM_BUILD_ROOT
 
 # Set an explicit path to our Linux tree, if we can.
+cd $RPM_BUILD_DIR/lustre-%{version}
 ./configure --with-linux='%{linuxdir}' --with-portals='%{portalsdir}' --with-portalslib='%{portalslibdir}'
 make
+cd $RPM_BUILD_DIR/lustre-%{version}-lib/lustre-%{version}
+./configure --with-lib --with-portals='%{portalsdir}' --with-portalslib='%{portalslibdir}'
+make
 
 %install
+cd $RPM_BUILD_DIR/lustre-%{version}
+make install prefix=$RPM_BUILD_ROOT
+
+cd $RPM_BUILD_DIR/lustre-%{version}-lib/lustre-%{version}
 make install prefix=$RPM_BUILD_ROOT
 
+
 # Create the pristine source directory.
+cd $RPM_BUILD_DIR/lustre-%{version}
 mkdir -p $RPM_BUILD_ROOT/usr/src
 rm -f lustre-source
 ln -s $RPM_BUILD_ROOT/usr/src lustre-source
@@ -120,6 +140,26 @@ mkdir -p $RPM_BUILD_ROOT/var/lib/ldap/lustre
 %files -n lustre-source
 %attr(-, root, root) /usr/src/lustre-%{version}
 
+%files -n liblustre
+%attr(-, root, root) /lib/lustre
+%attr(-, root, root) /lib/lustre/liblov.a
+%attr(-, root, root) /lib/lustre/liblustreclass.a
+%attr(-, root, root) /lib/lustre/libptlrpc.a
+%attr(-, root, root) /lib/lustre/libobdecho.a
+%attr(-, root, root) /lib/lustre/libldlm.a
+%attr(-, root, root) /lib/lustre/libosc.a
+%attr(-, root, root) /usr/sbin/lctl
+%attr(-, root, root) /usr/sbin/lfind
+%attr(-, root, root) /usr/sbin/lstripe
+%attr(-, root, root) /usr/sbin/obdio
+%attr(-, root, root) /usr/sbin/obdbarrier
+%attr(-, root, root) /usr/sbin/obdstat
+%attr(-, root, root) /usr/sbin/lload
+%attr(-, root, root) /usr/sbin/lconf
+%attr(-, root, root) /usr/sbin/lmc
+%attr(-, root, root) /usr/sbin/llanalyze
+
+
 %files -n lustre-ldap
 %attr(-, root, root) /etc/openldap/slapd-lustre.conf
 %attr(-, root, root) /etc/openldap/schema/lustre.schema
index 239e0fd..7a18486 100644 (file)
@@ -14,6 +14,7 @@ openme
 writeme
 mcreate
 munlink
+mlink
 tchmod
 toexcl
 fsx
@@ -33,3 +34,4 @@ checkstat
 wantedi
 createtest
 open_delay
+statone
index 116bf8d..628c46b 100644 (file)
@@ -9,7 +9,7 @@ EXTRA_DIST = $(pkgexample_SCRIPTS) $(noinst_SCRIPTS) $(noinst_DATA) \
        client-echo.cfg    elan-server.cfg  net-client.cfg  obdecho.cfg \
        client-mount.cfg   ldlm.cfg         net-local.cfg   obdfilter.cfg \
        client-mount2.cfg  lustre.cfg       net-server.cfg  sanity.sh \
-       rundbench          \
+       rundbench          mcreate \
        elan-client.cfg    mds.cfg      trivial.sh
 pkgexampledir = '${exec_prefix}/usr/lib/$(PACKAGE)/examples'
 pkgexample_SCRIPTS = llmount.sh llmountcleanup.sh llecho.sh llechocleanup.sh local.sh echo.sh uml.sh lov.sh
@@ -24,9 +24,9 @@ noinst_SCRIPTS += fs.sh intent-test.sh intent-test2.sh leak_finder.pl \
        runtests runvmstat snaprun.sh tbox.sh  common.sh
 noinst_PROGRAMS = openunlink testreq truncate directio openme writeme open_delay
 noinst_PROGRAMS += munlink tchmod toexcl fsx test_brw openclose createdestroy
-noinst_PROGRAMS += stat createmany statmany multifstat createtest
+noinst_PROGRAMS += stat createmany statmany multifstat createtest mlink
 # noinst_PROGRAMS += ldaptest 
-noinst_PROGRAMS += checkstat wantedi
+noinst_PROGRAMS += checkstat wantedi statone
 sbin_PROGRAMS = mcreate mkdirmany
 
 # ldaptest_SOURCES = ldaptest.c
@@ -35,6 +35,7 @@ toexcl_SOURCES = toexcl.c
 testreq_SOURCES = testreq.c
 mcreate_SOURCES = mcreate.c
 munlink_SOURCES = munlink.c
+mlink_SOURCES = mlink.c
 truncate_SOURCES = truncate.c
 directio_SOURCES = directio.c
 openunlink_SOURCES = openunlink.c
@@ -47,6 +48,7 @@ createdestroy_SOURCES = createdestroy.c
 stat_SOURCES = stat.c
 createmany_SOURCES = createmany.c
 statmany_SOURCES = statmany.c
+statone_SOURCES = statone.c
 mkdirmany_SOURCES = mkdirmany.c
 multifstat_SOURCES = multifstat.c
 checkstat_SOURCES = checkstat.c
diff --git a/lustre/tests/acceptance-metadata-double.sh b/lustre/tests/acceptance-metadata-double.sh
new file mode 100644 (file)
index 0000000..f647a55
--- /dev/null
@@ -0,0 +1,140 @@
+#!/bin/sh
+set -e
+
+#
+# Runs create.pl and rename.pl (both located via $SRCDIR) on two mountpoints with
+# increasing load, varying debug levels.  Assumes llmount2.sh has set up the node.
+#
+
+SRCDIR="`dirname $0`"
+CREATE=$SRCDIR/create.pl
+
+debug_client_on()
+{
+       echo -1 > /proc/sys/portals/debug
+}
+
+debug_client_off()
+{
+       echo 0 > /proc/sys/portals/debug
+}
+
+MNT=${MNT:-/mnt/lustre}
+
+debug_client_on
+echo "create.pl, 2 mounts, 1 thread, 10 ops, debug on"
+perl $CREATE -- $MNT 2 10
+echo "create.pl, 2 mounts, 1 thread, 100 ops, debug on"
+perl $CREATE --silent -- $MNT 2 100
+echo "create.pl --mcreate=0, 2 mounts, 1 thread, 10 ops, debug on"
+perl $CREATE --mcreate=0 -- $MNT 2 10
+echo "create.pl --mcreate=0, 2 mounts, 1 thread, 100 ops, debug on"
+perl $CREATE --mcreate=0 --silent -- $MNT 2 100
+echo "rename.pl, 2 mounts, 1 thread, 10 ops, debug on"
+perl $SRCDIR/rename.pl --count=2 $MNT 10
+echo "rename.pl, 2 mounts, 1 thread, 100 ops, debug on"
+perl $SRCDIR/rename.pl --count=2 --silent $MNT 100
+
+debug_client_off
+echo "create.pl, 2 mounts, 1 thread, 1000 ops, debug off"
+perl $CREATE --silent -- $MNT 2 1000
+echo "create.pl --mcreate=0, 2 mounts, 1 thread, 1000 ops, debug off"
+perl $CREATE --silent --mcreate=0 -- $MNT 2 1000
+echo "rename.pl, 2 mounts, 1 thread, 1000 ops, debug off"
+perl $SRCDIR/rename.pl --count=2 --silent $MNT 1000
+
+debug_client_on
+echo "create.pl, 2 mounts, 2 threads, 100 ops, debug on"
+perl $CREATE --silent -- $MNT 2 100 &
+perl $CREATE --silent -- $MNT 2 100 &
+wait
+echo "create.pl --mcreate=0, 2 mounts, 2 threads, 100 ops, debug on"
+perl $CREATE --silent --mcreate=0 -- $MNT 2 100 &
+perl $CREATE --silent --mcreate=0 -- $MNT 2 100 &
+wait
+echo "rename.pl, 2 mounts, 2 thread, 1000 ops, debug on"
+perl $SRCDIR/rename.pl --count=2 --silent $MNT 1000 &
+perl $SRCDIR/rename.pl --count=2 --silent $MNT 1000 &
+wait
+
+debug_client_off
+echo "create.pl, 2 mounts, 2 threads, 2000 ops, debug off"
+perl $CREATE --silent -- $MNT 2 2000 &
+perl $CREATE --silent -- $MNT 2 2000 &
+wait
+echo "create.pl --mcreate=0, 2 mounts, 2 threads, 2000 ops, debug off"
+perl $CREATE --silent --mcreate=0 -- $MNT 2 2000 &
+perl $CREATE --silent --mcreate=0 -- $MNT 2 2000 &
+wait
+echo "rename.pl, 2 mounts, 2 threads, 2000 ops, debug off"
+perl $SRCDIR/rename.pl --count=2 --silent $MNT 2000 &
+perl $SRCDIR/rename.pl --count=2 --silent $MNT 2000 &
+wait
+
+debug_client_on
+echo "create.pl, 2 mounts, 4 threads, 100 ops, debug on"
+for i in `seq 1 4`; do
+  perl $CREATE --silent -- $MNT 2 100 &
+done
+wait
+echo "create.pl --mcreate=0, 2 mounts, 4 threads, 100 ops, debug on"
+for i in `seq 1 4`; do
+  perl $CREATE --silent --mcreate=0 -- $MNT 2 100 &
+done
+wait
+echo "rename.pl, 2 mounts, 4 threads, 2000 ops, debug on"
+for i in `seq 1 4`; do
+  perl $SRCDIR/rename.pl --count=2 --silent $MNT 2000 &
+done
+wait
+
+debug_client_off
+echo "create.pl, 2 mounts, 4 threads, 2000 ops, debug off"
+for i in `seq 1 4`; do
+  perl $CREATE --silent -- $MNT 2 2000 &
+done
+wait
+echo "create.pl --mcreate=0, 2 mounts, 4 threads, 2000 ops, debug off"
+for i in `seq 1 4`; do
+  perl $CREATE --silent --mcreate=0 -- $MNT 2 2000 &
+done
+wait
+echo "rename.pl, 2 mounts, 4 threads, 2000 ops, debug off"
+for i in `seq 1 4`; do
+  perl $SRCDIR/rename.pl --count=2 --silent $MNT 2000 &
+done
+wait
+
+debug_client_on
+echo "create.pl, 2 mounts, 8 threads, 500 ops, debug on"
+for i in `seq 1 8`; do
+  perl $CREATE --silent -- $MNT 2 500 &
+done
+wait
+echo "create.pl --mcreate=0, 2 mounts, 8 threads, 500 ops, debug on"
+for i in `seq 1 8`; do
+  perl $CREATE --silent --mcreate=0 -- $MNT 2 500 &
+done
+wait
+echo "rename.pl, 2 mounts, 8 threads, 2000 ops, debug on"
+for i in `seq 1 8`; do
+  perl $SRCDIR/rename.pl --count=2 --silent $MNT 2000 &
+done
+wait
+
+debug_client_off
+echo "create.pl, 2 mounts, 8 threads, 2000 ops, debug off"
+for i in `seq 1 8`; do
+  perl $CREATE --silent -- $MNT 2 2000 &
+done
+wait
+echo "create.pl --mcreate=0, 2 mounts, 8 threads, 2000 ops, debug off"
+for i in `seq 1 8`; do
+  perl $CREATE --silent --mcreate=0 -- $MNT 2 2000 &
+done
+wait
+echo "rename.pl, 2 mounts, 8 threads, 2000 ops, debug off"
+for i in `seq 1 8`; do
+  perl $SRCDIR/rename.pl --count=2 --silent $MNT 2000 &
+done
+wait
index 501d2be..53774e5 100644 (file)
@@ -6,20 +6,30 @@ set -e
 # load, varying debug levels
 #
 
-SRCDIR="`dirname $0`/"
-. $SRCDIR/common.sh
+SRCDIR="`dirname $0`"
+CREATE=$SRCDIR/create.pl
+
+debug_client_on()
+{
+       echo -1 > /proc/sys/portals/debug
+}
+
+debug_client_off()
+{
+       echo 0 > /proc/sys/portals/debug
+}
 
 MNT=${MNT:-/mnt/lustre}
 
 debug_client_on
 echo "create.pl, 1 mount, 1 thread, 10 ops, debug on"
-perl create.pl -- $MNT -1 10
+perl $CREATE -- $MNT -1 10
 echo "create.pl, 1 mount, 1 thread, 100 ops, debug on"
-perl create.pl --silent -- $MNT -1 100
+perl $CREATE --silent -- $MNT -1 100
 echo "create.pl --mcreate=0, 1 mount, 1 thread, 10 ops, debug on"
-perl create.pl --mcreate=0 -- $MNT -1 10
+perl $CREATE --mcreate=0 -- $MNT -1 10
 echo "create.pl --mcreate=0, 1 mount, 1 thread, 100 ops, debug on"
-perl create.pl --mcreate=0 --silent -- $MNT -1 100
+perl $CREATE --mcreate=0 --silent -- $MNT -1 100
 echo "rename.pl, 1 mount, 1 thread, 10 ops, debug on"
 perl rename.pl $MNT 10
 echo "rename.pl, 1 mount, 1 thread, 100 ops, debug on"
@@ -27,20 +37,20 @@ perl rename.pl --silent $MNT 100
 
 debug_client_off
 echo "create.pl, 1 mount, 1 thread, 1000 ops, debug off"
-perl create.pl --silent -- $MNT -1 1000
+perl $CREATE --silent -- $MNT -1 1000
 echo "create.pl --mcreate=0, 1 mount, 1 thread, 1000 ops, debug off"
-perl create.pl --silent --mcreate=0 -- $MNT -1 1000
+perl $CREATE --silent --mcreate=0 -- $MNT -1 1000
 echo "rename.pl, 1 mount, 1 thread, 1000 ops, debug off"
 perl rename.pl --silent $MNT 1000
 
 debug_client_on
 echo "create.pl, 1 mount, 2 threads, 100 ops, debug on"
-perl create.pl --silent -- $MNT -1 100 &
-perl create.pl --silent -- $MNT -1 100 &
+perl $CREATE --silent -- $MNT -1 100 &
+perl $CREATE --silent -- $MNT -1 100 &
 wait
 echo "create.pl --mcreate=0, 1 mount, 2 threads, 100 ops, debug on"
-perl create.pl --silent --mcreate=0 -- $MNT -1 100 &
-perl create.pl --silent --mcreate=0 -- $MNT -1 100 &
+perl $CREATE --silent --mcreate=0 -- $MNT -1 100 &
+perl $CREATE --silent --mcreate=0 -- $MNT -1 100 &
 wait
 echo "rename.pl, 1 mount, 2 thread, 1000 ops, debug on"
 perl rename.pl --silent $MNT 1000 &
@@ -49,12 +59,12 @@ wait
 
 debug_client_off
 echo "create.pl, 1 mount, 2 threads, 2000 ops, debug off"
-perl create.pl --silent -- $MNT -1 2000 &
-perl create.pl --silent -- $MNT -1 2000 &
+perl $CREATE --silent -- $MNT -1 2000 &
+perl $CREATE --silent -- $MNT -1 2000 &
 wait
 echo "create.pl --mcreate=0, 1 mount, 2 threads, 2000 ops, debug off"
-perl create.pl --silent --mcreate=0 -- $MNT -1 2000 &
-perl create.pl --silent --mcreate=0 -- $MNT -1 2000 &
+perl $CREATE --silent --mcreate=0 -- $MNT -1 2000 &
+perl $CREATE --silent --mcreate=0 -- $MNT -1 2000 &
 wait
 echo "rename.pl, 1 mount, 2 threads, 2000 ops, debug off"
 perl rename.pl --silent $MNT 2000 &
@@ -64,12 +74,12 @@ wait
 debug_client_on
 echo "create.pl, 1 mount, 4 threads, 100 ops, debug on"
 for i in `seq 1 4`; do
-  perl create.pl --silent -- $MNT -1 100 &
+  perl $CREATE --silent -- $MNT -1 100 &
 done
 wait
 echo "create.pl --mcreate=0, 1 mount, 4 threads, 100 ops, debug on"
 for i in `seq 1 4`; do
-  perl create.pl --silent --mcreate=0 -- $MNT -1 100 &
+  perl $CREATE --silent --mcreate=0 -- $MNT -1 100 &
 done
 wait
 echo "rename.pl, 1 mount, 4 threads, 2000 ops, debug on"
@@ -81,12 +91,12 @@ wait
 debug_client_off
 echo "create.pl, 1 mount, 4 threads, 2000 ops, debug off"
 for i in `seq 1 4`; do
-  perl create.pl --silent -- $MNT -1 2000 &
+  perl $CREATE --silent -- $MNT -1 2000 &
 done
 wait
 echo "create.pl --mcreate=0, 1 mount, 4 threads, 2000 ops, debug off"
 for i in `seq 1 4`; do
-  perl create.pl --silent --mcreate=0 -- $MNT -1 2000 &
+  perl $CREATE --silent --mcreate=0 -- $MNT -1 2000 &
 done
 wait
 echo "rename.pl, 1 mount, 4 threads, 2000 ops, debug off"
@@ -98,12 +108,12 @@ wait
 debug_client_on
 echo "create.pl, 1 mount, 8 threads, 500 ops, debug on"
 for i in `seq 1 8`; do
-  perl create.pl --silent -- $MNT -1 500 &
+  perl $CREATE --silent -- $MNT -1 500 &
 done
 wait
 echo "create.pl --mcreate=0, 1 mount, 8 threads, 500 ops, debug on"
 for i in `seq 1 8`; do
-  perl create.pl --silent --mcreate=0 -- $MNT -1 500 &
+  perl $CREATE --silent --mcreate=0 -- $MNT -1 500 &
 done
 wait
 echo "rename.pl, 1 mount, 8 threads, 2000 ops, debug on"
@@ -115,12 +125,12 @@ wait
 debug_client_off
 echo "create.pl, 1 mount, 8 threads, 2000 ops, debug off"
 for i in `seq 1 8`; do
-  perl create.pl --silent -- $MNT -1 2000 &
+  perl $CREATE --silent -- $MNT -1 2000 &
 done
 wait
 echo "create.pl --mcreate=0, 1 mount, 8 threads, 2000 ops, debug off"
 for i in `seq 1 8`; do
-  perl create.pl --silent --mcreate=0 -- $MNT -1 2000 &
+  perl $CREATE --silent --mcreate=0 -- $MNT -1 2000 &
 done
 wait
 echo "rename.pl, 1 mount, 8 threads, 2000 ops, debug off"
@@ -128,3 +138,9 @@ for i in `seq 1 8`; do
   perl rename.pl --silent $MNT 2000 &
 done
 wait
+sh rundbench 1
+sh rundbench 2
+sh rundbench 4
+sh rundbench 8
+sh rundbench 16
+sh rundbench 32
index 286f417..bee6588 100755 (executable)
@@ -3,8 +3,14 @@
 # the CVS HEAD are allowed.
 set -vxe
 
+[ "$CONFIGS" -a -z "$SANITYN" ] && SANITYN=no
 [ "$CONFIGS" ] || CONFIGS="local lov"
-[ "$THREADS" ] || THREADS=1
+[ "$MAX_THREADS" ] || MAX_THREADS=50
+if [ -z "$THREADS" ]; then
+       KB=`awk '/MemTotal:/ { print $2 }' /proc/meminfo`
+       THREADS=`expr $KB / 16384 || true`  # expr exits 1 on a 0 result; keep set -e happy
+       [ $THREADS -gt $MAX_THREADS ] && THREADS=$MAX_THREADS
+fi
 [ "$SIZE" ] || SIZE=20480
 [ "$RSIZE" ] || RSIZE=64
 [ "$UID" ] || UID=1000
@@ -27,13 +33,17 @@ for NAME in $CONFIGS; do
 
        if [ "$DBENCH" != "no" ]; then
                mount | grep $MNT || sh llmount.sh
+               SPACE=`df $MNT | tail -1 | awk '{ print $4 }'`
+               DB_THREADS=`expr $SPACE / 50000 || true`  # 0 result must not trip set -e
+               [ $THREADS -lt $DB_THREADS ] && DB_THREADS=$THREADS
+
                $DEBUG_OFF
                sh rundbench 1
                sh llmountcleanup.sh
                sh llrmount.sh
-               if [ $THREADS -gt 1 ]; then
+               if [ $DB_THREADS -gt 1 ]; then
                        $DEBUG_OFF
-                       sh rundbench $THREADS
+                       sh rundbench $DB_THREADS
                        sh llmountcleanup.sh
                        sh llrmount.sh
                fi
@@ -58,21 +68,24 @@ for NAME in $CONFIGS; do
        fi
        if [ "$IOZONE_DIR" != "no" ]; then
                mount | grep $MNT || sh llmount.sh
+               SPACE=`df $MNT | tail -1 | awk '{ print $4 }'`
+               IOZ_THREADS=`expr $SPACE / $SIZE || true`  # 0 result must not trip set -e
+               [ $THREADS -lt $IOZ_THREADS ] && IOZ_THREADS=$THREADS
+
                $DEBUG_OFF
                iozone -I $IOZONE_OPTS $IOZONE_FILE.odir
                IOZVER=`iozone -v | awk '/Revision:/ { print $3 }' | tr -d '.'`
                sh llmountcleanup.sh
                sh llrmount.sh
-               if [ "$THREADS" -gt 1 -a "$IOZVER" -ge 3145 ]; then
+               if [ "$IOZ_THREADS" -gt 1 -a "$IOZVER" -ge 3145 ]; then
                        $DEBUG_OFF
                        THREAD=1
                        IOZONE_FILE="-F "
-                       SIZE=`expr $SIZE / $THREADS`
-                       while [ $THREAD -le $THREADS ]; do
+                       while [ $THREAD -le $IOZ_THREADS ]; do
                                IOZONE_FILE="$IOZONE_FILE $MNT/iozone.$THREAD"
                                THREAD=`expr $THREAD + 1`
                        done
-                       iozone -I $IOZONE_OPTS -t $THREADS $IOZONE_FILE
+                       iozone -I $IOZONE_OPTS -t $IOZ_THREADS $IOZONE_FILE
                        sh llmountcleanup.sh
                        sh llrmount.sh
                elif [ $IOZVER -lt 3145 ]; then
@@ -90,5 +103,9 @@ for NAME in $CONFIGS; do
        mount | grep $MNT && sh llmountcleanup.sh
 done
 
-[ "$SANITYN" != "no" ] && NAME=mount2 sh sanityN.sh
-
+if [ "$SANITYN" != "no" ]; then
+       export NAME=mount2
+       mount | grep $MNT || sh llmount.sh
+       sh sanityN.sh
+       mount | grep $MNT && sh llmountcleanup.sh
+fi
index 6dc6124..6d7b7f4 100644 (file)
@@ -6,12 +6,12 @@ LMC="save_cmd"
 LMC_REAL="../../lustre/utils/lmc -m $config"
 
 TCPBUF=1048576
-OST=ba-ost-1
+OST=${OST:-ba-ost-1}
 CLIENT=client
  
 UUIDLIST=${UUIDLIST:-/usr/local/admin/ba-ost/UUID.txt}
 
-h2ip () {
+h2tcp () {
     echo "${1}"
 }
 BATCH=/tmp/lmc-batch.$$
@@ -24,15 +24,15 @@ save_cmd() {
 # Client node
 ${LMC} --add net --node $CLIENT --tcpbuf $TCPBUF --nid '*' --nettype tcp
 
-OBD_UUID=`awk "/$OST / { print \\$3 }" $UUIDLIST`
-[ "$OBD_UUID" ] && OBD_UUID="--obduuid=$OBD_UUID" || echo "$OST: no UUID"
+OST_UUID=`awk "/$OST / { print \\$3 }" $UUIDLIST`
+[ "$OST_UUID" ] && OST_UUID="--ostuuid=$OST_UUID" || echo "$OST: no UUID"
 
 # server node
 ${LMC} --add net --node $OST --tcpbuf $TCPBUF --nid $OST --nettype tcp
-${LMC} --add ost --node $OST --obd obd1 --obdtype=obdecho -obduuid $OBD_UUID 
+${LMC} --add ost --node $OST --ost ost1 --obdtype=obdecho $OST_UUID 
 
 # osc on client
-${LMC} --add echo_client --node $CLIENT --obd obd1
+${LMC} --add echo_client --node $CLIENT --ost ost1
 
 $LMC_REAL --batch $BATCH
 rm -f $BATCH
index badc63b..ac05660 100644 (file)
@@ -21,7 +21,7 @@ MDS=`hostname`
  
 UUIDLIST=${UUIDLIST:-/usr/local/admin/ba-ost/UUID.txt}
 
-h2ip () {
+h2tcp () {
     echo "${1}"
 }
 BATCH=/tmp/lmc-batch.$$
@@ -35,19 +35,19 @@ save_cmd() {
 ${LMC} --add net --node $MDS --tcpbuf $TCPBUF --nid $MDS --nettype tcp
 ${LMC} --add mds --node $MDS --mds mds1 --dev /tmp/mds1 --size 50000
 
-OBD_UUID=`awk "/$OST / { print \\$3 }" $UUIDLIST`
-[ "$OBD_UUID" ] && OBD_UUID="--obduuid $OBD_UUID" || echo "$OST: no UUID"
+OST_UUID=`awk "/$OST / { print \\$3 }" $UUIDLIST`
+[ "$OST_UUID" ] && OST_UUID="--ostuuid $OST_UUID" || echo "$OST: no UUID"
 
 # server node
 ${LMC} --add net --node $OST --tcpbuf $TCPBUF --nid $OST --nettype tcp
-${LMC} --add ost --node $OST --obd obd1 $OBD_UUID --dev bluearc
+${LMC} --add ost --node $OST --ost ost1 $OST_UUID --dev bluearc
 
 # mount point on the MDS/client
-${LMC} --add mtpt --node $MDS --path /mnt/lustre --mds mds1 --lov obd1
+${LMC} --add mtpt --node $MDS --path /mnt/lustre --mds mds1 --lov ost1
 
 # other clients
 ${LMC} --add net --node client --tcpbuf $TCPBUF --nid '*' --nettype tcp
-${LMC} --add mtpt --node client --path /mnt/lustre --mds mds1 --lov obd1
+${LMC} --add mtpt --node client --path /mnt/lustre --mds mds1 --lov ost1
 
 $LMC_REAL --batch $BATCH
 rm -f $BATCH
diff --git a/lustre/tests/compile.sh b/lustre/tests/compile.sh
new file mode 100644 (file)
index 0000000..13c142e
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+# Compile stress test: copy the portals and lustre source trees from $SRC
+# into $DIR (default $MNT, i.e. /mnt/lustre) and rebuild them in an endless
+# loop.  "set -e" makes the first failing cp/make terminate the script.
+set -evx
+
+MNT=${MNT:-/mnt/lustre}
+DIR=${DIR:-$MNT}
+SRC=${SRC:-`dirname "$0"`/../..}
+while date; do
+       for i in portals lustre; do
+               TGT=$DIR/$i
+               # Paths are quoted so directories containing whitespace work.
+               [ -d "$TGT" ] || cp -av "$SRC/$i/" "$TGT"
+               make -C "$TGT" clean
+               make -C "$TGT" -j2
+               make -C "$TGT" clean
+       done
+done
index e495517..3299011 100644 (file)
@@ -4,57 +4,64 @@
 #include <fcntl.h>
 #include <stdlib.h>
 #include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
 #include <sys/mman.h>
 
 // not correctly in the headers yet!!
 #ifndef O_DIRECT
-#define O_DIRECT        040000 /* direct disk access hint */
+#define O_DIRECT         040000 /* direct disk access hint */
 #endif
 
-#define BLOCKSIZE 4096
-
 int main(int argc, char **argv)
 {
         int fd;
         char *buf;
-        int pages;
+        int blocks;
+        struct stat st;
         int rc;
 
         if (argc != 3) {
-                printf("Usage: %s file nr_pages\n", argv[0]);
+                printf("Usage: %s file nr_blocks\n", argv[0]);
                 return 1;
         }
 
-        pages = strtoul(argv[2], 0, 0);
-        printf("directio on %s for %d pages \n", argv[1], pages);
+        blocks = strtoul(argv[2], 0, 0);
+        fd = open(argv[1], O_DIRECT | O_RDWR | O_CREAT, 0644);
+        if (fd == -1) {
+                printf("Cannot open %s:  %s\n", argv[1], strerror(errno));
+                return 1;
+        }
 
-        buf = mmap(0, pages * BLOCKSIZE, PROT_READ|PROT_WRITE,
-                   MAP_PRIVATE|MAP_ANON, 0, 0);
-        if (!buf) {
-                printf("No memory %s\n", strerror(errno));
+        if (fstat(fd, &st) < 0) {
+                printf("Cannot stat %s:  %s\n", argv[1], strerror(errno));
                 return 1;
         }
 
-        fd = open(argv[1], O_DIRECT | O_RDWR | O_CREAT);
-        if (fd == -1) {
-                printf("Cannot open %s:  %s\n", argv[1], strerror(errno));
+        printf("directio on %s for %dx%lu blocks \n", argv[1], blocks,
+               st.st_blksize);
+
+        buf = mmap(0, blocks * st.st_blksize, PROT_READ|PROT_WRITE,
+                   MAP_PRIVATE|MAP_ANON, -1, 0); /* no backing fd for MAP_ANON */
+        if (buf == MAP_FAILED) { /* mmap() reports failure as MAP_FAILED, not NULL */
+                printf("No memory %s\n", strerror(errno));
                 return 1;
         }
 
-        rc = read(fd, buf, pages * BLOCKSIZE);
-        if (rc != pages * BLOCKSIZE) {
-                printf("Read error: %s, rc %d\n", strerror(errno), rc);
+        rc = write(fd, buf, blocks * st.st_blksize);
+        if (rc != blocks * st.st_blksize) {
+                printf("Write error %s (rc = %d)\n", strerror(errno), rc);
                 return 1;
         }
 
-        if ( lseek(fd, 0, SEEK_SET) != 0 ) {
+        if (lseek(fd, 0, SEEK_SET) != 0) {
                 printf("Cannot seek %s\n", strerror(errno));
                 return 1;
         }
 
-        rc = write(fd, buf, pages * BLOCKSIZE);
-        if (rc != pages * BLOCKSIZE) {
-                printf("Write error %s\n", strerror(errno));
+        rc = read(fd, buf, blocks * st.st_blksize);
+        if (rc != blocks * st.st_blksize) {
+                printf("Read error: %s (rc = %d)\n", strerror(errno), rc);
                 return 1;
         }
 
index f8a1fd5..38e79c8 100755 (executable)
@@ -1,11 +1,15 @@
 #!/bin/sh
 TMP=${TMP:-/tmp}
+LCMD=$TMP/lkcd-cmds-`hostname`
+echo "Storing LKCD module info in $LCMD"
 cat /tmp/ogdb-`hostname` | while read JUNK M JUNK; do
-       MOD="../$M"
+       DIR=`dirname $M`
+       DIR=`cd $PWD/../$DIR; pwd`
+       MOD="$DIR/`basename $M`"
        MAP=`echo $MOD | sed -e 's/\.o$/.map/'`
-       MODNAME=`basename $MOD | sed -e 's/\.o$//'`
+       MODNAME=`basename $M | sed -e 's/\.o$//'`
 
        nm $MOD > $MAP
-       echo namelist -a $PWD/$MOD 
-       echo symtab -a $PWD/$MAP $MODNAME
+       echo namelist -a $MOD  | tee -a $LCMD
+       echo symtab -a $MAP $MODNAME | tee -a $LCMD
 done
index bc2a5e7..c490856 100755 (executable)
@@ -15,8 +15,12 @@ if [ "$LUSTRE" ]; then
   lustre_opt="--lustre=$LUSTRE"
 fi
 
+if [ "$1" = "-v" ]; then
+  verbose="-v"
+fi
+
 [ -x $LCONF ] || chmod a+rx $LCONF
 
 sh $mkconfig $config || exit 1
 
-${LCONF} $portals_opt $lustre_opt --reformat --gdb $config || exit 2
+${LCONF} $portals_opt $lustre_opt --reformat --gdb $verbose $config  || exit 2
index 13af9d6..2132801 100755 (executable)
@@ -1,3 +1,4 @@
+
 #!/bin/bash
 
 config=${1:-local.xml}
@@ -29,10 +30,10 @@ ${LMC} --add node --node localhost || exit 10
 ${LMC} --add net --node  localhost --nid localhost --nettype tcp || exit 11
 
 # configure mds server
-${LMC} --add mds  --node localhost --mds mds1 --dev $MDSDEV --size $MDSSIZE || exit 20
+${LMC} --add mds  --node localhost --mds mds1 $FSTYPE --dev $MDSDEV --size $MDSSIZE || exit 20
 
 # configure ost
-${LMC} --add ost --node localhost --ost obd1 --dev $OSTDEV --size  $OSTSIZE || exit 30
+${LMC} --add ost --node localhost --ost obd1 $FSTYPE --dev $OSTDEV --size  $OSTSIZE || exit 30
 
 # create client config
 ${LMC} --add mtpt --node localhost --path /mnt/lustre --mds mds1 --ost obd1 || exit 40
index cd569b6..0401bf5 100755 (executable)
@@ -11,7 +11,7 @@ SERVER_CNT=62
 
 TCPBUF=1048576
  
-h2ip () {
+h2tcp () {
     echo "${1}"
 }
 BATCH=/tmp/lmc-batch.$$
index d0320fd..29ec215 100755 (executable)
@@ -6,8 +6,10 @@ CONFIG=mcr-mds-failover.xml
 LUSTRE_QUERY=/usr/local/cfs/lustre-failover/lustre-query
 GW_NODE=mcr21
 CLIENT_ELAN=`hostname | sed s/[^0-9]*//;`
-OST_BA=ba50
-OST_UUID=10400010-5dec-11c2-0b5f-00301700041a
+OST=${OST:-ba50}
+UUIDLIST=${UUIDLIST:-/usr/local/admin/ba-ost/UUID.txt}
+OST_UUID=`awk "/$OST / { print \\$3 }" $UUIDLIST`
+[ "$OST_UUID" ] && OST_UUID="--ostuuid=$OST_UUID" || echo "$OST: no UUID"
 MDS_DEVICE=/dev/sda3
 MDS_SIZE=500000
 TCPBUF=1048576
@@ -21,14 +23,14 @@ h2elan () {
     echo $1 | sed 's/[^0-9]*//g'
 }
 
-h2ip () {
+h2tcp () {
     echo "${1}"
 }
 
 
 # create client node
 $LMC -o $CONFIG --add net --node client --nid '*' --nettype elan
-$LMC -m $CONFIG --add net --router --node mcr21 --tcpbuf $TCPBUF --nid `h2ip $GW_NODE` --nettype tcp
+$LMC -m $CONFIG --add net --router --node mcr21 --tcpbuf $TCPBUF --nid `h2tcp $GW_NODE` --nettype tcp
 $LMC -m $CONFIG --add net --router --node mcr21 --nid `h2elan $GW_NODE` --nettype elan
 $LMC -m $CONFIG --add route --node $GW_NODE --nettype elan --gw `h2elan $GW_NODE` --lo $CLIENT_ELAN 
 
@@ -40,9 +42,9 @@ for mds in $MDSNODES; do
 done
 
 # create OST node entry
-$LMC -m $CONFIG --add net --node $OST_BA --tcpbuf $TCPBUF --nid $OST_BA --nettype tcp
-$LMC -m $CONFIG --add ost --node $OST_BA --obd obd_$OST_BA --obduuid $OST_UUID --dev bluearc
-$LMC -m $CONFIG --add route --node $GW_NODE --nettype tcp --gw `h2ip $GW_NODE` --lo $OST_BA
+$LMC -m $CONFIG --add net --node $OST --tcpbuf $TCPBUF --nid $OST --nettype tcp
+$LMC -m $CONFIG --add ost --node $OST --ost ost_$OST $OST_UUID --dev bluearc
+$LMC -m $CONFIG --add route --node $GW_NODE --nettype tcp --gw `h2tcp $GW_NODE` --lo $OST
 
 # mount
-$LMC -m $CONFIG --add mtpt --node client --path /mnt/lustre --mds mds_$ACTIVEMDS --lov obd_$OST_BA
+$LMC -m $CONFIG --add mtpt --node client --path /mnt/lustre --mds mds_$ACTIVEMDS --lov ost_$OST
index 096520e..3b1d961 100755 (executable)
@@ -33,7 +33,7 @@ h2elan () {
     echo $1 | sed 's/[^0-9]*//g'
 }
 
-h2ip () {
+h2tcp () {
     echo "${1}"
 }
 
@@ -65,7 +65,7 @@ while (( $gw < $GW_CNT + GW_START ));
 do 
    gwnode=$BASE`gw2node $gw`
    echo "Router: $gwnode"
-   ${LMC} --add net --router --node $gwnode --tcpbuf $TCPBUF --nid `h2ip $gwnode`  --nettype tcp || exit 1
+   ${LMC} --add net --router --node $gwnode --tcpbuf $TCPBUF --nid `h2tcp $gwnode`  --nettype tcp || exit 1
    ${LMC} --add net --node $gwnode --nid `h2elan $gwnode` --nettype elan || exit 1
    ${LMC} --add route --node $gwnode --nettype elan --gw `h2elan $gwnode` --lo `h2elan $CLIENT_LO` --hi `h2elan $CLIENT_HI` || exit 2
 
@@ -74,14 +74,14 @@ do
    do
       OST=${OSTBASE}$server
       echo "server: $OST"
-      OBD_UUID=`awk "/$OST / { print \\$3 }" $UUIDLIST`
-      [ "$OBD_UUID" ] && OBD_UUID="--obduuid $OBD_UUID" || echo "$OST: no UUID"
+      OST_UUID=`awk "/$OST / { print \\$3 }" $UUIDLIST`
+      [ "$OST_UUID" ] && OST_UUID="--ostuuid $OST_UUID" || echo "$OST: no UUID"
       # server node
       ${LMC} --add net --node $OST --tcpbuf $TCPBUF --nid $OST --nettype tcp || exit 1
       # the device on the server
-      ${LMC} --add ost --lov lov1 --node $OST $OBD_UUID --dev bluearc || exit 3
+      ${LMC} --add ost --lov lov1 --node $OST $OST_UUID --dev bluearc || exit 3
       # route to server
-      ${LMC} --add route --node $gwnode --nettype tcp --gw `h2ip $gwnode` --lo $OST || exit 2
+      ${LMC} --add route --node $gwnode --nettype tcp --gw `h2tcp $gwnode` --lo $OST || exit 2
       let server=$server+1 
       let i=$i+1
    done
index f4e30eb..4777337 100755 (executable)
@@ -19,7 +19,7 @@ h2elan () {
     echo $1 | sed 's/[^0-9]*//g'
 }
 
-h2ip () {
+h2tcp () {
     echo "${1}"
 }
 
@@ -28,7 +28,7 @@ h2ip () {
 # Client node
 ${LMC} --add net --node client --nid '*' --nettype elan || exit 1
 # Router node
-${LMC} --add net --router --node $ROUTER --tcpbuf $TCPBUF --nid `h2ip $ROUTER` --nettype tcp || exit 1
+${LMC} --add net --router --node $ROUTER --tcpbuf $TCPBUF --nid `h2tcp $ROUTER` --nettype tcp || exit 1
 ${LMC} --add net --node $ROUTER --nid `h2elan $ROUTER` --nettype elan|| exit 1
 ${LMC} -m $config --add route --node $ROUTER --nettype elan --gw `h2elan $ROUTER` --lo `h2elan $CLIENT_LO` --hi `h2elan $CLIENT_HI` || exit 2
 
@@ -37,7 +37,7 @@ for s in $SERVERS
    # server node
    ${LMC} --add net --node $s --tcpbuf $TCPBUF --nid $s --nettype tcp || exit 1
    # route to server
-   ${LMC} --add route --node $ROUTER --nettype tcp --gw `h2ip $ROUTER` --lo $s || exit 2
+   ${LMC} --add route --node $ROUTER --nettype tcp --gw `h2tcp $ROUTER` --lo $s || exit 2
    # the device on the server
    ${LMC} --add ost --node $s --obd obd_$s --obdtype=obdecho || exit 3
    # attach to the device on the client (this would normally be a mount)
index 7350343..cce8878 100755 (executable)
@@ -21,7 +21,7 @@ h2elan () {
     echo $1 | sed 's/[^0-9]*//g'
 }
 
-h2ip () {
+h2tcp () {
     echo "${1}"
 }
 
@@ -30,7 +30,7 @@ h2ip () {
 # Client node
 ${LMC} --add net --node client --nid '*' --nettype elan || exit 1
 # Router node
-${LMC} --add net --router --node $ROUTER --tcpbuf $TCPBUF --nid `h2ip $ROUTER`  --nettype tcp || exit 1
+${LMC} --add net --router --node $ROUTER --tcpbuf $TCPBUF --nid `h2tcp $ROUTER`  --nettype tcp || exit 1
 ${LMC} --add net --node $ROUTER --nid `h2elan $ROUTER` --nettype elan|| exit 1
 ${LMC} --add route --node $ROUTER --gw `h2elan $ROUTER` --lo `h2elan $CLIENT_LO` --hi `h2elan $CLIENT_HI` --nettype elan || exit 2
 
@@ -45,7 +45,7 @@ for s in $SERVERS
    # server node
    ${LMC} --add net --node $s --tcpbuf $TCPBUF --nid $s --nettype tcp || exit 1
    # route to server
-   ${LMC} --add route --node $ROUTER --nettype tcp --gw `h2ip $ROUTER` --lo $s || exit 2
+   ${LMC} --add route --node $ROUTER --nettype tcp --gw `h2tcp $ROUTER` --lo $s || exit 2
    # the device on the server
    #${LMC} --format --lov lov1 --node $s --ost bluearc || exit 3
    ${LMC} --add ost  --lov lov1 --node $s --dev bluearc --format || exit 3
diff --git a/lustre/tests/mlink.c b/lustre/tests/mlink.c
new file mode 100755 (executable)
index 0000000..5688b9f
--- /dev/null
@@ -0,0 +1,25 @@
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+/* Hard-link argv[1] to argv[2] with link(2); exit 0 on success, else errno. */
+int main(int argc, char **argv)
+{
+        if (argc < 3) {
+                printf("Usage: %s file link\n", argv[0]);
+                return 1;
+        }
+
+        if (link(argv[1], argv[2]) != 0) {
+                /* Save errno now: printf() may overwrite it before we return. */
+                int err = errno;
+                printf("link(%s, %s) error: %s\n", argv[1], argv[2],
+                       strerror(err));
+                return err;
+        }
+        return 0;
+}
index f1c00b4..07de3ed 100644 (file)
@@ -2,7 +2,9 @@
 
 config=${1:-mount2.xml}
 
-LMC="${LMC:-../utils/lmc} -m $config"
+SRCDIR=`dirname $0`
+PATH=$SRCDIR:$SRCDIR/../utils:$PATH
+LMC="${LMC:-lmc} -m $config"
 TMP=${TMP:-/tmp}
 
 MDSDEV=${MDSDEV:-$TMP/mds1}
@@ -11,17 +13,6 @@ MDSSIZE=${MDSSIZE:-50000}
 OSTDEV=${OSTDEV:-$TMP/ost1}
 OSTSIZE=${OSTSIZE:-200000}
 
-kver=`uname -r | cut -d "." -f 1,2`
-
-case $kver in
-  2.4) FSTYPE="--fstype=extN"  ;;
-  2.5) FSTYPE="--fstype=ext3"  ;;
-  *) echo "Kernel version $kver not supported"
-     exit 1
-     ;;
-esac
-
-
 rm -f $config
 
 # create nodes
@@ -32,8 +23,8 @@ ${LMC} --add net --node  localhost --nid localhost --nettype tcp || exit 11
 ${LMC} --add mds  --node localhost --mds mds1 --dev $MDSDEV --size $MDSSIZE || exit 20
 
 # configure ost
-${LMC} --add ost --node localhost --obd obd1 --dev $OSTDEV --size  $OSTSIZE || exit 30
+${LMC} --add ost --node localhost --ost ost1 --dev $OSTDEV --size  $OSTSIZE || exit 30
 
 # create client config
-${LMC} --add mtpt --node localhost --path /mnt/lustre1 --mds mds1 --obd obd1 || exit 40
-${LMC} --add mtpt --node localhost --path /mnt/lustre2 --mds mds1 --obd obd1 || exit 40
+${LMC} --add mtpt --node localhost --path /mnt/lustre1 --mds mds1 --ost ost1 || exit 40
+${LMC} --add mtpt --node localhost --path /mnt/lustre2 --mds mds1 --ost ost1 || exit 40
index 2f41884..de4815c 100644 (file)
@@ -2,6 +2,8 @@
 #include <unistd.h>
 #include <stdlib.h>
 #include <stdio.h>
+#include <liblustre.h>
+#include <linux/obd.h>
 #include <linux/lustre_lib.h>
 #include <linux/lustre_lite.h>
 #include <linux/obd_lov.h>
diff --git a/lustre/tests/recovery-cleanup.sh b/lustre/tests/recovery-cleanup.sh
new file mode 100755 (executable)
index 0000000..481ebaa
--- /dev/null
@@ -0,0 +1,134 @@
+#!/bin/sh
+# Recovery test driver: for a series of client operations, arm a fail_loc on
+# the MDS, start the operation in the background, wait past the timeout, then
+# force-unmount and remount the client.
+
+# -e: abort on the first failing command; -x: trace every command.
+set -ex
+
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+PATH=$PATH:$LUSTRE/utils:$LUSTRE/tests
+
+# NOTE(review): presumably supplies the h2<nettype> helpers that make_config
+# calls -- confirm against common.sh.
+. $LUSTRE/../ltest/functional/llite/common/common.sh
+
+# pdsh command prefix; the do_* helpers append the target node and command.
+PDSH='pdsh -S -w'
+
+# XXX I wish all this stuff was in some default-config.sh somewhere
+# Node names, network type and mount point may be overridden from the
+# environment (note: the client node is taken from CLIENTNODE, not CLIENT).
+MDSNODE=${MDSNODE:-mdev6}
+OSTNODE=${OSTNODE:-mdev7}
+CLIENT=${CLIENTNODE:-mdev8}
+NETWORKTYPE=${NETWORKTYPE:-tcp}
+MOUNTPT=${MOUNTPT:-/mnt/lustre}
+# Fixed settings: generated config file name, backing devices and their sizes.
+CONFIG=recovery-small.xml
+MDSDEV=/tmp/mds
+OSTDEV=/tmp/ost
+MDSSIZE=100000
+OSTSIZE=100000
+
+# Run a shell command on $MDSNODE through pdsh.  The remote shell first
+# appends the Lustre utils/tests directories to PATH and changes to $PWD.
+# Arguments: the command to run (all arguments are joined with spaces).
+do_mds() {
+    # "$*" keeps the remote command a single word; "$@" embedded in a larger
+    # quoted string would split it into several pdsh arguments.
+    $PDSH $MDSNODE "PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests; cd $PWD; $*"
+}
+
+# Run a shell command on $CLIENT through pdsh.  The remote shell first
+# appends the Lustre utils/tests directories to PATH and changes to $PWD.
+# Arguments: the command to run (all arguments are joined with spaces).
+do_client() {
+    # "$*" keeps the remote command a single word; "$@" embedded in a larger
+    # quoted string would split it into several pdsh arguments.
+    $PDSH $CLIENT "PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests; cd $PWD; $*"
+}
+
+# Run a shell command on $OSTNODE through pdsh.  The remote shell first
+# appends the Lustre utils/tests directories to PATH and changes to $PWD.
+# Arguments: the command to run (all arguments are joined with spaces).
+do_ost() {
+    # "$*" keeps the remote command a single word; "$@" embedded in a larger
+    # quoted string would split it into several pdsh arguments.
+    $PDSH $OSTNODE "PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests; cd $PWD; $*"
+}
+
+# Run client command $1 while a fail_loc is armed on the MDS.
+# NOTE(review): 0x121 is presumably the fault-injection code that makes the
+# MDS drop incoming requests -- confirm against the fail_loc definitions.
+drop_request() {
+    do_mds "echo 0x121 > /proc/sys/lustre/fail_loc"
+    # Start $1 in the background on the client, give it TIMEOUT (default 5)
+    # plus 2 seconds, then kill it; \$! is expanded by the remote shell.
+    do_client "$1 & sleep ${TIMEOUT:-5}; sleep 2; kill \$!"
+    # Disarm the fail_loc again.
+    do_mds "echo 0 > /proc/sys/lustre/fail_loc"
+}
+
+# Regenerate $CONFIG from scratch with lmc: one net entry per node, one MDS,
+# one OST and one client mount point.  Each failing lmc step exits the script
+# with its own status code (4-7).
+make_config() {
+    rm -f $CONFIG
+    # h2$NETWORKTYPE expands to a helper name such as h2tcp; it is not
+    # defined in this script (presumably sourced from common.sh -- confirm).
+    for NODE in $CLIENT $MDSNODE $OSTNODE; do
+       lmc -m $CONFIG --add net --node $NODE --nid `h2$NETWORKTYPE $NODE` \
+           --nettype $NETWORKTYPE || exit 4
+    done
+    lmc -m $CONFIG --add mds --node $MDSNODE --mds mds1 --dev $MDSDEV \
+        --size $MDSSIZE || exit 5
+    lmc -m $CONFIG --add ost --node $OSTNODE --ost ost1 --dev $OSTDEV \
+        --size $OSTSIZE || exit 6
+    lmc -m $CONFIG --add mtpt --node $CLIENT --path $MOUNTPT --mds mds1 \
+        --ost ost1 || exit 7
+}
+
+# Run lconf with $CONFIG on the MDS node; any arguments are passed to lconf
+# as extra options.
+start_mds() {
+    # "$*" rather than "$@": the options must stay inside one command string.
+    do_mds "lconf $* $CONFIG"
+}
+
+# Run "lconf --cleanup" with $CONFIG on the MDS node; any arguments are
+# passed to lconf as extra options.
+shutdown_mds() {
+    # "$*" rather than "$@": the options must stay inside one command string.
+    do_mds "lconf $* --cleanup $CONFIG"
+}
+
+# Run lconf with $CONFIG on the OST node; any arguments are passed to lconf
+# as extra options.
+start_ost() {
+    # "$*" rather than "$@": the options must stay inside one command string.
+    do_ost "lconf $* $CONFIG"
+}
+
+# Run "lconf --cleanup" with $CONFIG on the OST node; any arguments are
+# passed to lconf as extra options.
+shutdown_ost() {
+    # "$*" rather than "$@": the options must stay inside one command string.
+    do_ost "lconf $* --cleanup $CONFIG"
+}
+
+# Run lconf with $CONFIG on the client node; any arguments are passed to
+# lconf as extra options.
+mount_client() {
+    # "$*" rather than "$@": the options must stay inside one command string.
+    do_client "lconf $* $CONFIG"
+}
+
+# Run "lconf --cleanup" with $CONFIG on the client node; any arguments are
+# passed to lconf as extra options.
+unmount_client() {
+    # "$*" rather than "$@": the options must stay inside one command string.
+    do_client "lconf $* --cleanup $CONFIG"
+}
+
+# Generate the config, then run lconf on the MDS, the OST and finally the
+# client.
+setup() {
+    make_config
+    # REFORMAT, when set and non-empty, replaces the default --reformat option.
+    start_mds ${REFORMAT:---reformat}
+    start_ost ${REFORMAT:---reformat}
+    # The client uses TIMEOUT (default 5) and /bin/true as its recovery upcall.
+    mount_client --timeout=${TIMEOUT:-5} --recovery_upcall=/bin/true
+}
+
+# Tear everything down: reset fail_loc on the MDS, then run the lconf
+# cleanup on the client, the MDS and the OST.  Arguments are forwarded to
+# each cleanup step; individual failures are ignored so the remaining steps
+# still run under "set -e".
+cleanup() {
+    do_mds "echo 0 > /proc/sys/lustre/fail_loc"
+    # "$@" is quoted so each forwarded option stays one word.
+    unmount_client "$@" || true
+    shutdown_mds "$@" || true
+    shutdown_ost "$@" || true
+}
+
+# Sleep for TIMEOUT (default 5) plus 2 seconds.
+wait_for_timeout() {
+    # wait to make sure we enter recovery
+    # it'd be better if the upcall notified us somehow, I think
+    sleep $(( ${TIMEOUT:-5} + 2 ))
+}
+
+# Interrupt the most recent background job, then force-unmount and remount
+# the client.  In the main sequence below that job is always the preceding
+# backgrounded drop_request.
+try_to_cleanup() {
+    # $! is the PID of the most recently started background job.
+    kill -INT $!
+    unmount_client --force
+    mount_client --timeout=${TIMEOUT:-5} --recovery_upcall=/bin/true
+}
+
+# When ONLY is set, evaluate it as a shell command (e.g. a single function
+# from this file) and exit with its status instead of running the full test.
+if [ -n "$ONLY" ]; then
+    eval "$ONLY"
+    exit $?
+fi
+
+# Main sequence.  Each step starts drop_request in the background, sleeps
+# past the timeout, then interrupts it and force-remounts the client.
+setup
+drop_request "mcreate /mnt/lustre/1" & wait_for_timeout
+try_to_cleanup
+
+drop_request "tchmod 111 /mnt/lustre/2" & wait_for_timeout
+try_to_cleanup
+
+drop_request "statone /mnt/lustre/2" & wait_for_timeout
+try_to_cleanup
+
+# Create a file with fail_loc disarmed so the following steps have a target.
+do_client "cp /etc/resolv.conf /mnt/lustre/resolv.conf"
+drop_request "cat /mnt/lustre/resolv.conf > /dev/null" & wait_for_timeout
+try_to_cleanup
+
+drop_request "mv /mnt/lustre/resolv.conf /mnt/lustre/renamed" & wait_for_timeout
+try_to_cleanup
+
+# NOTE(review): nothing in this script creates /mnt/lustre/renamed-again;
+# confirm whether the link source was meant to be /mnt/lustre/renamed.
+drop_request "mlink /mnt/lustre/renamed-again /mnt/lustre/link1" & wait_for_timeout
+try_to_cleanup
+
+drop_request "munlink /mnt/lustre/link1" & wait_for_timeout
+try_to_cleanup
+
+cleanup
index 26bb81f..7425e57 100755 (executable)
@@ -10,9 +10,9 @@ PATH=$PATH:$LUSTRE/utils:$LUSTRE/tests
 PDSH='pdsh -S -w'
 
 # XXX I wish all this stuff was in some default-config.sh somewhere
-MDSNODE=${MDSNODE:-dev2}
-OSTNODE=${OSTNODE:-dev3}
-CLIENT=${CLIENTNODE:-dev4}
+MDSNODE=${MDSNODE:-mdev6}
+OSTNODE=${OSTNODE:-mdev7}
+CLIENT=${CLIENTNODE:-mdev8}
 NETWORKTYPE=${NETWORKTYPE:-tcp}
 MOUNTPT=${MOUNTPT:-/mnt/lustre}
 CONFIG=recovery-small.xml
@@ -85,31 +85,28 @@ unmount_client() {
 
 setup() {
     make_config
-    start_mds --reformat
-    start_ost --reformat
+    start_mds ${REFORMAT:---reformat}
+    start_ost ${REFORMAT:---reformat}
     # XXX we should write our own upcall, when we move this somewhere better.
-    mount_client --timeout=10 \
+    mount_client --timeout=${TIMEOUT:-5} \
         --recovery_upcall=$PWD/../../ltest/functional/llite/09/client-upcall.sh
 }
 
 cleanup() {
-    unmount_client || true
-    shutdown_mds || true
-    shutdown_ost || true
+    do_mds "echo 0 > /proc/sys/lustre/fail_loc"
+    unmount_client $@ || true
+    shutdown_mds $@ || true
+    shutdown_ost $@ || true
 }
 
 replay() {
-    if [ $# -gt 1 ]; then
-        do_client "$1"
-        shift
-    fi
     do_mds "sync"
     do_mds 'echo -e "device \$mds1\\nprobe\\nnotransno\\nreadonly" | lctl'
     do_client "$1" &
     shutdown_mds -f
     start_mds
     wait
-    do_client "ls $MOUNPT" # trigger failover, if we haven't already
+    do_client "df -h $MOUNTPT" # trigger failover, if we haven't already
 }
 
 if [ ! -z "$ONLY" ]; then
@@ -120,5 +117,28 @@ fi
 setup
 drop_request "mcreate /mnt/lustre/1"
 drop_reply "mcreate /mnt/lustre/2"
-replay "mcreate /mnt/lustre/3"
+# replay "mcreate /mnt/lustre/3"
+
+drop_request "tchmod 111 /mnt/lustre/2"
+drop_reply "tchmod 666 /mnt/lustre/2"
+# replay "tchmod 444 /mnt/lustre/2"
+
+drop_request "statone /mnt/lustre/2"
+drop_reply "statone /mnt/lustre/2"
+# replay "statone /mnt/lustre/2"
+
+do_client "cp /etc/resolv.conf /mnt/lustre/resolv.conf"
+drop_request "cat /mnt/lustre/resolv.conf > /dev/null"
+drop_reply "cat /mnt/lustre/resolv.conf > /dev/null"
+
+drop_request "mv /mnt/lustre/resolv.conf /mnt/lustre/renamed"
+drop_reply "mv /mnt/lustre/renamed /mnt/lustre/renamed-again"
+
+drop_request "mlink /mnt/lustre/renamed-again /mnt/lustre/link1"
+drop_reply "mlink /mnt/lustre/renamed-again /mnt/lustre/link2"
+
+drop_request "munlink /mnt/lustre/link1"
+drop_reply "munlink /mnt/lustre/link2"
+
+
 cleanup
index 4fc00b2..c2eec04 100755 (executable)
@@ -1,17 +1,17 @@
 #!/bin/sh
 [ -z "$SIZE" ] && SIZE=5g
-[ -z "$LOOPS" ] && LOOPS=9999
+[ -z "$COUNT" ] && COUNT=100
 [ -z "$VERIFY" ] && VERIFY="-+d"
 [ -z "$ODIR" ] && ODIR="-I"
 [ -z "$REC" ] && REC=64
 [ -z "$FILE" ] && FILE=/mnt/lustre/iozone.$$
 [ $1 ] && SIZE=$1
-COUNT=0
+LOOP=0
 rm -f endiozone
 echo 0 > /proc/sys/portals/debug
 while date; do
-       echo "Test #$COUNT"
+       LOOP=`expr $LOOP + 1`
+       echo "Test #$LOOP"
        iozone $VERIFY $ODIR -r $REC -i 0 -i 1 -f $FILE -s $SIZE 2>&1 || exit $?
-       COUNT=`expr $COUNT + 1`
-       [ -f endiozone -o $COUNT -ge $LOOPS ] && rm -f endiozone && exit 0
+       [ -f endiozone -o $LOOP -ge $COUNT ] && rm -f endiozone && exit 0
 done | tee /tmp/iozone.log
index 9c8f990..e59f5f4 100755 (executable)
@@ -11,7 +11,12 @@ fail() {
        exit $RC
 }
 
-export PATH=/sbin:/usr/sbin:$SRCDIR:$PATH
+log() {
+       echo "$*"
+       lctl mark "$*"
+}
+
+export PATH=/sbin:/usr/sbin:$SRCDIR:$SRCDIR/../utils:$PATH
 
 ERROR=
 SRC=/etc
@@ -20,6 +25,7 @@ SRC=/etc
 [ "$LCONF" ] || LCONF=$SRCDIR/../utils/lconf
 
 [ "$MCREATE" ] || MCREATE=$SRCDIR/../tests/mcreate
+
 [ "$MKDIRMANY" ] || MKDIRMANY=$SRCDIR/../tests/mkdirmany
 
 while [ "$1" ]; do
@@ -42,41 +48,41 @@ USED=`df | awk "/$OSCTMP/ { print \\$3 }" | tail -1`
 USED=`expr $USED + 16` # Some space for the status file
 
 # let's start slowly here...
-echo "touching $OSCMT"
+log "touching $OSCMT"
 touch $OSCMT || fail "can't touch $OSCMT" 2
 HOSTS=$OSCMT/hosts.$$
 
 # this will cause the following cp to trigger bug #620096
-echo "create an empty file $HOSTS"
-$MCREATE $HOSTS
+log "create an empty file $HOSTS"
+mcreate $HOSTS
 
-echo "copying /etc/hosts to $HOSTS"
+log "copying /etc/hosts to $HOSTS"
 cp /etc/hosts $HOSTS || fail "can't cp /etc/hosts to $HOSTS" 3
-echo "comparing /etc/hosts and $HOSTS"
+log "comparing /etc/hosts and $HOSTS"
 diff -u /etc/hosts $HOSTS || fail "$HOSTS different" 4
-echo "renaming $HOSTS to $HOSTS.ren"
+log "renaming $HOSTS to $HOSTS.ren"
 mv $HOSTS $HOSTS.ren || fail "can't rename $HOSTS to $HOSTS.ren" 5
-echo "copying /etc/hosts to $HOSTS again"
+log "copying /etc/hosts to $HOSTS again"
 cp /etc/hosts $HOSTS || fail "can't cp /etc/hosts to $HOSTS again" 6
-echo "truncating $HOSTS"
+log "truncating $HOSTS"
 > $HOSTS || fail "can't truncate $HOSTS" 8
-echo "removing $HOSTS"
+log "removing $HOSTS"
 rm $HOSTS || fail "can't remove $HOSTS" 9
 
 DST=$OSCMT/runtest.$$
 # let's start slowly here...
-echo "creating $DST"
+log "creating $DST"
 mkdir $DST || fail "can't mkdir $DST" 10
 
 # ok, that hopefully worked, so let's do a little more, with files that
 # haven't changed in the last day (hopefully they don't change during test)
 FILES=`find $SRC -type f -mtime +1 -ctime +1 | head -$COUNT`
-echo "copying files from $SRC to $DST$SRC"
+log "copying files from $SRC to $DST$SRC"
 tar cf - $FILES | tar xvf - -C $DST || fail "copying $SRC" 11
 
-echo "comparing newly copied files"
+log "comparing newly copied files"
 for f in $FILES; do
-       [ $V ] && echo "verifying $DST/$f"
+       [ $V ] && log "verifying $DST/$f"
        diff -q $f $DST/$f || ERROR=11
 done
 
@@ -85,9 +91,9 @@ done
 sh llmountcleanup.sh || exit 19
 sh llrmount.sh || exit 20
 
-echo "comparing previously copied files"
+log "comparing previously copied files"
 for f in $FILES; do
-       [ $V ] && echo "verifying $DST/$f"
+       [ $V ] && log "verifying $DST/$f"
        diff -q $f $DST/$f || ERROR=22
 done
 
@@ -96,21 +102,23 @@ done
 sh llmountcleanup.sh || exit 19
 sh llrmount.sh || exit 20
 
-echo "renaming $HOSTS.ren to $HOSTS"
+log "renaming $HOSTS.ren to $HOSTS"
 mv $HOSTS.ren $HOSTS || fail "can't rename $HOSTS.ren to $HOSTS" 32
-echo "truncating $HOSTS"
+log "truncating $HOSTS"
 > $HOSTS || fail "can't truncate $HOSTS" 34
-echo "removing $HOSTS"
+log "removing $HOSTS"
 rm $HOSTS || fail "can't remove $HOSTS again" 36
-echo "removing $DST"
+log "removing $DST"
 rm -r $V $DST || fail "can't remove $DST" 37
 
 # mkdirmany test (bug 589)
-echo "running mkdirmany $OSCMT/base$$ 100"
+log "running mkdirmany $OSCMT/base$$ 100"
 $MKDIRMANY $OSCMT/base$$ 100 || fail "mkdirmany failed"
-echo "removing mkdirmany directories"
+log "removing mkdirmany directories"
 rmdir $OSCMT/base$$* || fail "mkdirmany cleanup failed"
 
+log "done"
+
 NOWUSED=`df | awk "/$OSCTMP/ { print \\$3 }" | tail -1`
 if [ $NOWUSED -gt $USED ]; then
        echo "Space not all freed: now ${NOWUSED}kB, was ${USED}kB." 1>&2
index 111606a..c3e201f 100644 (file)
@@ -27,7 +27,12 @@ start() {
 }
 START=${START:-start}
 
-error () { 
+log() {
+       echo "$*"
+       lctl mark "$*"
+}
+
+error() { 
     echo FAIL
     exit 1
 }
@@ -38,7 +43,7 @@ pass() {
 
 mount | grep $MOUNT || sh llmount.sh
 
-echo '== touch .../f ; rm .../f ======================== test 0'
+log '== touch .../f ; rm .../f ======================== test 0'
 touch $DIR/f
 $CHECKSTAT -t file $DIR/f || error 
 rm $DIR/f
@@ -47,7 +52,7 @@ pass
 $CLEAN
 $START
 
-echo '== mkdir .../d1; mkdir .../d1/d2 ================= test 1'
+log '== mkdir .../d1; mkdir .../d1/d2 ================= test 1'
 mkdir $DIR/d1
 mkdir $DIR/d1/d2
 $CHECKSTAT -t dir $DIR/d1/d2 || error
@@ -55,7 +60,7 @@ pass
 $CLEAN
 $START
 
-echo '== rmdir .../d1/d2; rmdir .../d1 ================= test 1b'
+log '== rmdir .../d1/d2; rmdir .../d1 ================= test 1b'
 rmdir $DIR/d1/d2
 rmdir $DIR/d1
 $CHECKSTAT -a $DIR/d1 || error
@@ -63,7 +68,7 @@ pass
 $CLEAN
 $START
 
-echo '== mkdir .../d2; touch .../d2/f ================== test 2'
+log '== mkdir .../d2; touch .../d2/f ================== test 2'
 mkdir $DIR/d2
 touch $DIR/d2/f
 $CHECKSTAT -t file $DIR/d2/f || error
@@ -71,46 +76,46 @@ pass
 $CLEAN
 $START
 
-echo '== rm -r .../d2; touch .../d2/f ================== test 2b'
+log '== rm -r .../d2; touch .../d2/f ================== test 2b'
 rm -r $DIR/d2
 $CHECKSTAT -a $DIR/d2 || error
 pass
 $CLEAN
 $START
 
-echo '== mkdir .../d3 ================================== test 3'
+log '== mkdir .../d3 ================================== test 3'
 mkdir $DIR/d3
 $CHECKSTAT -t dir $DIR/d3 || error
 pass
 $CLEAN
 $START
-echo '== touch .../d3/f ================================ test 3b'
+log '== touch .../d3/f ================================ test 3b'
 touch $DIR/d3/f
 $CHECKSTAT -t file $DIR/d3/f || error
 pass
 $CLEAN
 $START
-echo '== rm -r .../d3 ================================== test 3c'
+log '== rm -r .../d3 ================================== test 3c'
 rm -r $DIR/d3
 $CHECKSTAT -a $DIR/d3 || error
 pass
 $CLEAN
 $START
 
-echo '== mkdir .../d4 ================================== test 4'
+log '== mkdir .../d4 ================================== test 4'
 mkdir $DIR/d4
 $CHECKSTAT -t dir $DIR/d4 || error
 pass
 $CLEAN
 $START
-echo '== mkdir .../d4/d2 =============================== test 4b'
+log '== mkdir .../d4/d2 =============================== test 4b'
 mkdir $DIR/d4/d2
 $CHECKSTAT -t dir $DIR/d4/d2 || error
 pass
 $CLEAN
 $START
 
-echo '== mkdir .../d5; mkdir .../d5/d2; chmod .../d5/d2 = test 5'
+log '== mkdir .../d5; mkdir .../d5/d2; chmod .../d5/d2 = test 5'
 mkdir $DIR/d5
 mkdir $DIR/d5/d2
 chmod 0707 $DIR/d5/d2
@@ -119,7 +124,7 @@ pass
 $CLEAN
 $START
 
-echo '== touch .../f6; chmod .../f6 ==================== test 6'
+log '== touch .../f6; chmod .../f6 ==================== test 6'
 touch $DIR/f6
 chmod 0666 $DIR/f6
 $CHECKSTAT -t file -p 0666 $DIR/f6 || error
@@ -127,7 +132,7 @@ pass
 $CLEAN
 $START
 
-echo '== mkdir .../d7; mcreate .../d7/f; chmod .../d7/f = test 7'
+log '== mkdir .../d7; mcreate .../d7/f; chmod .../d7/f = test 7'
 mkdir $DIR/d7
 $MCREATE $DIR/d7/f
 chmod 0666 $DIR/d7/f
@@ -136,16 +141,16 @@ pass
 $CLEAN
 $START
 
-echo '== mkdir .../d7; mcreate .../d7/f2; chmod .../d7/f2 = test 7b'
+log '== mkdir .../d7; mcreate .../d7/f2; echo foo > .../d7/f2 = test 7b'
 $MCREATE $DIR/d7/f2
-echo -n foo > $DIR/d7/f2
+log -n foo > $DIR/d7/f2
 [ "`cat $DIR/d7/f2`" = "foo" ] || error
 $CHECKSTAT -t file -s 3 $DIR/d7/f2 || error
 pass
 $CLEAN
 $START
 
-echo '== mkdir .../d8; touch .../d8/f; chmod .../d8/f == test 8'
+log '== mkdir .../d8; touch .../d8/f; chmod .../d8/f == test 8'
 mkdir $DIR/d8
 touch $DIR/d8/f
 chmod 0666 $DIR/d8/f
@@ -155,7 +160,7 @@ $CLEAN
 $START
 
 
-echo '== mkdir .../d9 .../d9/d2 .../d9/d2/d3 =========== test 9'
+log '== mkdir .../d9 .../d9/d2 .../d9/d2/d3 =========== test 9'
 mkdir $DIR/d9
 mkdir $DIR/d9/d2
 mkdir $DIR/d9/d2/d3
@@ -165,7 +170,7 @@ $CLEAN
 $START
 
 
-echo '== mkdir .../d10 .../d10/d2; touch .../d10/d2/f = test 10'
+log '== mkdir .../d10 .../d10/d2; touch .../d10/d2/f = test 10'
 mkdir $DIR/d10
 mkdir $DIR/d10/d2
 touch $DIR/d10/d2/f
@@ -174,7 +179,7 @@ pass
 $CLEAN
 $START
 
-echo '== mkdir .../d11 d11/d2; chmod .../d11/d2 ======= test 11'
+log '== mkdir .../d11 d11/d2; chmod .../d11/d2 ======= test 11'
 mkdir $DIR/d11
 mkdir $DIR/d11/d2
 chmod 0666 $DIR/d11/d2
@@ -184,7 +189,7 @@ pass
 $CLEAN
 $START
 
-echo '== mkdir .../d12; touch .../d12/f; chmod .../d12/f == test 12'
+log '== mkdir .../d12; touch .../d12/f; chmod .../d12/f == test 12'
 mkdir $DIR/d12
 touch $DIR/d12/f
 chmod 0666 $DIR/d12/f
@@ -194,7 +199,7 @@ pass
 $CLEAN
 $START
 
-echo '== mkdir .../d13; creat .../d13/f;  .../d13/f; > .../d13/f == test 13'
+log '== mkdir .../d13; creat .../d13/f;  .../d13/f; > .../d13/f == test 13'
 mkdir $DIR/d13
 dd if=/dev/zero of=$DIR/d13/f count=10
 >  $DIR/d13/f
@@ -203,7 +208,7 @@ pass
 $CLEAN
 $START
 
-echo '================================================== test 14'
+log '================================================== test 14'
 mkdir $DIR/d14
 touch $DIR/d14/f
 rm $DIR/d14/f
@@ -212,7 +217,7 @@ pass
 $CLEAN
 $START
 
-echo '================================================== test 15'
+log '================================================== test 15'
 mkdir $DIR/d15
 touch $DIR/d15/f
 mv $DIR/d15/f $DIR/d15/f2
@@ -221,7 +226,7 @@ pass
 $CLEAN
 $START
 
-echo '================================================== test 16'
+log '================================================== test 16'
 mkdir $DIR/d16
 touch $DIR/d16/f
 rm -rf $DIR/d16/f
@@ -230,7 +235,7 @@ pass
 $CLEAN
 $START
 
-echo '== symlinks: create, remove (dangling and real) == test 17'
+log '== symlinks: create, remove (dangling and real) == test 17'
 mkdir $DIR/d17
 touch $DIR/d17/f
 ln -s $DIR/d17/f $DIR/d17/l-exist
@@ -248,14 +253,14 @@ pass
 $CLEAN
 $START
 
-echo "== touch .../f ; ls ... ========================= test 18"
+log "== touch .../f ; ls ... ========================= test 18"
 touch $DIR/f
 ls $DIR || error
 pass
 $CLEAN
 $START
 
-echo "== touch .../f ; ls -l ... ====================== test 19"
+log "== touch .../f ; ls -l ... ====================== test 19"
 touch $DIR/f
 ls -l $DIR
 rm $DIR/f
@@ -264,22 +269,22 @@ pass
 $CLEAN
 $START
 
-echo "== touch .../f ; ls -l ... ====================== test 20"
+log "== touch .../f ; ls -l ... ====================== test 20"
 touch $DIR/f
 rm $DIR/f
-echo "1 done"
+log "1 done"
 touch $DIR/f
 rm $DIR/f
-echo "2 done"
+log "2 done"
 touch $DIR/f
 rm $DIR/f
-echo "3 done"
+log "3 done"
 $CHECKSTAT -a $DIR/f || error
 pass
 $CLEAN
 $START
 
-echo '== write to dangling link ======================== test 21'
+log '== write to dangling link ======================== test 21'
 mkdir $DIR/d21
 [ -f $DIR/d21/dangle ] && rm -f $DIR/d21/dangle
 ln -s dangle $DIR/d21/link
@@ -291,7 +296,7 @@ pass
 $CLEAN
 $START
 
-echo '== unpack tar archive as non-root user =========== test 22'
+log '== unpack tar archive as non-root user =========== test 22'
 mkdir $DIR/d22
 which sudo && chown 4711 $DIR/d22
 SUDO=`which sudo 2> /dev/null` && SUDO="$SUDO -u #4711" || SUDO=""
@@ -305,7 +310,7 @@ pass
 $CLEAN
 $START
 
-echo '== O_CREAT|O_EXCL in subdir ====================== test 23'
+log '== O_CREAT|O_EXCL in subdir ====================== test 23'
 mkdir $DIR/d23
 $TOEXCL $DIR/d23/f23
 $TOEXCL -e $DIR/d23/f23 || error
@@ -315,7 +320,7 @@ $START
 
 echo '== rename sanity ================================= test24'
 echo '-- same directory rename'
-echo '-- test 24-R1: touch a ; rename a b'
+log '-- test 24-R1: touch a ; rename a b'
 mkdir $DIR/R1
 touch $DIR/R1/f
 mv $DIR/R1/f $DIR/R1/g
@@ -324,7 +329,7 @@ pass
 $CLEAN
 $START
 
-echo '-- test 24-R2: touch a b ; rename a b;'
+log '-- test 24-R2: touch a b ; rename a b;'
 mkdir $DIR/R2
 touch $DIR/R2/{f,g}
 mv $DIR/R2/f $DIR/R2/g
@@ -334,7 +339,7 @@ pass
 $CLEAN
 $START
 
-echo '-- test 24-R3: mkdir a  ; rename a b;'
+log '-- test 24-R3: mkdir a  ; rename a b;'
 mkdir $DIR/R3
 mkdir $DIR/R3/f
 mv $DIR/R3/f $DIR/R3/g
@@ -344,7 +349,7 @@ pass
 $CLEAN
 $START
 
-echo '-- test 24-R4: mkdir a b ; rename a b;'
+log '-- test 24-R4: mkdir a b ; rename a b;'
 mkdir $DIR/R4
 mkdir $DIR/R4/{f,g}
 perl -e "rename \"$DIR/R4/f\", \"$DIR/R4/g\";"
@@ -355,7 +360,7 @@ $CLEAN
 $START
 
 echo '-- cross directory renames --' 
-echo '-- test 24-R5: touch a ; rename a b'
+log '-- test 24-R5: touch a ; rename a b'
 mkdir $DIR/R5{a,b}
 touch $DIR/R5a/f
 mv $DIR/R5a/f $DIR/R5b/g
@@ -365,7 +370,7 @@ pass
 $CLEAN
 $START
 
-echo '-- test 24-R6: touch a ; rename a b'
+log '-- test 24-R6: touch a ; rename a b'
 mkdir $DIR/R6{a,b}
 touch $DIR/R6a/f $DIR/R6b/g
 mv $DIR/R6a/f $DIR/R6b/g
@@ -375,7 +380,7 @@ pass
 $CLEAN
 $START
 
-echo '-- test 24-R7: touch a ; rename a b'
+log '-- test 24-R7: touch a ; rename a b'
 mkdir $DIR/R7{a,b}
 mkdir $DIR/R7a/f
 mv $DIR/R7a/f $DIR/R7b/g
@@ -385,7 +390,7 @@ pass
 $CLEAN
 $START
 
-echo '-- test 24-R8: touch a ; rename a b'
+log '-- test 24-R8: touch a ; rename a b'
 mkdir $DIR/R8{a,b}
 mkdir $DIR/R8a/f $DIR/R8b/g
 perl -e "rename \"$DIR/R8a/f\", \"$DIR/R8b/g\";"
@@ -396,7 +401,7 @@ $CLEAN
 $START
 
 echo "-- rename error cases"
-echo "-- test 24-R9 target error: touch f ; mkdir a ; rename f a"
+log "-- test 24-R9 target error: touch f ; mkdir a ; rename f a"
 mkdir $DIR/R9
 mkdir $DIR/R9/a
 touch $DIR/R9/f
@@ -408,7 +413,7 @@ pass
 $CLEAN
 $START
 
-echo "--test 24-R10 source does not exist" 
+log "--test 24-R10 source does not exist" 
 mkdir $DIR/R10
 perl -e "rename \"$DIR/R10/f\", \"$DIR/R10/g\"" 
 $CHECKSTAT -t dir $DIR/R10 || error
@@ -419,7 +424,7 @@ $CLEAN
 $START
 
 echo '== symlink sanity ================================ test25'
-echo "--test 25.1 create file in symlinked directory"
+log "--test 25.1 create file in symlinked directory"
 mkdir $DIR/d25
 ln -s d25 $DIR/s25
 touch $DIR/s25/foo
@@ -427,13 +432,13 @@ pass
 $CLEAN
 $START
 
-echo "--test 25.2 lookup file in symlinked directory"
+log "--test 25.2 lookup file in symlinked directory"
 $CHECKSTAT -t file $DIR/s25/foo
 pass
 $CLEAN
 $START
 
-echo "--test 26 multiple component symlink"
+log "--test 26 multiple component symlink"
 mkdir $DIR/d26
 mkdir $DIR/d26/d26-2
 ln -s d26/d26-2 $DIR/s26
@@ -442,14 +447,14 @@ pass
 $CLEAN
 $START
 
-echo "--test 26.1 multiple component symlink at the end of a lookup"
+log "--test 26.1 multiple component symlink at the end of a lookup"
 ln -s d26/d26-2/foo $DIR/s26-2
 touch $DIR/s26-2
 pass
 $CLEAN
 $START
 
-echo "--test 26.2 a chain of symlinks"
+log "--test 26.2 a chain of symlinks"
 mkdir $DIR/d26.2
 touch $DIR/d26.2/foo
 ln -s d26.2 $DIR/s26.2-1
@@ -461,39 +466,39 @@ $CLEAN
 $START
 
 # recursive symlinks (bug 439)
-echo "--test 26.3 create multiple component recursive symlink"
+log "--test 26.3 create multiple component recursive symlink"
 ln -s d26-3/foo $DIR/d26-3
 pass
 $CLEAN
 $START
 
-echo "--test 26.3 unlink multiple component recursive symlink"
+log "--test 26.3 unlink multiple component recursive symlink"
 rm $DIR/d26-3
 pass
 $CLEAN
 $START
 
 echo '== stripe sanity ================================= test27'
-echo "--test 27.1 create one stripe"
+log "--test 27.1 create one stripe"
 mkdir $DIR/d27
 $LSTRIPE $DIR/d27/f0 8192 0 1
 $CHECKSTAT -t file $DIR/d27/f0
-echo "--test 27.2 write to one stripe file"
+log "--test 27.2 write to one stripe file"
 cp /etc/hosts $DIR/d27/f0
 pass
 
-echo "--test 27.3 create two stripe file f01"
+log "--test 27.3 create two stripe file f01"
 $LSTRIPE $DIR/d27/f01 8192 0 2
-echo "--test 27.4 write to two stripe file file f01"
+log "--test 27.4 write to two stripe file file f01"
 dd if=/dev/zero of=$DIR/d27/f01 bs=4k count=4
 pass
 
-echo "--test 27.5 create file with default settings"
+log "--test 27.5 create file with default settings"
 $LSTRIPE $DIR/d27/fdef 0 -1 0
 $CHECKSTAT -t file $DIR/d27/fdef
 #dd if=/dev/zero of=$DIR/d27/fdef bs=4k count=4
 
-echo "--test 27.6 lstripe existing file (should return error)"
+log "--test 27.6 lstripe existing file (should return error)"
 $LSTRIPE $DIR/d27/f12 8192 1 2
 ! $LSTRIPE $DIR/d27/f12 8192 1 2
 $CHECKSTAT -t file $DIR/d27/f12
@@ -501,32 +506,32 @@ $CHECKSTAT -t file $DIR/d27/f12
 pass
 
 
-echo "--test 27.7 lstripe with bad stripe size (should return error on LOV)"
+log "--test 27.7 lstripe with bad stripe size (should return error on LOV)"
 $LSTRIPE $DIR/d27/fbad 100 1 2 || /bin/true
 dd if=/dev/zero of=$DIR/d27/f12 bs=4k count=4
 pass
 $CLEAN
 $START
 
-echo "--test 27.8 lfind "
+log "--test 27.8 lfind "
 $LFIND $DIR/d27
 pass
 $CLEAN
 $START
 
-echo '== create/mknod/mkdir with bad file types ======== test28'
+log '== create/mknod/mkdir with bad file types ======== test28'
 mkdir $DIR/d28
 $CREATETEST $DIR/d28/ct || error
 pass
 
-echo '== IT_GETATTR regression  ======================== test29'
-mkdir $MOUNT/d29
-touch $MOUNT/d29/foo
-ls -l $MOUNT/d29
-MDCDIR=${MDCDIR:-/proc/fs/lustre/ldlm/ldlm/MDC_MNT_localhost_mds1}
+log '== IT_GETATTR regression  ======================== test29'
+mkdir $DIR/d29
+touch $DIR/d29/foo
+ls -l $DIR/d29
+MDCDIR=${MDCDIR:-/proc/fs/lustre/ldlm/ldlm/MDC_*}
 LOCKCOUNTORIG=`cat $MDCDIR/lock_count`
 LOCKUNUSEDCOUNTORIG=`cat $MDCDIR/lock_unused_count`
-ls -l $MOUNT/d29
+ls -l $DIR/d29
 LOCKCOUNTCURRENT=`cat $MDCDIR/lock_count`
 LOCKUNUSEDCOUNTCURRENT=`cat $MDCDIR/lock_unused_count`
 if [ $LOCKCOUNTCURRENT -gt $LOCKCOUNTORIG ] || [ $LOCKUNUSEDCOUNTCURRENT -gt $LOCKUNUSEDCOUNTORIG ]; then
@@ -536,8 +541,18 @@ pass
 $CLEAN
 $START
 
-echo '== cleanup ============================================='
-rm -r $DIR/[Rdfs][1-9]*
+log '== run binary from Lustre (execve) =============== test30'
+cp `which ls` $DIR
+$DIR/ls /
+$CLEAN
+$START
+
+log '== open-unlink file ============================== test31'
+./openunlink $DIR/f31 $DIR/f31 || error
+pass
+
+log '== cleanup ============================================='
+rm -r $DIR/[Rdfs][1-9]* $DIR/ls
 
 echo '======================= finished ======================='
 exit
index f6370e3..edfa47b 100644 (file)
@@ -14,6 +14,7 @@
 #if 0
 #include <linux/extN_fs.h>
 #endif
+#include <liblustre.h>
 #include <linux/lustre_lib.h>
 #include <linux/obd.h>
 
diff --git a/lustre/tests/statone.c b/lustre/tests/statone.c
new file mode 100644 (file)
index 0000000..5250984
--- /dev/null
@@ -0,0 +1,81 @@
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+
+#include <liblustre.h>
+#include <linux/lustre_lib.h>
+#include <linux/obd.h>
+
+/*
+ * Look up a single name through the MDC lookup ioctl.
+ *
+ * Opens the parent directory of argv[1] and issues IOC_MDC_LOOKUP on it,
+ * passing the last path component (with its NUL) as ioc_inlbuf1.
+ *
+ * Exit status: 0 on success, 1 on bad usage, an over-long parent path or
+ * a failed obd_ioctl_pack(), otherwise the errno left by open()/ioctl().
+ */
+int main(int argc, char **argv)
+{
+    struct obd_ioctl_data data;
+    char rawbuf[8192], parent[4096], *buf = rawbuf, *base, *name, *t;
+    int max = sizeof(rawbuf), fd, rc, err;
+    size_t dirlen;
+
+    if (argc != 2) {
+        printf("usage: %s filename\n", argv[0]);
+        return 1;
+    }
+
+    base = argv[1];
+    t = strrchr(base, '/');
+    if (!t) {
+        /* no directory component: the parent is the current directory */
+        strcpy(parent, ".");
+        name = base;
+    } else {
+        /* keep the '/' itself when the name sits directly under the root */
+        dirlen = (t == base) ? 1 : (size_t)(t - base);
+        if (dirlen >= sizeof(parent)) {
+            printf("%s: parent path too long\n", base);
+            return 1;
+        }
+        /* parent is uninitialised stack memory, so terminate it explicitly */
+        memcpy(parent, base, dirlen);
+        parent[dirlen] = '\0';
+        name = t + 1;
+    }
+
+    fd = open(parent, O_RDONLY);
+    if (fd < 0) {
+        err = errno;    /* save it: printf() may overwrite errno */
+        printf("open(%s) error: %s\n", parent, strerror(err));
+        exit(err);
+    }
+
+    memset(&data, 0, sizeof(data));
+    data.ioc_version = OBD_IOCTL_VERSION;
+    data.ioc_len = sizeof(data);
+    data.ioc_inlbuf1 = name;
+    data.ioc_inllen1 = strlen(data.ioc_inlbuf1) + 1;
+
+    if (obd_ioctl_pack(&data, &buf, max)) {
+        printf("ioctl_pack failed.\n");
+        exit(1);
+    }
+
+    rc = ioctl(fd, IOC_MDC_LOOKUP, buf);
+    if (rc < 0) {
+        err = errno;
+        printf("ioctl(%s/%s) error: %s\n", parent,
+               data.ioc_inlbuf1, strerror(err));
+        exit(err);
+    }
+
+    return 0;
+}
index 9fcb1ac..08732ff 100644 (file)
@@ -1,5 +1,6 @@
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
 
@@ -13,5 +14,5 @@ int main(int argc, char **argv)
         }
 
         mode = strtoul(argv[1], NULL, 8); 
-        return chmod(argv[2], mode);
+        return chmod(argv[2], mode) ? errno : 0;
 }
index a0ce1de..774398d 100644 (file)
@@ -33,6 +33,7 @@
 #include <getopt.h>
 #undef _GNU_SOURCE
 
+#include <liblustre.h>
 #include <linux/lustre_mds.h>
 
 static void usage(char *argv0, int status)
index 7a4b95e..599bd21 100644 (file)
@@ -59,9 +59,9 @@ h2elan () {
 
 # create nodes
 echo -n "adding NET for:"
-for NODE in `echo $MDSNODE $OSTNODES $CLIENTS | sort -u`; do
+for NODE in `echo $MDSNODE $OSTNODES $CLIENTS | tr -s " " "\n" | sort -u`; do
        echo -n " $NODE"
-       ${LMC} -m $config --add net --node $NODE --nid `h2$NETTYPE $NODE` --nettype elan || exit 1
+       ${LMC} -m $config --add net --node $NODE --nid `h2$NETTYPE $NODE` --nettype $NETTYPE || exit 1
 done
 
 # configure mds server
index 426602f..94ed749 100644 (file)
@@ -10,8 +10,9 @@
 #include <time.h>
 #include <limits.h>
 #include <sys/ioctl.h>
-#include <linux/lustre_lib.h>
+#include <liblustre.h>
 #include <linux/obd.h>
+#include <linux/lustre_lib.h>
 
 static int usage(char *prog, FILE *out)
 {
index fc0f010..4775289 100644 (file)
@@ -15,3 +15,4 @@ lconf
 obdstat
 obdio
 obdbarrier
+lload
index bfeebd7..d345b64 100644 (file)
@@ -4,12 +4,12 @@ DEFS=
 CFLAGS:=-g -O2 -I$(top_srcdir)/utils -I$(PORTALS)/include  -I$(srcdir)/../include -Wall -L$(PORTALSLIB)
 KFLAGS:=
 CPPFLAGS = $(HAVE_LIBREADLINE)
-obdctl_LDADD := $(LIBREADLINE)
 lctl_LDADD := $(LIBREADLINE) -lptlctl
-sbin_PROGRAMS = lctl lfind lstripe obdctl obdio obdbarrier obdstat
+lload_LDADD := -lptlctl
+sbin_PROGRAMS = lctl lfind lstripe obdio obdbarrier obdstat lload
 sbin_SCRIPTS = lconf lmc llanalyze
-obdctl_SOURCES = parser.c obdctl.c obd.c parser.h obdctl.h
-lctl_SOURCES = parser.c obd.c lctl.c parser.h
+lctl_SOURCES = parser.c obd.c lctl.c parser.h obdctl.h
+lload_SOURCES = lload.c 
 obdio_SOURCES = obdio.c obdiolib.c obdiolib.h
 obdbarrier_SOURCES = obdbarrier.c obdiolib.c obdiolib.h
 lfind_SOURCES = lfind.c
index 796871d..cbe05dd 100755 (executable)
@@ -35,7 +35,6 @@ else:
     from fcntl import F_GETFL, F_SETFL
 
 # Global parameters
-TCP_ACCEPTOR = ''
 MAXTCPBUF = 1048576
 DEFAULT_TCPBUF = 1048576
 #
@@ -124,6 +123,7 @@ class Config:
         self._ldapurl = ''
         self._config_name = ''
         self._select = {}
+        self._lctl_dump = ''
 
     def verbose(self, flag = None):
         if flag: self._verbose = flag
@@ -220,6 +220,10 @@ class Config:
             return self._select[srv]
         return None
 
+    def lctl_dump(self, val = None):
+        if val: self._lctl_dump = val
+        return self._lctl_dump
+
 
 config = Config()
 
@@ -280,6 +284,104 @@ class LconfError (exceptions.Exception):
 
 
 # ============================================================
+# handle daemons, like the acceptor
+class DaemonHandler:
+    """ Manage starting and stopping a daemon. Assumes the daemon manages its
+    own pid file; subclasses supply pidfile() and command_line(). """
+
+    def __init__(self, cmd):
+        self.command = cmd
+        self.path =""
+
+    def start(self):
+        if self.running():
+            log(self.command, "already running.")
+        if not self.path:
+            self.path = find_prog(self.command)
+            if not self.path:
+                panic(self.command, "not found.")
+        ret, out = runcmd(self.path +' '+ self.command_line())
+        if ret:
+            raise CommandError(self.path, out, ret)
+
+    def stop(self):
+        if self.running():
+            pid = self.read_pidfile()
+            try:
+                log ("killing process", pid)
+                os.kill(pid, 15)
+                #time.sleep(1) # let daemon die
+            except OSError, e:
+                log("unable to kill", self.command, e)
+            if self.running():
+                log("unable to kill", self.command)
+
+    def running(self):
+        pid = self.read_pidfile()
+        if pid:
+            try:
+                os.kill(pid, 0)  # signal 0: probe the pid without signalling it
+            except OSError:
+                self.clean_pidfile()
+            else:
+                return 1
+        return 0
+
+    def read_pidfile(self):
+        try:
+            fp = open(self.pidfile(), 'r')
+            pid = int(fp.read())
+            fp.close()
+            return pid
+        except (IOError, ValueError):  # no pidfile, or contents are not a pid
+            return 0
+        
+    def clean_pidfile(self):
+        """ Remove a stale pidfile """
+        log("removing stale pidfile:", self.pidfile())
+        try:
+            os.unlink(self.pidfile())
+        except OSError, e:
+            log(self.pidfile(), e)
+            
+class AcceptorHandler(DaemonHandler):
+    def __init__(self, port, net_type, send_mem, recv_mem, irq_aff, nid_xchg):
+        DaemonHandler.__init__(self, "acceptor")
+        self.port = port
+        self.flags = ''
+        self.send_mem = send_mem
+        self.recv_mem = recv_mem
+
+        if net_type == 'toe':
+            self.flags = self.flags + ' -N 4'
+        if irq_aff:
+            self.flags = self.flags + ' -i'
+        if nid_xchg:
+            self.flags = self.flags + ' -x'
+
+    def pidfile(self):
+        return "/var/run/%s-%d.pid" % (self.command, self.port)
+
+    def command_line(self):
+        return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)))
+    
+acceptors = {}
+
+# start the acceptors
+def run_acceptors():
+    for port in acceptors.keys():
+        daemon = acceptors[port]
+        if not daemon.running():
+            daemon.start()
+
+def stop_acceptor(port):
+    if acceptors.has_key(port):
+        daemon = acceptors[port]
+        if daemon.running():
+            daemon.stop()
+        
+
+# ============================================================
 # handle lctl interface
 class LCTLInterface:
     """
@@ -291,6 +393,7 @@ class LCTLInterface:
         Initialize close by finding the lctl binary.
         """
         self.lctl = find_prog(cmd)
+        self.save_file = ''
         if not self.lctl:
             if config.noexec():
                 debug('! lctl not found')
@@ -298,6 +401,9 @@ class LCTLInterface:
             else:
                 raise CommandError('lctl', "unable to find lctl binary.")
 
+    def use_save_file(self, file):
+        self.save_file = file
+        
     def set_nonblock(self, fd):
         fl = fcntl.fcntl(fd, F_GETFL)
         fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)
@@ -311,10 +417,14 @@ class LCTLInterface:
         should modify command line to accept multiple commands, or
         create complex command line options
         """
-        debug("+", self.lctl, cmds)
+        cmd_line = self.lctl
+        if self.save_file:
+            cmds = '\n  dump ' + self.save_file + cmds
+
+        debug("+", cmd_line, cmds)
         if config.noexec(): return (0, [])
 
-        child = popen2.Popen3(self.lctl, 1) # Capture stdout and stderr from command
+        child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
         child.tochild.write(cmds + "\n")
         child.tochild.close()
 
@@ -370,33 +480,28 @@ class LCTLInterface:
             cmds =  """
   network %s
   mynid %s
-  add_uuid self %s
-  quit""" % (net, nid, nid)
-        else:
-            cmds =  """
-  network %s
-  add_uuid self %s
-  quit""" % (net, nid)
-            
-        self.run(cmds)
+  quit """ % (net, nid)
+            self.run(cmds)
 
     # create a new connection
-    def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
-        if net  in ('tcp', 'toe'):
-            cmds =  """
+    def connect(self, srv):
+        cmds =  "\n  add_uuid %s %s %s" % (srv.uuid, srv.nid, srv.net_type)
+        if srv.net_type  in ('tcp', 'toe') and not config.lctl_dump():
+            flags = ''
+            if srv.irq_affinity:
+                flags = flags + 'i'
+            if srv.nid_exchange:
+                flags = flags + 'x'
+            cmds =  """%s          
   network %s
-  add_uuid %s %s
   send_mem %d
   recv_mem %d
-  connect %s %d
-  quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port,  )
-        else:
-            cmds =  """
-  network %s
-  add_uuid %s %s
-  connect %s %d
-  quit""" % (net, servuuid, nid, nid, port,  )
-            
+  connect %s %d %s""" % (cmds, srv.net_type,
+             srv.send_mem,
+             srv.recv_mem,
+             srv.hostaddr, srv.port, flags )
+
+        cmds = cmds + "\n  quit"
         self.run(cmds)
                 
     # add a route to a range
@@ -404,7 +509,8 @@ class LCTLInterface:
         cmds =  """
   network %s
   add_route %s %s %s
-  quit  """ % (net, gw, lo, hi)
+  quit  """ % (net,
+               gw, lo, hi)
         self.run(cmds)
 
                 
@@ -420,9 +526,11 @@ class LCTLInterface:
     def add_route_host(self, net, uuid, gw, tgt):
         cmds =  """
   network %s
-  add_uuid %s %s
+  add_uuid %s %s %s
   add_route %s %s
-  quit """ % (net, uuid, tgt, gw, tgt)
+  quit """ % (net,
+              uuid, tgt, net,
+              gw, tgt)
         self.run(cmds)
 
     # add a route to a range
@@ -450,7 +558,6 @@ class LCTLInterface:
         cmds =  """
   ignore_errors
   network %s
-  del_uuid self
   disconnect
   quit""" % (net)
         self.run(cmds)
@@ -507,8 +614,7 @@ class LCTLInterface:
 # Run a command and return the output and status.
 # stderr is sent to /dev/null, could use popen3 to
 # save it if necessary
-def run(*args):
-    cmd = string.join(map(str,args))
+def runcmd(cmd):
     debug ("+", cmd)
     if config.noexec(): return (0, [])
     f = os.popen(cmd + ' 2>&1')
@@ -520,6 +626,10 @@ def run(*args):
         ret = 0
     return (ret, out)
 
+def run(*args):
+    cmd = string.join(map(str,args))
+    return runcmd(cmd)
+
 # Run a command in the background.
 def run_daemon(*args):
     cmd = string.join(map(str,args))
@@ -540,7 +650,7 @@ def find_prog(cmd):
     cmdpath = os.path.dirname(sys.argv[0])
     syspath.insert(0, cmdpath);
     if config.portals_dir():
-        syspath.insert(0, os.path.join(cmdpath, config.portals_dir()+'/linux/utils/'))
+        syspath.insert(0, os.path.join(config.portals_dir()+'/linux/utils/'))
     for d in syspath:
         prog = os.path.join(d,cmd)
         if os.access(prog, os.X_OK):
@@ -696,6 +806,16 @@ def if2addr(iface):
     ip = string.split(addr, ':')[1]
     return ip
 
+def get_local_nid(net_type, wildcard):
+    """Return the local nid. First look for an elan interface,
+      then use the local address. """
+    local = ""
+    if os.access('/proc/elan/device0/position', os.R_OK):
+        local = get_local_address('elan', '*')
+    else:
+        local = get_local_address(net_type, wildcard)
+    return local
+        
 def get_local_address(net_type, wildcard):
     """Return the local address for the network type."""
     local = ""
@@ -730,6 +850,8 @@ def is_prepared(uuid):
     """Return true if a device exists for the uuid"""
     # expect this format:
     # 1 UP ldlm ldlm ldlm_UUID 2
+    if config.lctl_dump():
+        return 0
     try:
         out = lctl.device_list()
         for s in out:
@@ -738,6 +860,21 @@ def is_prepared(uuid):
     except CommandError, e:
         e.dump()
     return 0
+
+def is_network_prepared():
+    """Return 1 if lctl lists a device whose uuid is RPCDEV_UUID, which is
+       taken to mean networking is already configured; 0 otherwise."""
+    if config.lctl_dump():
+        return 0
+    try:
+        out = lctl.device_list()
+        for s in out:
+            if 'RPCDEV_UUID' == string.split(s)[4]:
+                return 1
+    except CommandError, e:
+        e.dump()
+    return 0
+    
     
 def fs_is_mounted(path):
     """Return true if path is a mounted lustre filesystem"""
@@ -774,34 +911,16 @@ class Module:
         msg = string.join(map(str,args))
         print self.module_name + ":", self.name, self.uuid, msg
 
-    def lookup_server(self, srv_uuid):
-        """ Lookup a server's network information """
-        net = self.db.get_ost_net(srv_uuid)
-        if not net:
-            panic ("Unable to find a server for:", srv_uuid)
-        self._server = Network(net)
-
-    def get_server(self):
-        return self._server
-
     def cleanup(self):
         """ default cleanup, used for most modules """
         self.info()
-        srv = self.get_server()
-        if srv and local_net(srv):
-            try:
-                lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
-            except CommandError, e:
-                log(self.module_name, "disconnect failed: ", self.name)
-                e.dump()
-                cleanup_error(e.rc)
         try:
             lctl.cleanup(self.name, self.uuid)
         except CommandError, e:
             log(self.module_name, "cleanup failed: ", self.name)
             e.dump()
             cleanup_error(e.rc)
-
+            
     def add_portals_module(self, dev_dir, modname):
         """Append a module to list of modules to load."""
         self.kmodule_list.append((config.portals_dir(), dev_dir, modname))
@@ -857,20 +976,31 @@ class Module:
                 log('! unable to unload module:', mod)
                 logall(out)
         
-
 class Network(Module):
     def __init__(self,db):
         Module.__init__(self, 'NETWORK', db)
         self.net_type = self.db.get_val('nettype')
         self.nid = self.db.get_val('nid', '*')
         self.port = self.db.get_val_int('port', 0)
-        self.send_mem = self.db.get_val_int('send_mem', DEFAULT_TCPBUF)
-        self.recv_mem = self.db.get_val_int('recv_mem', DEFAULT_TCPBUF)
+        self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
+        self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
+        self.irq_affinity = self.db.get_val_int('irqaffinity', 0)
+        self.nid_exchange = self.db.get_val_int('nidexchange', 0)
+
         if '*' in self.nid:
-            self.nid = get_local_address(self.net_type, self.nid)
+            self.nid = get_local_nid(self.net_type, self.nid)
             if not self.nid:
                 panic("unable to set nid for", self.net_type, self.nid)
             debug("nid:", self.nid)
+
+        self.hostaddr = self.db.get_val('hostaddr', self.nid)
+        if '*' in self.hostaddr:
+            self.hostaddr = get_local_address(self.net_type, self.hostaddr)
+            if not self.hostaddr:  # check the value just resolved, not the nid
+                panic("unable to set hostaddr for", self.net_type, self.hostaddr)
+            debug("hostaddr:", self.hostaddr)
+        # debug ( "hostaddr ", self.hostaddr, "net_type", self.net_type)
+
         self.add_portals_module("linux/oslib", 'portals')
         if node_needs_router():
             self.add_portals_module("linux/router", 'kptlrouter')
@@ -883,37 +1013,45 @@ class Network(Module):
         if self.net_type == 'gm':
             self.add_portals_module("/linux/gmnal", 'kgmnal')
         self.add_lustre_module('obdclass', 'obdclass')
-        self.add_lustre_module('ptlrpc', 'ptlrpc')
 
     def prepare(self):
+        if is_network_prepared():
+            return
+        self.info(self.net_type, self.nid, self.port)
+        lctl.network(self.net_type, self.nid)
+
+    def cleanup(self):
         self.info(self.net_type, self.nid, self.port)
         if self.net_type in ('tcp', 'toe'):
-            nal_id = '' # default is socknal
-            if self.net_type == 'toe':
-                nal_id = '-N 4'
-            ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, nal_id, self.port)
-            if ret:
-                raise CommandError(TCP_ACCEPTOR, out, ret)
+            stop_acceptor(self.port)
+        try:
+            lctl.disconnectAll(self.net_type)
+        except CommandError, e:
+            print "disconnectAll failed: ", self.name
+            e.dump()
+            cleanup_error(e.rc)
+
+class Router(Module):
+    def __init__(self,db):
+        Module.__init__(self, 'ROUTER', db)
+    def prepare(self):
+        if is_network_prepared():
+            return
+        self.info()
         for net_type, gw, lo, hi in self.db.get_route_tbl():
             lctl.add_route(net_type, gw, lo, hi)
-            if net_type in ('tcp', 'toe') and net_type == self.net_type and hi == '':
-                srvdb = self.db.nid2server(lo)
+            if net_type in ('tcp', 'toe') and local_net_type(net_type) and hi == '':
+                srvdb = self.db.nid2server(lo, net_type)
+
                 if not srvdb:
                     panic("no server for nid", lo)
                 else:
                     srv = Network(srvdb)
-                    lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
-
-            
-        lctl.network(self.net_type, self.nid)
-        if not is_prepared("RPCDEV_UUID"):
-            lctl.newdev(attach = "ptlrpc RPCDEV RPCDEV_UUID")
-
+                    lctl.connect(srv)
     def cleanup(self):
-        self.info(self.net_type, self.nid, self.port)
         for net_type, gw, lo, hi in self.db.get_route_tbl():
-            if self.net_type in ('tcp', 'toe') and hi == '':
-                srvdb = self.db.nid2server(lo)
+            if net_type in ('tcp', 'toe') and local_net_type(net_type) and hi == '':
+                srvdb = self.db.nid2server(lo, net_type)
                 if not srvdb:
                     panic("no server for nid", lo)
                 else:
@@ -925,28 +1063,11 @@ class Network(Module):
                         e.dump()
                         cleanup_error(e.rc)
             try:
-                lctl.del_route(self.net_type, self.nid, lo, hi)
+                lctl.del_route(net_type, gw, lo, hi)
             except CommandError, e:
                 print "del_route failed: ", self.name
                 e.dump()
                 cleanup_error(e.rc)
-              
-        try:
-            if is_prepared("RPCDEV_UUID"):
-                lctl.cleanup("RPCDEV", "RPCDEV_UUID")
-        except CommandError, e:
-            print "cleanup failed: RPCDEV"
-            e.dump()
-            cleanup_error(e.rc)
-        try:
-            lctl.disconnectAll(self.net_type)
-        except CommandError, e:
-            print "disconnectAll failed: ", self.name
-            e.dump()
-            cleanup_error(e.rc)
-        if self.net_type in ('tcp', 'toe'):
-            # yikes, this ugly! need to save pid in /var/something
-            run("killall acceptor")
 
 class LDLM(Module):
     def __init__(self,db):
@@ -956,8 +1077,23 @@ class LDLM(Module):
         if is_prepared(self.uuid):
             return
         self.info()
-        lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid),
-                    setup ="")
+        lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid))
+    def cleanup(self):
+        if is_prepared(self.uuid):
+            Module.cleanup(self)
+
+class PTLRPC(Module):
+    def __init__(self,db):
+        Module.__init__(self, 'PTLRPC', db)
+        self.add_lustre_module('ptlrpc', 'ptlrpc') 
+    def prepare(self):
+        if is_prepared(self.uuid):
+            return
+        self.info()
+        lctl.newdev(attach="ptlrpc %s %s" % (self.name, self.uuid))
+    def cleanup(self):
+        if is_prepared(self.uuid):
+            Module.cleanup(self)
 
 class LOV(Module):
     def __init__(self,db):
@@ -973,6 +1109,7 @@ class LOV(Module):
         self.devlist = self.db.get_refs('obd')
         self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
         self.osclist = []
+        self.mdc_uudi = ''
         for obd_uuid in self.devlist:
             obd = self.db.lookup(obd_uuid)
             osc = get_osc(obd, self.name)
@@ -1039,27 +1176,46 @@ class LOVConfig(Module):
 class MDSDEV(Module):
     def __init__(self,db):
         Module.__init__(self, 'MDSDEV', db)
-        self.devname = self.db.get_val('devpath','')
+        self.devpath = self.db.get_val('devpath','')
         self.size = self.db.get_val_int('devsize', 0)
         self.fstype = self.db.get_val('fstype', '')
         # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
-        self.uuid = self.db.get_first_ref('target')
-        mds = self.db.lookup(self.uuid)
+        target_uuid = self.db.get_first_ref('target')
+        mds = self.db.lookup(target_uuid)
         self.name = mds.getName()
         self.lovconfig_uuids = mds.get_refs('lovconfig')
         # FIXME: if fstype not set, then determine based on kernel version
         self.format = self.db.get_val('autoformat', "no")
+
+        active_uuid = mds.get_active_target()
+        if not active_uuid:
+            panic("No target device found:", target_uuid)
+        if active_uuid == self.uuid:
+            self.active = 1
+        else:
+            self.active = 0
+        self.target_dev_uuid = self.uuid
+        self.uuid = target_uuid
+        # modules
         if self.fstype == 'extN':
             self.add_lustre_module('extN', 'extN') 
         self.add_lustre_module('mds', 'mds')
         if self.fstype:
             self.add_lustre_module('obdclass', 'fsfilt_%s' % (self.fstype))
+
+    def load_module(self):
+        if self.active:
+            Module.load_module(self)
             
     def prepare(self):
         if is_prepared(self.uuid):
             return
-        self.info(self.devname, self.fstype, self.format)
-        blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
+        if not self.active:
+            debug(self.uuid, "not active")
+            return
+        self.info(self.devpath, self.fstype, self.format)
+        run_acceptors()
+        blkdev = block_dev(self.devpath, self.size, self.fstype, self.format)
         if not is_prepared('MDT_UUID'):
             lctl.newdev(attach="mdt %s %s" % ('MDT', 'MDT_UUID'),
                         setup ="")
@@ -1080,38 +1236,57 @@ class MDSDEV(Module):
                 cleanup_error(e.rc)
         if is_prepared(self.uuid):
             Module.cleanup(self)
-        clean_loop(self.devname)
+        clean_loop(self.devpath)
 
 class OSD(Module):
     def __init__(self, db):
         Module.__init__(self, 'OSD', db)
         self.osdtype = self.db.get_val('osdtype')
-        self.devname = self.db.get_val('devpath', '')
+        self.devpath = self.db.get_val('devpath', '')
         self.size = self.db.get_val_int('devsize', 0)
         self.fstype = self.db.get_val('fstype', '')
-        self.uuid = self.db.get_first_ref('target')
-        ost = self.db.lookup(self.uuid)
+        target_uuid = self.db.get_first_ref('target')
+        ost = self.db.lookup(target_uuid)
         self.name = ost.getName()
         # FIXME: if fstype not set, then determine based on kernel version
         self.format = self.db.get_val('autoformat', 'yes')
         if self.fstype == 'extN':
             self.add_lustre_module('extN', 'extN') 
+
+        active_uuid = ost.get_active_target()
+        if not active_uuid:
+            panic("No target device found:", target_uuid)
+        if active_uuid == self.uuid:
+            self.active = 1
+        else:
+            self.active = 0
+        self.target_dev_uuid = self.uuid
+        self.uuid = target_uuid
+        # modules
         self.add_lustre_module('ost', 'ost')
         self.add_lustre_module(self.osdtype, self.osdtype)
         if self.fstype:
             self.add_lustre_module('obdclass' , 'fsfilt_%s' % (self.fstype))
 
+    def load_module(self):
+        if self.active:
+            Module.load_module(self)
+
     # need to check /proc/mounts and /etc/mtab before
     # formatting anything.
     # FIXME: check if device is already formatted.
     def prepare(self):
         if is_prepared(self.uuid):
             return
-        self.info(self.osdtype, self.devname, self.size, self.fstype, self.format)
+        if not self.active:
+            debug(self.uuid, "not active")
+            return
+        self.info(self.osdtype, self.devpath, self.size, self.fstype, self.format)
+        run_acceptors()
         if self.osdtype == 'obdecho':
             blkdev = ''
         else:
-            blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
+            blkdev = block_dev(self.devpath, self.size, self.fstype, self.format)
         lctl.newdev(attach="%s %s %s" % (self.osdtype, self.name, self.uuid),
                     setup ="%s %s" %(blkdev, self.fstype))
         if not is_prepared('OSS_UUID'):
@@ -1129,81 +1304,94 @@ class OSD(Module):
         if is_prepared(self.uuid):
             Module.cleanup(self)
         if not self.osdtype == 'obdecho':
-            clean_loop(self.devname)
+            clean_loop(self.devpath)
 
 # Generic client module, used by OSC and MDC
 class Client(Module):
-    def __init__(self, db, module, owner, target_name, target_uuid):
-        self.target_name = target_name
-        self.target_uuid = target_uuid
-        self.db = db
-        node_name =  config.select(target_name)
-        if node_name:
-            self.tgt_dev_uuid = self.db.get_target_device(node_name, target_uuid)
-        else:
-            self.tgt_dev_uuid = db.get_first_ref('active')
+    def __init__(self, tgtdb, module, owner):
+        self.target_name = tgtdb.getName()
+        self.target_uuid = tgtdb.getUUID()
+        self.db = tgtdb
+
+        self.tgt_dev_uuid = tgtdb.get_active_target()
         if not self.tgt_dev_uuid:
-            panic("No target device found for target:", target_name)
+            panic("No target device found for target:", self.target_name)
+            
         self.kmodule_list = []
         self._server = None
         self._connected = 0
 
         self.module = module
         self.module_name = string.upper(module)
-        self.name = '%s_%s_%s' % (self.module_name, owner, target_name)
-        self.uuid = '%05x_%s_%05x' % (int(random.random() * 1048576), self.name,
-                                      int(random.random() * 1048576))
+        self.name = '%s_%s_%s' % (self.module_name, owner, self.target_name)
+        self.uuid = '%05x%05x_%.14s_%05x%05x' % (int(random.random() * 1048576),
+                                              int(random.random() * 1048576),self.name,
+                                              int(random.random() * 1048576),
+                                              int(random.random() * 1048576))
         self.uuid = self.uuid[0:36]
         self.lookup_server(self.tgt_dev_uuid)
         self.add_lustre_module(module, module)
 
+    def lookup_server(self, srv_uuid):
+        """ Lookup a server's network information """
+        self._server_nets = self.db.get_ost_net(srv_uuid)
+        if len(self._server_nets) == 0:
+            panic ("Unable to find a server for:", srv_uuid)
+
+    def get_servers(self):
+        return self._server_nets
+
     def prepare(self, ignore_connect_failure = 0):
         if is_prepared(self.uuid):
             return
         self.info(self.target_uuid)
-        srv = self.get_server()
         try:
-            if local_net(srv):
-                #debug("LOCAL NET")
-                lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
+            srv = local_net(self.get_servers())
+            if srv:
+                lctl.connect(srv)
             else:
-                #debug("NOT LOCAL NET")
-                r =  find_route(srv)
-                if r:
+                srv, r =  find_route(self.get_servers())
+                if srv:
                     lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
                 else:
-                    panic ("no route to",  srv.nid)
+                    panic ("no route to",  self.target_uuid)
         except CommandError:
             if (ignore_connect_failure == 0):
                 pass
-        lctl.newdev(attach="%s %s %s" % (self.module, self.name, self.uuid),
+        if srv:
+            lctl.newdev(attach="%s %s %s" % (self.module, self.name, self.uuid),
                         setup ="%s %s" %(self.target_uuid, srv.uuid))
 
     def cleanup(self):
-        srv = self.get_server()
-        if local_net(srv):
-            Module.cleanup(self)
+        Module.cleanup(self)
+        srv = local_net(self.get_servers())
+        if srv:
+            try:
+                lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
+            except CommandError, e:
+                log(self.module_name, "disconnect failed: ", self.name)
+                e.dump()
+                cleanup_error(e.rc)
         else:
-            self.info(self.targt_uuid)
-            r =  find_route(srv)
-            if r:
+            self.info(self.target_uuid)
+            srv, r =  find_route(self.get_servers())
+            if srv:
                 try:
                     lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
                 except CommandError, e:
                     print "del_route failed: ", self.name
                     e.dump()
                     cleanup_error(e.rc)
-            Module.cleanup(self)
 
 
 
 class MDC(Client):
-    def __init__(self, db, owner, target_name, target_uuid):
-         Client.__init__(self, db, 'mdc', owner, target_name, target_uuid)
+    def __init__(self, db, owner):
+         Client.__init__(self, db, 'mdc', owner)
 
 class OSC(Client):
-    def __init__(self, db, owner, target_name, target_uuid):
-         Client.__init__(self, db, 'osc', owner, target_name, target_uuid)
+    def __init__(self, db, owner):
+         Client.__init__(self, db, 'osc', owner)
 
             
 class COBD(Module):
@@ -1299,6 +1487,8 @@ class Mountpoint(Module):
             mdc_uuid = prepare_mdc(self.db, self.name,  self.mds_uuid)
         else:
             mdc_uuid = self.vosc.get_mdc_uuid()
+        if not mdc_uuid:
+            panic("Unable to determine MDC UUID. Probably need to cleanup before re-mounting.")
         self.info(self.path, self.mds_uuid, self.obd_uuid)
         cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
               (self.vosc.get_uuid(), mdc_uuid, self.path)
@@ -1387,17 +1577,25 @@ class LustreDB:
         uuids = self._get_all_refs()
         return uuids
 
-    def get_ost_net(self, uuid):
-        ost = self.lookup(uuid)
-        uuid = ost.get_first_ref('network')
-        if not uuid:
-            return None
-        return ost.lookup(uuid)
-
-    def nid2server(self, nid):
+    def get_ost_net(self, osd_uuid):
+        srv_list = []
+        if not osd_uuid:
+            return srv_list
+        osd = self.lookup(osd_uuid)
+        node_uuid = osd.get_first_ref('node')
+        node = self.lookup(node_uuid)
+        if not node:
+            panic("unable to find node for osd_uuid:", osd_uuid,
+                  " node_ref:", node_uuid)
+        for net_uuid in node.get_networks():
+            db = node.lookup(net_uuid)
+            srv_list.append(Network(db))
+        return srv_list
+
+    def nid2server(self, nid, net_type):  # first 'network' entry whose nid AND nettype match, else None
         netlist = self.lookup_class('network')
         for net_db in netlist:
-            if net_db.get_val('nid') == nid: 
+            if net_db.get_val('nid') == nid and net_db.get_val('nettype') == net_type:
                 return net_db
         return None
     
@@ -1411,7 +1609,11 @@ class LustreDB:
         type = self.get_class()
         ret=0;
         if type in ('network',):
-            ret = 10
+            ret = 5
+        elif type in ('routetbl',):
+            ret = 6
+        elif type in ('ptlrpc',):
+            ret = 7
         elif type in ('device', 'ldlm'):
             ret = 20
         elif type in ('osd', 'mdd', 'cobd'):
@@ -1448,7 +1650,7 @@ class LustreDB:
 
     # Find the target_device for target on a node
     # node->profiles->device_refs->target
-    def get_target_device(self, node_name, target_uuid):
+    def get_target_device(self, target_uuid, node_name):
         node_db = self.lookup_name(node_name)
         if not node_db:
             return None
@@ -1462,6 +1664,17 @@ class LustreDB:
                     return ref[1]
         return None
 
+    def get_active_target(self):
+        target_uuid = self.getUUID()
+        target_name = self.getName()
+        node_name = config.select(target_name)
+        if node_name:
+            tgt_dev_uuid = self.get_target_device(target_uuid, node_name)
+        else:
+            tgt_dev_uuid = self.get_first_ref('active')
+        return tgt_dev_uuid
+        
+
     # get all network uuids for this node
     def get_networks(self):
         ret = []
@@ -1469,7 +1682,7 @@ class LustreDB:
         for prof_uuid in prof_list:
             prof_db = self.lookup(prof_uuid)
             net_list = prof_db.get_refs('network')
-            debug("get_networks():", prof_uuid, net_list)
+            #debug("get_networks():", prof_uuid, net_list)
             for net_uuid in net_list:
                 ret.append(net_uuid)
         return ret
@@ -1589,21 +1802,21 @@ class LustreDB_XML(LustreDB):
         for t in tbl:
             routes = t.getElementsByTagName('route')
             for r in routes:
-                lo = self.xmlattr(r, 'lo')
-                hi = self.xmlattr(r, 'hi')
-                res.append((type, gw, lo, hi))
+                net_type = self.xmlattr(r, 'type')
+                if type != net_type:
+                    lo = self.xmlattr(r, 'lo')
+                    hi = self.xmlattr(r, 'hi')
+                    res.append((type, gw, lo, hi))
         return res
 
     def get_route_tbl(self):
         ret = []
-        tbls = self.dom_node.getElementsByTagName('routetbl')
-        for tbl in tbls:
-            for r in tbl.getElementsByTagName('route'):
-                net_type = self.xmlattr(r, 'type')
-                gw = self.xmlattr(r, 'gw')
-                lo = self.xmlattr(r, 'lo')
-                hi = self.xmlattr(r, 'hi')
-                ret.append((net_type, gw, lo, hi))
+        for r in self.dom_node.getElementsByTagName('route'):
+            net_type = self.xmlattr(r, 'type')
+            gw = self.xmlattr(r, 'gw')
+            lo = self.xmlattr(r, 'lo')
+            hi = self.xmlattr(r, 'hi')
+            ret.append((net_type, gw, lo, hi))
         return ret
 
 
@@ -1738,14 +1951,14 @@ class LustreDB_LDAP(LustreDB):
 # OSC is no longer in the xml, so we have to fake it.
 # this is getting ugly and begging for another refactoring
 def get_osc(ost_db, owner):
-    osc = OSC(ost_db, owner, ost_db.getName(), ost_db.getUUID())
+    osc = OSC(ost_db, owner)
     return osc
 
 def get_mdc(db, owner, mds_uuid):
     mds_db = db.lookup(mds_uuid);
     if not mds_db:
         panic("no mds:", mds_uuid)
-    mdc = MDC(mds_db, owner, mds_db.getName(), mds_uuid)
+    mdc = MDC(mds_db, owner)
     return mdc
 
 def prepare_mdc(db, owner, mds_uuid):
@@ -1767,11 +1980,18 @@ router_flag = 0
 
 def add_local_interfaces(node_db):
     global local_node
-    debug("add_local")
     for netuuid in node_db.get_networks():
         net = node_db.lookup(netuuid)
+        srv = Network(net)
         debug("add_local", netuuid)
-        local_node.append((net.get_val('nettype'), net.get_val('nid')))
+        local_node.append((srv.net_type, srv.nid))
+        if acceptors.has_key(srv.port):
+            panic("duplicate port:", srv.port)
+        if srv.net_type in ('tcp', 'toe'):
+            acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
+                                                  srv.send_mem, srv.recv_mem,
+                                                  srv.irq_affinity,
+                                                  srv.nid_exchange)
 
 def node_needs_router():
     return router_flag
@@ -1797,34 +2017,40 @@ def init_route_config(lustre):
                 #debug("init_route_config: gw is", gw)
                 if not gw:
                     continue
-                for netuuid in node_db.get_networks():
-                    db = node_db.lookup(netuuid)
-                    #debug("init_route_config: tbl: ", db.get_route_tbl())
-                    if local_type != db.get_val('nettype'):
-                        for route in db.get_routes(local_type, gw):
-                            routes.append(route)
-    #debug("init_route_config routes:", routes)
+                for route in node_db.get_routes(local_type, gw):
+                    routes.append(route)
+    debug("init_route_config routes:", routes)
+
 
+def local_net(srv_list):
+    global local_node
+    for iface in local_node:
+        for srv in srv_list:
+            #debug("local_net a:", srv.net_type, "b:", iface[0])
+            if srv.net_type == iface[0]:
+                return srv
+    return None
 
-def local_net(net):
+def local_net_type(net_type):
     global local_node
     for iface in local_node:
-        #debug("local_net a:", net.net_type, "b:", iface[0])
-        if net.net_type == iface[0]:
+        if net_type == iface[0]:
             return 1
     return 0
 
-def find_route(net):
+def find_route(srv_list):
     global local_node, routes
     frm_type = local_node[0][0]
-    to_type = net.net_type
-    to = net.nid
-    debug ('looking for route to', to_type,to)
-    for r in routes:
-        #debug("find_route: ", r)
-        if  r[2] == to:
-            return r
-    return None
+    for srv in srv_list:
+        #debug("find_route: srv:", srv.hostaddr, "type: ", srv.net_type)
+        to_type = srv.net_type
+        to = srv.hostaddr
+        #debug ('looking for route to', to_type, to)
+        for r in routes:
+            #debug("find_route: ", r)
+            if  r[2] == to:
+                return srv, r
+    return None,None
            
 
 ############################################################
@@ -1836,10 +2062,14 @@ def newService(db):
     n = None
     if type == 'ldlm':
         n = LDLM(db)
+    elif type == 'ptlrpc':
+        n = PTLRPC(db)
     elif type == 'lov':
         n = LOV(db)
     elif type == 'network':
         n = Network(db)
+    elif type == 'routetbl':
+        n = Router(db)
     elif type == 'osd':
         n = OSD(db)
     elif type == 'cobd':
@@ -1871,22 +2101,30 @@ def for_each_profile(db, prof_list, operation):
         operation(services)
         
 def doSetup(services):
+    if config.nosetup():
+        return
     for s in services:
         n = newService(s[1])
         n.prepare()
     
 def doModules(services):
+    if config.nomod():
+        return
     for s in services:
         n = newService(s[1])
         n.load_module()
 
 def doCleanup(services):
+    if config.nosetup():
+        return
     services.reverse()
     for s in services:
         n = newService(s[1])
         n.cleanup()
 
 def doUnloadModules(services):
+    if config.nomod():
+        return
     services.reverse()
     for s in services:
         n = newService(s[1])
@@ -1910,8 +2148,8 @@ def doHost(lustreDB, hosts):
     recovery_upcall = node_db.get_val('recovery_upcall', '')
     timeout = node_db.get_val_int('timeout', 0)
 
+    add_local_interfaces(node_db)
     if not router_flag:
-        add_local_interfaces(node_db)
         init_route_config(lustreDB)
 
     # Two step process: (1) load modules, (2) setup lustre
@@ -1922,6 +2160,11 @@ def doHost(lustreDB, hosts):
         if config.force():
             # the command line can override this value
             timeout = 5
+        # ugly hack, only need to run lctl commands for --dump
+        if config.lctl_dump():
+            for_each_profile(node_db, prof_list, doCleanup)
+            return
+
         sys_set_timeout(timeout)
         sys_set_recovery_upcall(recovery_upcall)
 
@@ -1929,6 +2172,11 @@ def doHost(lustreDB, hosts):
         for_each_profile(node_db, prof_list, doUnloadModules)
 
     else:
+        # ugly hack, only need to run lctl commands for --dump
+        if config.lctl_dump():
+            for_each_profile(node_db, prof_list, doSetup)
+            return
+
         for_each_profile(node_db, prof_list, doModules)
 
         sys_set_debug_path()
@@ -1954,7 +2202,7 @@ def parse_cmdline(argv):
                  "help", "node=", "nomod", "nosetup",
                  "dump=", "force", "minlevel=", "maxlevel=",
                  "timeout=", "recovery_upcall=",
-                 "ldapurl=", "config=", "select="]
+                 "ldapurl=", "config=", "select=", "lctl_dump="]
     opts = []
     args = []
 
@@ -1973,7 +2221,6 @@ def parse_cmdline(argv):
             config.verbose(1)
         if o in ("-n", "--noexec"):
             config.noexec(1)
-            config.verbose(1)
         if o == "--portals":
             config.portals_dir(a)
         if o == "--lustre":
@@ -2006,6 +2253,8 @@ def parse_cmdline(argv):
                 config.config_name(a)
         if o == "--select":
                 config.init_select(a)
+        if o == "--lctl_dump":
+            config.lctl_dump(a)
 
     return args
 
@@ -2115,7 +2364,7 @@ def sanitise_path():
 # Shutdown does steps in reverse
 #
 def main():
-    global TCP_ACCEPTOR, lctl, MAXTCPBUF
+    global  lctl, MAXTCPBUF
 
     host = socket.gethostname()
 
@@ -2165,19 +2414,13 @@ def main():
 
     setupModulePath(sys.argv[0])
 
-    TCP_ACCEPTOR = find_prog('acceptor')
-    if not TCP_ACCEPTOR:
-        if config.noexec():
-            TCP_ACCEPTOR = 'acceptor'
-            debug('! acceptor not found')
-        else:
-            panic('acceptor not found')
-
     lctl = LCTLInterface('lctl')
-
-    sys_make_devices()
-    sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
-    sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
+    if config.lctl_dump():
+        lctl.use_save_file(config.lctl_dump())
+    else:
+        sys_make_devices()
+        sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
+        sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
 
     doHost(db, node_list)
 
@@ -2192,4 +2435,4 @@ if __name__ == "__main__":
 
     if first_cleanup_error:
         sys.exit(first_cleanup_error)
-
+        
index fb81dd3..1efbd8c 100644 (file)
@@ -26,6 +26,7 @@
 
 #include <stdlib.h>
 #include <stdio.h>
+#include <portals/api-support.h>
 #include <portals/ptlctl.h>
 #include "obdctl.h"
 #include "parser.h"
@@ -58,6 +59,7 @@ command_t cmdlist[] = {
         {"ignore_errors", jt_opt_ignore_errors, 0,
          "ignore errors that occur during script processing\n"
          "ignore_errors"},
+        {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"},
 
         /* Network configuration commands */
         {"==== network config ====", jt_noop, 0, "network config"},
@@ -71,11 +73,11 @@ command_t cmdlist[] = {
          "The nid defaults to hostname for tcp networks and is automatically "
          "setup for elan/myrinet networks.\n"
          "usage: mynid [nid]"},
-        {"add_uuid", jt_ptl_add_uuid, 0, "associate a UUID with a nid\n"
-         "usage: add_uuid <uuid> <nid>"},
-        {"close_uuid", jt_ptl_close_uuid, 0, "disconnect a UUID\n"
+        {"add_uuid", jt_obd_add_uuid, 0, "associate a UUID with a nid\n"
+         "usage: add_uuid <uuid> <nid> <net_type>"},
+        {"close_uuid", jt_obd_close_uuid, 0, "disconnect a UUID\n"
          "usage: close_uuid <uuid>)"},
-        {"del_uuid", jt_ptl_del_uuid, 0, "delete a UUID association\n"
+        {"del_uuid", jt_obd_del_uuid, 0, "delete a UUID association\n"
          "usage: del_uuid <uuid>"},
         {"add_route", jt_ptl_add_route, 0,
          "add an entry to the routing table\n"
@@ -152,7 +154,7 @@ command_t cmdlist[] = {
         {"setattr", jt_obd_setattr, 0,
          "set mode attribute for OST object <objid>\n"
          "usage: setattr <objid> <mode>"},
-        {"create", jt_obd_create, 0,
+         {"create", jt_obd_create, 0,
          "create <num> OST objects (with <mode>)\n"
          "usage: create [num [mode [verbose]]]"},
         {"destroy", jt_obd_destroy, 0,
index 1b75135..aac0e16 100644 (file)
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/types.h>
+
+
+#include <liblustre.h>
+#include <linux/obd.h>
 #include <linux/lustre_lib.h>
 #include <linux/lustre_lite.h>
 #include <linux/obd_lov.h>
index 69290cb..76757a7 100755 (executable)
@@ -43,10 +43,13 @@ Object creation command summary:
 
 --add net
   --node node_name
-  --nid addr
+  --nid nid
   --nettype tcp|elan|toe|gm
+  --hostaddr addr
   --port port
   --tcpbuf size
+  --irq_affinity 0|1
+  --nid_exchange 0|1
   --router
 
 --add mds
@@ -114,12 +117,18 @@ def new_uuid(name):
 
 ldlm_name = 'ldlm'
 ldlm_uuid = 'ldlm_UUID'
+
+ptlrpc_name = 'RPCDEV'
+ptlrpc_uuid = 'RPCDEV_UUID'
+
 def new_lustre(dom):
     """Create a new empty lustre document"""
     # adding ldlm here is a bit of a hack, but one is enough.
     str = """<lustre>
     <ldlm name="%s" uuid="%s"/>
-    </lustre>""" % (ldlm_name, ldlm_uuid)
+    <ptlrpc name="%s" uuid="%s"/>
+    </lustre>""" % (ldlm_name, ldlm_uuid,
+                    ptlrpc_name, ptlrpc_uuid)
     return dom.parseString(str)
 
 names = {}
@@ -178,19 +187,30 @@ class GenConfig:
         node.appendChild(new)
         return new
 
-    def network(self, name, uuid, hostname, net, port=0, tcpbuf=0):
+    def network(self, name, uuid, nid, net, hostaddr="", port=0, tcpbuf=0, irq_aff=0, nid_xchg=0):
         """create <network> node"""
         network = self.newService("network", name, uuid)
         network.setAttribute("nettype", net);
-        self.addElement(network, "nid", hostname)
+        self.addElement(network, "nid", nid)
+        if hostaddr:
+            self.addElement(network, "hostaddr", hostaddr)
         if port:
             self.addElement(network, "port", "%d" %(port))
         if tcpbuf:
             self.addElement(network, "sendmem", "%d" %(tcpbuf))
             self.addElement(network, "recvmem", "%d" %(tcpbuf))
+        if irq_aff:
+            self.addElement(network, "irqaffinity", "%d" %(irq_aff))
+        if nid_xchg:
+            self.addElement(network, "nidexchange", "%d" %(nid_xchg))
             
         return network
 
+    def routetbl(self, name, uuid):
+        """create <routetbl> node"""
+        rtbl = self.newService("routetbl", name, uuid)
+        return rtbl
+        
     def route(self, net_type, gw, lo, hi):
         """ create one entry for the route table """
         ref = self.doc.createElement('route')
@@ -217,11 +237,11 @@ class GenConfig:
         ldlm = self.newService("ldlm", name, uuid)
         return ldlm
 
-    def osd(self, name, uuid, fs, osdtype, devname, format, ost_uuid, net_uuid, dev_size=0):
+    def osd(self, name, uuid, fs, osdtype, devname, format, ost_uuid, node_uuid, dev_size=0):
         osd = self.newService("osd", name, uuid)
         osd.setAttribute('osdtype', osdtype)
         osd.appendChild(self.ref("target", ost_uuid))
-        osd.appendChild(self.ref("network", net_uuid))
+        osd.appendChild(self.ref("node", node_uuid))
         if fs:
             self.addElement(osd, "fstype", fs)
         if devname:
@@ -264,7 +284,7 @@ class GenConfig:
         mds.appendChild(self.ref("active",mdd_uuid))
         return mds
 
-    def mdsdev(self, name, uuid, fs, devname, format, net_uuid, node_uuid,
+    def mdsdev(self, name, uuid, fs, devname, format, node_uuid,
             mds_uuid, dev_size=0 ):
         mdd = self.newService("mdsdev", name, uuid)
         self.addElement(mdd, "fstype", fs)
@@ -272,7 +292,7 @@ class GenConfig:
         self.addElement(mdd, "autoformat", format)
         if dev_size:
                 self.addElement(mdd, "devsize", "%s" % (dev_size))
-        mdd.appendChild(self.ref("network", net_uuid))
+        mdd.appendChild(self.ref("node", node_uuid))
         mdd.appendChild(self.ref("target", mds_uuid))
         return mdd
 
@@ -393,6 +413,7 @@ def do_add_node(gen, lustre,  options, node_name):
     lustre.appendChild(profile)
 
     node_add_profile(gen, node, 'ldlm', ldlm_uuid)
+    node_add_profile(gen, node, 'ptlrpc', ptlrpc_uuid)
     if has_option(options, 'router'):
         node.setAttribute('router', '1')
     if has_option(options, 'timeout'):
@@ -418,14 +439,19 @@ def add_net(gen, lustre, options):
 
     node_name = get_option(options, 'node')
     nid = get_option(options, 'nid')
+    hostaddr = get_option(options, 'hostaddr', '')
     net_type = get_option(options, 'nettype')
 
     if net_type in ('tcp', 'toe'):
         port = get_option_int(options, 'port', DEFAULT_PORT)
         tcpbuf = get_option_int(options, 'tcpbuf', 0)
+        irq_aff = get_option_int(options, 'irq_affinity', 0)
+        nid_xchg = get_option_int(options, 'nid_exchange', 0)
     elif net_type in ('elan', 'gm'):
         port = 0
         tcpbuf = 0
+        irq_aff = 0
+        nid_xchg = 0
     else:
         print "Unknown net_type: ", net_type
         sys.exit(2)
@@ -437,7 +463,7 @@ def add_net(gen, lustre, options):
         node = ret
     net_name = new_name('NET_'+ node_name +'_'+ net_type)
     net_uuid = new_uuid(net_name)
-    node.appendChild(gen.network(net_name, net_uuid, nid, net_type, port, tcpbuf))
+    node.appendChild(gen.network(net_name, net_uuid, nid, net_type, hostaddr, port, tcpbuf, irq_aff, nid_xchg))
     node_add_profile(gen, node, "network", net_uuid)
 
 
@@ -454,13 +480,15 @@ def add_route(gen, lustre, options):
     if not node:
         error (node_name, " not found.")
     
-    netlist = node.getElementsByTagName('network')
-    net = netlist[0]
-    rlist = net.getElementsByTagName('routetbl')
+    rlist = node.getElementsByTagName('routetbl')
     if len(rlist) > 0:
         rtbl = rlist[0]
     else:
-        rtbl = gen.addElement(net, 'routetbl')
+        rtbl_name = new_name("RTBL_" + node_name)
+        rtbl_uuid = new_uuid(rtbl_name)
+        rtbl = gen.routetbl(rtbl_name, rtbl_uuid)
+        node.appendChild(rtbl)
+        node_add_profile(gen, node, "routetbl", rtbl_uuid)
     rtbl.appendChild(gen.route(net_type, gw, lo, hi))
 
 
@@ -489,7 +517,7 @@ def add_mds(gen, lustre, options):
         error("NODE: ", node_name, "not found")
 
     mdd = gen.mdsdev(mdd_name, mdd_uuid, fstype, devname, get_format_flag(options),
-                  net_uuid, node_uuid, mds_uuid, dev_size=size)
+                  node_uuid, mds_uuid, dev_size=size)
     lustre.appendChild(mdd)
                    
 
@@ -498,6 +526,8 @@ def add_ost(gen, lustre, options):
     lovname = get_option(options, 'lov', '')
     osdtype = get_option(options, 'osdtype', 'obdfilter', deprecated_tag="obdtype")
 
+    node_uuid = name2uuid(lustre, node_name)
+
     if osdtype == 'obdecho':
         fstype = ''
         devname = ''
@@ -532,12 +562,8 @@ def add_ost(gen, lustre, options):
                 error('add_ost:', '"'+lovname+'"', "lov element not found.")
             lov_add_obd(gen, lov, ost_uuid)
 
-    net_uuid = get_net_uuid(lustre, node_name)
-    if not net_uuid:
-        error("NODE: No net network interface for", node_name, "found")
-    
     osd = gen.osd(osdname, osd_uuid, fstype, osdtype, devname, get_format_flag(options), ost_uuid,
-                  net_uuid, size)
+                  node_uuid, size)
 
     node = findByName(lustre, node_name, "node")
 
@@ -705,7 +731,8 @@ def parse_cmdline(argv):
                  "dev=", "size=", "obd=", "ost=", "obdtype=", "osdtype=", "obduuid=", "in=",
                  "ostuuid=", "path=", "help", "batch=", "lov=", "gw=", "lo=", "hi=",
                  "osc=", "real_obd=", "cache_obd=", "fstype=",
-                 "timeout=", "recovery_upcall="]
+                 "timeout=", "recovery_upcall=", "nid_exchange=", "irq_affinity=",
+                 "hostaddr=",]
     opts = []
     args = []
     options = {}
@@ -743,6 +770,8 @@ def parse_cmdline(argv):
         # network options
         if o == "--nid":
             options['nid'] = a
+        if o == "--hostaddr":
+            options['hostaddr'] = a
         if o == "--nettype":
             options['nettype'] = a
         if o == "--net":
@@ -755,6 +784,10 @@ def parse_cmdline(argv):
             options['mtpt'] = 1
         if o == "--route":
             options['route'] = 1
+        if o == "--nid_exchange":
+            options['nid_exchange'] = a
+        if o == "--irq_affinity":
+            options['irq_affinity'] = a
 
         # ost options
         if o == "--dev":
index 8800b57..c4ecc42 100644 (file)
 #include <stdarg.h>
 #include <signal.h>
 
+#ifndef __KERNEL__
+#include <liblustre.h>
+#endif
 #include <linux/lustre_lib.h>
 #include <linux/lustre_idl.h>
 #include <linux/lustre_dlm.h>
-#include <linux/obd_lov.h>      /* for IOC_LOV_SET_OSC_ACTIVE */
 #include <linux/obd.h>          /* for struct lov_stripe_md */
+#include <linux/obd_lov.h>      /* for IOC_LOV_SET_OSC_ACTIVE */
 #include <linux/lustre_build_version.h>
 
 #include <unistd.h>
@@ -58,6 +61,7 @@
 #undef __KERNEL__
 
 #include "obdctl.h"
+#include <portals/ptlctl.h>
 #include "parser.h"
 #include <stdio.h>
 
@@ -72,7 +76,6 @@ static long long counter_snapshot[2][MAX_SHMEM_COUNT];
 struct timeval prev_time;
 #endif
 
-int fd = -1;
 uint64_t conn_addr = -1;
 uint64_t conn_cookie;
 char rawbuf[8192];
@@ -86,24 +89,18 @@ union lsm_buffer {
         struct lov_stripe_md lsm;
 } lsm_buffer;
 
-static int getfd(char *func);
 static char *cmdname(char *func);
 
-#define IOCINIT(data)                                                   \
+#define IOC_INIT(data)                                                  \
 do {                                                                    \
         memset(&data, 0, sizeof(data));                                 \
-        data.ioc_version = OBD_IOCTL_VERSION;                           \
         data.ioc_addr = conn_addr;                                      \
         data.ioc_cookie = conn_cookie;                                  \
-        data.ioc_len = sizeof(data);                                    \
-        if (fd < 0) {                                                   \
-                fprintf(stderr, "No device open, use device\n");        \
-                return 1;                                               \
-        }                                                               \
 } while (0)
 
 #define IOC_PACK(func, data)                                            \
 do {                                                                    \
+        memset(buf, 0, sizeof(rawbuf));                                 \
         if (obd_ioctl_pack(&data, &buf, max)) {                         \
                 fprintf(stderr, "error: %s: invalid ioctl\n",           \
                         cmdname(func));                                 \
@@ -145,16 +142,13 @@ static int do_name2dev(char *func, char *name)
         struct obd_ioctl_data data;
         int rc;
 
-        if (getfd(func))
-                return -1;
-
-        IOCINIT(data);
+        IOC_INIT(data);
 
         data.ioc_inllen1 = strlen(name) + 1;
         data.ioc_inlbuf1 = name;
 
         IOC_PACK(func, data);
-        rc = ioctl(fd, OBD_IOC_NAME2DEV, buf);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_NAME2DEV, buf);
         if (rc < 0) {
                 fprintf(stderr, "error: %s: %s - %s\n", cmdname(func),
                         name, strerror(rc = errno));
@@ -203,13 +197,13 @@ lsm_string (struct lov_stripe_md *lsm)
 
         *p = 0;
         space--;
-        
+
         nob = snprintf(p, space, LPX64, lsm->lsm_object_id);
         p += nob;
         space -= nob;
-        
+
         if (lsm->lsm_stripe_count != 0) {
-                nob = snprintf (p, space, "=%u#%u@%d", 
+                nob = snprintf (p, space, "=%u#%u@%d",
                                 lsm->lsm_stripe_size,
                                 lsm->lsm_stripe_count,
                                 lsm->lsm_stripe_offset);
@@ -228,52 +222,51 @@ lsm_string (struct lov_stripe_md *lsm)
                 fprintf (stderr, "lsm_string() overflowed buffer\n");
                 abort ();
         }
-        
+
         return (buffer);
 }
 
 static void
-reset_lsmb (union lsm_buffer *lsmb) 
+reset_lsmb (union lsm_buffer *lsmb)
 {
         memset (lsmb->space, 0, sizeof (lsmb->space));
         lsmb->lsm.lsm_magic = LOV_MAGIC;
-        
 }
 
-static int 
+static int
 parse_lsm (union lsm_buffer *lsmb, char *string)
 {
         struct lov_stripe_md *lsm = &lsmb->lsm;
         char                 *end;
         int                   i;
-        
+
         /*
-         * object_id[=size#count[@offset][:id]*] 
+         * object_id[=size#count[@offset][:id]*]
          */
 
         reset_lsmb (lsmb);
-        
+
         lsm->lsm_object_id = strtoull (string, &end, 0);
         if (end == string)
                 return (-1);
         string = end;
-        
+
         if (*string == 0)
                 return (0);
 
         if (*string != '=')
                 return (-1);
         string++;
-        
+
         lsm->lsm_stripe_size = strtoul (string, &end, 0);
         if (end == string)
                 return (-1);
         string = end;
-        
+
         if (*string != '#')
                 return (-1);
         string++;
-        
+
         lsm->lsm_stripe_count = strtoul (string, &end, 0);
         if (end == string)
                 return (-1);
@@ -286,10 +279,10 @@ parse_lsm (union lsm_buffer *lsmb, char *string)
                         return (-1);
                 string = end;
         }
-        
-        if (*string == 0)                       /* don't have to specify obj ids */
+
+        if (*string == 0)               /* don't have to specify obj ids */
                 return (0);
-        
+
         for (i = 0; i < lsm->lsm_stripe_count; i++) {
                 if (*string != ':')
                         return (-1);
@@ -300,7 +293,7 @@ parse_lsm (union lsm_buffer *lsmb, char *string)
 
         if (*string != 0)
                 return (-1);
-        
+
         return (0);
 }
 
@@ -316,19 +309,6 @@ static char *cmdname(char *func)
         return func;
 }
 
-static int getfd(char *func)
-{
-        if (fd == -1)
-                fd = open("/dev/obd", O_RDWR);
-        if (fd == -1) {
-                fprintf(stderr, "error: %s: opening /dev/obd: %s\n"
-                        "hint: lustre kernel modules may not be loaded.\n",
-                        cmdname(func), strerror(errno));
-                return -1;
-        }
-        return 0;
-}
-
 #define difftime(a, b)                                          \
         ((double)(a)->tv_sec - (b)->tv_sec +                    \
          ((double)((a)->tv_usec - (b)->tv_usec) / 1000000))
@@ -403,11 +383,12 @@ int do_disconnect(char *func, int verbose)
         if (conn_addr == -1)
                 return 0;
 
-        IOCINIT(data);
+        IOC_INIT(data);
 
-        rc = ioctl(fd, OBD_IOC_DISCONNECT, &data);
+        IOC_PACK(func, data);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_DISCONNECT, buf);
         if (rc < 0) {
-                fprintf(stderr, "error: %s: OPD_IOC_DISCONNECT %s\n", 
+                fprintf(stderr, "error: %s: OPD_IOC_DISCONNECT %s\n",
                         cmdname(func),strerror(errno));
         } else {
                 if (verbose)
@@ -503,10 +484,10 @@ static void shmem_snap(int n)
         prev_time = this_time;
 }
 
-#define SHMEM_SETUP()  shmem_setup()
-#define SHMEM_RESET()  shmem_reset()
-#define SHMEM_BUMP()   shmem_bump()
-#define SHMEM_SNAP(n)  shmem_snap(n)
+#define SHMEM_SETUP()   shmem_setup()
+#define SHMEM_RESET()   shmem_reset()
+#define SHMEM_BUMP()    shmem_bump()
+#define SHMEM_SNAP(n)   shmem_snap(n)
 #else
 #define SHMEM_SETUP()
 #define SHMEM_RESET()
@@ -524,11 +505,8 @@ static int do_device(char *func, int dev)
 
         data.ioc_dev = dev;
 
-        if (getfd(func))
-                return -1;
-
         IOC_PACK(func, data);
-        return ioctl(fd, OBD_IOC_DEVICE, buf);
+        return l_ioctl(OBD_DEV_ID, OBD_IOC_DEVICE, buf);
 }
 
 int jt_obd_device(int argc, char **argv)
@@ -556,7 +534,7 @@ int jt_obd_connect(int argc, char **argv)
         struct obd_ioctl_data data;
         int rc;
 
-        IOCINIT(data);
+        IOC_INIT(data);
 
         do_disconnect(argv[0], 1);
 
@@ -564,7 +542,9 @@ int jt_obd_connect(int argc, char **argv)
         if (argc != 1)
                 return CMD_HELP;
 
-        rc = ioctl(fd, OBD_IOC_CONNECT, &data);
+        IOC_PACK(argv[0], data);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_CONNECT, buf);
+        IOC_UNPACK(argv[0], data);
         if (rc < 0)
                 fprintf(stderr, "error: %s: OBD_IOC_CONNECT %s\n",
                         cmdname(argv[0]), strerror(rc = errno));
@@ -707,12 +687,13 @@ int jt_obd_detach(int argc, char **argv)
         struct obd_ioctl_data data;
         int rc;
 
-        IOCINIT(data);
+        IOC_INIT(data);
 
         if (argc != 1)
                 return CMD_HELP;
 
-        rc = ioctl(fd, OBD_IOC_DETACH, buf);
+        IOC_PACK(argv[0], data);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_DETACH, buf);
         if (rc < 0)
                 fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]),
                         strerror(rc = errno));
@@ -726,7 +707,7 @@ int jt_obd_cleanup(int argc, char **argv)
         char force = 'F';
         int rc;
 
-        IOCINIT(data);
+        IOC_INIT(data);
 
         if (argc != 1 && argc != 2)
                 return CMD_HELP;
@@ -739,7 +720,7 @@ int jt_obd_cleanup(int argc, char **argv)
         }
 
         IOC_PACK(argv[0], data);
-        rc = ioctl(fd, OBD_IOC_CLEANUP, buf);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_CLEANUP, buf);
         if (rc < 0)
                 fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]),
                         strerror(rc = errno));
@@ -752,12 +733,13 @@ int jt_obd_no_transno(int argc, char **argv)
         struct obd_ioctl_data data;
         int rc;
 
-        IOCINIT(data);
+        IOC_INIT(data);
 
         if (argc != 1)
                 return CMD_HELP;
 
-        rc = ioctl(fd, OBD_IOC_NO_TRANSNO, &data);
+        IOC_PACK(argv[0], data);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_NO_TRANSNO, buf);
         if (rc < 0)
                 fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]),
                         strerror(rc = errno));
@@ -770,12 +752,13 @@ int jt_obd_set_readonly(int argc, char **argv)
         struct obd_ioctl_data data;
         int rc;
 
-        IOCINIT(data);
+        IOC_INIT(data);
 
         if (argc != 1)
                 return CMD_HELP;
 
-        rc = ioctl(fd, OBD_IOC_SET_READONLY, &data);
+        IOC_PACK(argv[0], data);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_SET_READONLY, buf);
         if (rc < 0)
                 fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]),
                         strerror(rc = errno));
@@ -788,19 +771,18 @@ int jt_obd_newdev(int argc, char **argv)
         int rc;
         struct obd_ioctl_data data;
 
-        if (getfd(argv[0]))
-                return -1;
-
-        IOCINIT(data);
+        IOC_INIT(data);
 
         if (argc != 1)
                 return CMD_HELP;
 
-        rc = ioctl(fd, OBD_IOC_NEWDEV, &data);
+        IOC_PACK(argv[0], data);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_NEWDEV, buf);
         if (rc < 0)
                 fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]),
                         strerror(rc = errno));
         else {
+                IOC_UNPACK(argv[0], data);
                 printf("Current device set to %d\n", data.ioc_dev);
         }
 
@@ -813,28 +795,25 @@ int jt_get_version(int argc, char **argv)
         char buf[8192];
         struct obd_ioctl_data *data = (struct obd_ioctl_data *)buf;
 
-        if (getfd(argv[0]))
-                return -1;
+        if (argc != 1)
+                return CMD_HELP;
 
         memset(buf, 0, sizeof(buf));
         data->ioc_version = OBD_IOCTL_VERSION;
         data->ioc_addr = conn_addr;
         data->ioc_cookie = conn_addr;
-        data->ioc_len = sizeof(buf);
         data->ioc_inllen1 = sizeof(buf) - size_round(sizeof(*data));
+        data->ioc_len = obd_ioctl_packlen(data);
 
-        if (argc != 1)
-                return CMD_HELP;
-
-        rc = ioctl(fd, OBD_GET_VERSION, data);
+        rc = l_ioctl(OBD_DEV_ID, OBD_GET_VERSION, buf);
         if (rc < 0)
                 fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]),
                         strerror(rc = errno));
         else {
                 printf("Lustre version: %s\n", data->ioc_bulk);
         }
-        
-        printf("lctl version: %s\n",BUILD_VERSION);
+
+        printf("lctl   version: %s\n", BUILD_VERSION);
         return rc;
 }
 
@@ -844,20 +823,17 @@ int jt_obd_list(int argc, char **argv)
         char buf[8192];
         struct obd_ioctl_data *data = (struct obd_ioctl_data *)buf;
 
-        if (getfd(argv[0]))
-                return -1;
+        if (argc != 1)
+                return CMD_HELP;
 
         memset(buf, 0, sizeof(buf));
         data->ioc_version = OBD_IOCTL_VERSION;
         data->ioc_addr = conn_addr;
         data->ioc_cookie = conn_addr;
-        data->ioc_len = sizeof(buf);
         data->ioc_inllen1 = sizeof(buf) - size_round(sizeof(*data));
+        data->ioc_len = obd_ioctl_packlen(data);
 
-        if (argc != 1)
-                return CMD_HELP;
-
-        rc = ioctl(fd, OBD_IOC_LIST, data);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_LIST, data);
         if (rc < 0)
                 fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]),
                         strerror(rc = errno));
@@ -873,7 +849,7 @@ int jt_obd_attach(int argc, char **argv)
         struct obd_ioctl_data data;
         int rc;
 
-        IOCINIT(data);
+        IOC_INIT(data);
 
         if (argc != 2 && argc != 3 && argc != 4)
                 return CMD_HELP;
@@ -891,9 +867,9 @@ int jt_obd_attach(int argc, char **argv)
         }
 
         IOC_PACK(argv[0], data);
-        rc = ioctl(fd, OBD_IOC_ATTACH, buf);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_ATTACH, buf);
         if (rc < 0)
-                fprintf(stderr, "error: %s: OBD_IOC_ATTACH %s\n", 
+                fprintf(stderr, "error: %s: OBD_IOC_ATTACH %s\n",
                         cmdname(argv[0]), strerror(rc = errno));
         else if (argc == 3) {
                 char name[1024];
@@ -933,9 +909,9 @@ int jt_obd_setup(int argc, char **argv)
         struct obd_ioctl_data data;
         int rc;
 
-        IOCINIT(data);
+        IOC_INIT(data);
 
-        if (argc > 3)
+        if (argc > 4)
                 return CMD_HELP;
 
         data.ioc_dev = -1;
@@ -946,13 +922,17 @@ int jt_obd_setup(int argc, char **argv)
                 data.ioc_inllen1 = strlen(argv[1]) + 1;
                 data.ioc_inlbuf1 = argv[1];
         }
-        if (argc == 3) {
+        if (argc > 2) {
                 data.ioc_inllen2 = strlen(argv[2]) + 1;
                 data.ioc_inlbuf2 = argv[2];
         }
+        if (argc > 3) {
+                data.ioc_inllen3 = strlen(argv[3]) + 1;
+                data.ioc_inlbuf3 = argv[3];
+        }
 
         IOC_PACK(argv[0], data);
-        rc = ioctl(fd, OBD_IOC_SETUP, buf);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_SETUP, buf);
         if (rc < 0)
                 fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]),
                         strerror(rc = errno));
@@ -968,7 +948,7 @@ int jt_obd_get_stripe (int argc, char **argv)
         __u64 id;
         int   rc;
         char *end;
-        
+
         if (argc != 2)
                 return (CMD_HELP);
 
@@ -980,8 +960,8 @@ int jt_obd_get_stripe (int argc, char **argv)
         }
 
         memset (&lsm_buffer, 0, sizeof (lsm_buffer));
-        
-        IOCINIT (data);
+
+        IOC_INIT (data);
         data.ioc_obdo1.o_id = id;
         data.ioc_obdo1.o_mode = S_IFREG | 0644;
         data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLMODE;
@@ -989,17 +969,17 @@ int jt_obd_get_stripe (int argc, char **argv)
         data.ioc_plen1 = sizeof (lsm_buffer);
 
         IOC_PACK(argv[0], data);
-        rc = ioctl(fd, ECHO_IOC_GET_STRIPE, buf);
+        rc = l_ioctl(OBD_DEV_ID, ECHO_IOC_GET_STRIPE, buf);
         IOC_UNPACK(argv[0], data);
 
         if (rc != 0) {
-                fprintf (stderr, "Error: %s: rc %d(%s)\n", 
+                fprintf (stderr, "Error: %s: rc %d(%s)\n",
                          cmdname (argv[0]), rc, strerror (errno));
                 return (rc);
         }
-        
+
         printf ("%s\n", lsm_string (&lsm_buffer.lsm));
-        
+
         return (rc);
 }
 
@@ -1033,26 +1013,25 @@ int jt_obd_set_stripe (int argc, char **argv)
                 }
         }
 
-        for (i = 0; i < count; i++) 
-        {
-                IOCINIT (data);
+        for (i = 0; i < count; i++) {
+                IOC_INIT (data);
                 data.ioc_obdo1.o_id = lsm_buffer.lsm.lsm_object_id + i;
                 data.ioc_obdo1.o_mode = S_IFREG | 0644;
                 data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLMODE;
                 data.ioc_pbuf1 = (char *)&lsm_buffer;
                 data.ioc_plen1 = sizeof (lsm_buffer);
-                
+
                 IOC_PACK (argv[0], data);
-                rc = ioctl (fd, ECHO_IOC_SET_STRIPE, buf);
+                rc = l_ioctl (OBD_DEV_ID, ECHO_IOC_SET_STRIPE, buf);
                 IOC_UNPACK (argv[0], data);
-                
+
                 if (rc != 0) {
-                        fprintf (stderr, "Error: %s: rc %d(%s)\n", 
+                        fprintf (stderr, "Error: %s: rc %d(%s)\n",
                                  cmdname (argv[0]), rc, strerror (errno));
                         return (rc);
                 }
         }
-        
+
         return (0);
 }
 
@@ -1067,25 +1046,25 @@ int jt_obd_unset_stripe (int argc, char **argv)
 
         if (argc != 2)
                 return CMD_HELP;
-        
+
         id = strtoll (argv[1], &end, 0);
         if (*end == 0) {
                 fprintf (stderr, "error: %s: invalid object id '%s'\n",
                          cmdname (argv[0]), argv[1]);
                 return CMD_HELP;
         }
-                
-        IOCINIT (data);
+
+        IOC_INIT (data);
         data.ioc_obdo1.o_id = lsm_buffer.lsm.lsm_object_id;
         data.ioc_obdo1.o_mode = S_IFREG | 0644;
         data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLMODE;
-        
+
         IOC_PACK (argv[0], data);
-        rc = ioctl (fd, ECHO_IOC_SET_STRIPE, buf);
+        rc = l_ioctl (OBD_DEV_ID, ECHO_IOC_SET_STRIPE, buf);
         IOC_UNPACK (argv[0], data);
-        
+
         if (rc != 0)
-                fprintf (stderr, "Error: %s: rc %d(%s)\n", 
+                fprintf (stderr, "Error: %s: rc %d(%s)\n",
                          cmdname (argv[0]), rc, strerror (errno));
 
         return (0);
@@ -1106,7 +1085,7 @@ int jt_obd_create(int argc, char **argv)
         int verbose = 1, mode = 0100644, rc = 0, i;
         char *end;
 
-        IOCINIT(data);
+        IOC_INIT(data);
         if (argc < 2 || argc > 5)
                 return CMD_HELP;
 
@@ -1145,7 +1124,7 @@ int jt_obd_create(int argc, char **argv)
                 }
                 base_id = lsm_buffer.lsm.lsm_object_id;
         }
-                
+
         printf("%s: "LPD64" objects\n", cmdname(argv[0]), count);
         gettimeofday(&next_time, NULL);
         next_time.tv_sec -= verbose;
@@ -1162,7 +1141,7 @@ int jt_obd_create(int argc, char **argv)
                 data.ioc_pbuf1 = (char *)&lsm_buffer;
 
                 IOC_PACK(argv[0], data);
-                rc = ioctl(fd, OBD_IOC_CREATE, buf);
+                rc = l_ioctl(OBD_DEV_ID, OBD_IOC_CREATE, buf);
                 IOC_UNPACK(argv[0], data);
                 SHMEM_BUMP();
                 if (rc < 0) {
@@ -1179,7 +1158,7 @@ int jt_obd_create(int argc, char **argv)
 
                 if (be_verbose(verbose, &next_time, i, &next_count, count))
                         printf("%s: #%d is object id "LPX64"\n",
-                               cmdname(argv[0]), i, data.ioc_obdo1.o_id);
+                                cmdname(argv[0]), i, data.ioc_obdo1.o_id);
         }
         return rc;
 }
@@ -1190,7 +1169,7 @@ int jt_obd_setattr(int argc, char **argv)
         char *end;
         int rc;
 
-        IOCINIT(data);
+        IOC_INIT(data);
         if (argc != 2)
                 return CMD_HELP;
 
@@ -1209,7 +1188,7 @@ int jt_obd_setattr(int argc, char **argv)
         data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE;
 
         IOC_PACK(argv[0], data);
-        rc = ioctl(fd, OBD_IOC_SETATTR, buf);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_SETATTR, buf);
         if (rc < 0)
                 fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]),
                         strerror(rc = errno));
@@ -1227,7 +1206,7 @@ int jt_obd_destroy(int argc, char **argv)
         char *end;
         int rc = 0, i;
 
-        IOCINIT(data);
+        IOC_INIT(data);
         if (argc < 2 || argc > 4)
                 return CMD_HELP;
 
@@ -1263,7 +1242,7 @@ int jt_obd_destroy(int argc, char **argv)
                 data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLMODE;
 
                 IOC_PACK(argv[0], data);
-                rc = ioctl(fd, OBD_IOC_DESTROY, buf);
+                rc = l_ioctl(OBD_DEV_ID, OBD_IOC_DESTROY, buf);
                 IOC_UNPACK(argv[0], data);
                 SHMEM_BUMP();
                 if (rc < 0) {
@@ -1289,7 +1268,7 @@ int jt_obd_getattr(int argc, char **argv)
         if (argc != 2)
                 return CMD_HELP;
 
-        IOCINIT(data);
+        IOC_INIT(data);
         data.ioc_obdo1.o_id = strtoull(argv[1], &end, 0);
         if (*end) {
                 fprintf(stderr, "error: %s: invalid objid '%s'\n",
@@ -1302,7 +1281,7 @@ int jt_obd_getattr(int argc, char **argv)
         printf("%s: object id "LPX64"\n", cmdname(argv[0]),data.ioc_obdo1.o_id);
 
         IOC_PACK(argv[0], data);
-        rc = ioctl(fd, OBD_IOC_GETATTR, buf);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_GETATTR, buf);
         IOC_UNPACK(argv[0], data);
         if (rc) {
                 fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]),
@@ -1327,7 +1306,7 @@ int jt_obd_test_getattr(int argc, char **argv)
         if (argc < 2 && argc > 4)
                 return CMD_HELP;
 
-        IOCINIT(data);
+        IOC_INIT(data);
         count = strtoull(argv[1], &end, 0);
         if (*end) {
                 fprintf(stderr, "error: %s: invalid iteration count '%s'\n",
@@ -1366,7 +1345,8 @@ int jt_obd_test_getattr(int argc, char **argv)
                 data.ioc_obdo1.o_id = objid;
                 data.ioc_obdo1.o_mode = S_IFREG;
                 data.ioc_obdo1.o_valid = 0xffffffff;
-                rc = ioctl(fd, OBD_IOC_GETATTR, &data);
+                IOC_PACK(argv[0], data);
+                rc = l_ioctl(OBD_DEV_ID, OBD_IOC_GETATTR, &data);
                 SHMEM_BUMP();
                 if (rc < 0) {
                         fprintf(stderr, "error: %s: #"LPD64" - %d:%s\n",
@@ -1468,7 +1448,7 @@ int jt_obd_test_brw(int argc, char **argv)
 
         len = pages * PAGE_SIZE;
 
-        IOCINIT(data);
+        IOC_INIT(data);
         data.ioc_obdo1.o_id = objid;
         data.ioc_obdo1.o_mode = S_IFREG;
         data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE;
@@ -1485,7 +1465,7 @@ int jt_obd_test_brw(int argc, char **argv)
                        pages, objid, data.ioc_offset, ctime(&start.tv_sec));
 
         IOC_PACK(argv[0], data);
-        rc = ioctl(fd, OBD_IOC_OPEN, buf);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_OPEN, buf);
         IOC_UNPACK(argv[0], data);
         if (rc) {
                 fprintf(stderr, "error: brw_open: %s\n", strerror(rc = errno));
@@ -1494,7 +1474,7 @@ int jt_obd_test_brw(int argc, char **argv)
 
         rw = write ? OBD_IOC_BRW_WRITE : OBD_IOC_BRW_READ;
         for (i = 1, next_count = verbose; i <= count; i++) {
-                rc = ioctl(fd, rw, buf);
+                rc = l_ioctl(OBD_DEV_ID, rw, buf);
                 SHMEM_BUMP();
                 if (rc) {
                         fprintf(stderr, "error: %s: #%d - %s on %s\n",
@@ -1523,7 +1503,7 @@ int jt_obd_test_brw(int argc, char **argv)
                                i, pages, diff, (double)i * pages / diff,
                                ctime(&end.tv_sec));
         }
-        rw = ioctl(fd, OBD_IOC_CLOSE, buf);
+        rw = l_ioctl(OBD_DEV_ID, OBD_IOC_CLOSE, buf);
         if (rw) {
                 fprintf(stderr, "error: brw_close: %s\n", strerror(rw = errno));
                 if (!rc)
@@ -1541,7 +1521,7 @@ int jt_obd_lov_setconfig(int argc, char **argv)
         int rc, i;
         char *end;
 
-        IOCINIT(data);
+        IOC_INIT(data);
 
         if (argc <= 6)
                 return CMD_HELP;
@@ -1634,7 +1614,7 @@ int jt_obd_lov_setconfig(int argc, char **argv)
                 rc = -EINVAL;
                 goto out;
         }
-        rc = ioctl(fd, OBD_IOC_LOV_SET_CONFIG, buf);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_LOV_SET_CONFIG, buf);
         if (rc)
                 fprintf(stderr, "error: %s: ioctl error: %s\n",
                         cmdname(argv[0]), strerror(rc = errno));
@@ -1651,20 +1631,16 @@ int jt_obd_lov_getconfig(int argc, char **argv)
         struct lov_desc desc;
         struct obd_uuid *uuidarray;
         char *path;
-        int rc, tmpfd;
+        int rc, fd;
 
-        /* FIXME: ug.  IOCINIT checks fd. */
-        tmpfd = fd;
-        fd = 1;
-        IOCINIT(data);
-        fd = tmpfd;        
+        IOC_INIT(data);
 
         if (argc != 2)
                 return CMD_HELP;
 
         path = argv[1];
-        tmpfd = open(path, O_RDONLY);
-        if (tmpfd < 0) {
+        fd = open(path, O_RDONLY);
+        if (fd < 0) {
                 fprintf(stderr, "open \"%s\" failed: %s\n", path,
                         strerror(errno));
                 return -1;
@@ -1692,7 +1668,7 @@ repeat:
                 rc = -EINVAL;
                 goto out;
         }
-        rc = ioctl(tmpfd, OBD_IOC_LOV_GET_CONFIG, buf);
+        rc = ioctl(fd, OBD_IOC_LOV_GET_CONFIG, buf);
         if (rc == -ENOSPC) {
                 free(uuidarray);
                 goto repeat;
@@ -1722,7 +1698,7 @@ repeat:
         }
 out:
         free(uuidarray);
-        close(tmpfd);
+        close(fd);
         return rc;
 }
 
@@ -1731,11 +1707,12 @@ int jt_obd_test_ldlm(int argc, char **argv)
         struct obd_ioctl_data data;
         int rc;
 
-        IOCINIT(data);
+        IOC_INIT(data);
         if (argc != 1)
                 return CMD_HELP;
 
-        rc = ioctl(fd, IOC_LDLM_TEST, &data);
+        IOC_PACK(argv[0], data);
+        rc = l_ioctl(OBD_DEV_ID, IOC_LDLM_TEST, buf);
         if (rc)
                 fprintf(stderr, "error: %s: test failed: %s\n",
                         cmdname(argv[0]), strerror(rc = errno));
@@ -1747,11 +1724,12 @@ int jt_obd_dump_ldlm(int argc, char **argv)
         struct obd_ioctl_data data;
         int rc;
 
-        IOCINIT(data);
+        IOC_INIT(data);
         if (argc != 1)
                 return CMD_HELP;
 
-        rc = ioctl(fd, IOC_LDLM_DUMP, &data);
+        IOC_PACK(argv[0], data);
+        rc = l_ioctl(OBD_DEV_ID, IOC_LDLM_DUMP, buf);
         if (rc)
                 fprintf(stderr, "error: %s failed: %s\n",
                         cmdname(argv[0]), strerror(rc = errno));
@@ -1765,7 +1743,7 @@ int jt_obd_ldlm_regress_start(int argc, char **argv)
         char argstring[200];
         int i, count = sizeof(argstring) - 1;
 
-        IOCINIT(data);
+        IOC_INIT(data);
         if (argc > 5)
                 return CMD_HELP;
 
@@ -1783,7 +1761,7 @@ int jt_obd_ldlm_regress_start(int argc, char **argv)
         }
 
         IOC_PACK(argv[0], data);
-        rc = ioctl(fd, IOC_LDLM_REGRESS_START, buf);
+        rc = l_ioctl(OBD_DEV_ID, IOC_LDLM_REGRESS_START, buf);
         if (rc)
                 fprintf(stderr, "error: %s: test failed: %s\n",
                         cmdname(argv[0]), strerror(rc = errno));
@@ -1795,12 +1773,13 @@ int jt_obd_ldlm_regress_stop(int argc, char **argv)
 {
         int rc;
         struct obd_ioctl_data data;
-        IOCINIT(data);
+        IOC_INIT(data);
 
         if (argc != 1)
                 return CMD_HELP;
 
-        rc = ioctl(fd, IOC_LDLM_REGRESS_STOP, &data);
+        IOC_PACK(argv[0], data);
+        rc = l_ioctl(OBD_DEV_ID, IOC_LDLM_REGRESS_STOP, buf);
 
         if (rc)
                 fprintf(stderr, "error: %s: test failed: %s\n",
@@ -1813,7 +1792,7 @@ int jt_obd_lov_set_osc_active(int argc, char **argv)
         struct obd_ioctl_data data;
         int rc;
 
-        IOCINIT(data);
+        IOC_INIT(data);
         if (argc != 3)
                 return CMD_HELP;
 
@@ -1824,7 +1803,7 @@ int jt_obd_lov_set_osc_active(int argc, char **argv)
         data.ioc_offset = atoi(argv[2]);
 
         IOC_PACK(argv[0], data);
-        rc = ioctl(fd, IOC_LOV_SET_OSC_ACTIVE, buf);
+        rc = l_ioctl(OBD_DEV_ID, IOC_LOV_SET_OSC_ACTIVE, buf);
         if (rc)
                 fprintf(stderr, "error: %s: failed: %s\n",
                         cmdname(argv[0]), strerror(rc = errno));
@@ -1837,7 +1816,7 @@ int jt_obd_newconn(int argc, char **argv)
         int rc;
         struct obd_ioctl_data data;
 
-        IOCINIT(data);
+        IOC_INIT(data);
         if (argc < 2 || argc > 3)
                 return CMD_HELP;
 
@@ -1850,7 +1829,7 @@ int jt_obd_newconn(int argc, char **argv)
         }
 
         IOC_PACK(argv[0], data);
-        rc = ioctl(fd, OBD_IOC_RECOVD_NEWCONN, buf);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_RECOVD_NEWCONN, buf);
         if (rc < 0)
                 fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]),
                         strerror(rc = errno));
@@ -1863,7 +1842,7 @@ int jt_obd_failconn(int argc, char **argv)
         int rc;
         struct obd_ioctl_data data;
 
-        IOCINIT(data);
+        IOC_INIT(data);
         if (argc < 2)
                 return CMD_HELP;
 
@@ -1871,11 +1850,11 @@ int jt_obd_failconn(int argc, char **argv)
         data.ioc_inlbuf1 = argv[1];
 
         IOC_PACK(argv[0], data);
-        rc = ioctl(fd, OBD_IOC_RECOVD_FAILCONN, buf);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_RECOVD_FAILCONN, buf);
         if (rc < 0)
                 fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]),
                         strerror(rc = errno));
-        
+
         return rc;
 }
 
@@ -1883,7 +1862,7 @@ int jt_obd_mdc_lookup(int argc, char **argv)
 {
         struct obd_ioctl_data data;
         char *parent, *child;
-        int rc, tmpfd, verbose = 1;
+        int rc, fd, verbose = 1;
 
         if (argc < 3 || argc > 4)
                 return CMD_HELP;
@@ -1893,30 +1872,26 @@ int jt_obd_mdc_lookup(int argc, char **argv)
         if (argc == 4)
                 verbose = get_verbose(argv[0], argv[3]);
 
-        /* FIXME: ug.  IOCINIT checks fd. */
-        tmpfd = fd;
-        fd = 1;
-        IOCINIT(data);
-        fd = tmpfd;        
+        IOC_INIT(data);
 
         data.ioc_inllen1 = strlen(child) + 1;
         data.ioc_inlbuf1 = child;
 
         IOC_PACK(argv[0], data);
 
-        tmpfd = open(parent, O_RDONLY);
-        if (tmpfd < 0) {
+        fd = open(parent, O_RDONLY);
+        if (fd < 0) {
                 fprintf(stderr, "open \"%s\" failed: %s\n", parent,
                         strerror(errno));
                 return -1;
         }
 
-        rc = ioctl(tmpfd, IOC_MDC_LOOKUP, buf);
+        rc = ioctl(fd, IOC_MDC_LOOKUP, buf);
         if (rc < 0) {
                 fprintf(stderr, "error: %s: ioctl error: %s\n",
                         cmdname(argv[0]), strerror(rc = errno));
         }
-        close(tmpfd);
+        close(fd);
 
         if (verbose) {
                 IOC_UNPACK(argv[0], data);
@@ -1928,6 +1903,116 @@ int jt_obd_mdc_lookup(int argc, char **argv)
         return rc;
 }
 
+/* Pack @uuid, @nid and @nal into an obd_ioctl_data and issue OBD_IOC_ADD_UUID.
+ * Prints the uuid/NID pair and returns 0 on success, -1 if the ioctl fails. */
+static int do_add_uuid(char *func, char *uuid, ptl_nid_t nid, int nal)
+{
+        struct obd_ioctl_data data;
+        char tmp[64];           /* buffer handed to ptl_nid2str() */
+        int rc;
+
+        IOC_INIT(data);
+        data.ioc_nid = nid;
+        data.ioc_inllen1 = strlen(uuid) + 1;    /* length includes the NUL */
+        data.ioc_inlbuf1 = uuid;
+        data.ioc_nal = nal;
+
+        IOC_PACK(func, data);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_ADD_UUID, buf);
+        if (rc) {
+                fprintf(stderr, "OBD_IOC_ADD_UUID failed: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+        printf ("Added uuid %s: %s\n", uuid, ptl_nid2str (tmp, nid));
+        return 0;
+}
+
+/* Command handler: argv[1] = uuid, argv[2] = NID string, argv[3] = NAL name.
+ * Parses the NID and NAL, then passes everything to do_add_uuid(). */
+int jt_obd_add_uuid(int argc, char **argv)
+{
+        ptl_nid_t nid = 0;
+        int nal;
+
+        if (argc != 4)
+                return CMD_HELP;
+
+        if (ptl_parse_nid (&nid, argv[2]) != 0) {
+                fprintf (stderr, "Can't parse NID %s\n", argv[2]);
+                return (-1);
+        }
+
+        /* a ptl_name2nal() result of 0 is rejected as an unparsable NAL */
+        nal = ptl_name2nal(argv[3]);
+        if (nal == 0) {
+                fprintf (stderr, "Can't parse NAL %s\n", argv[3]);
+                return -1;
+        }
+        return do_add_uuid(argv[0], argv[1], nid, nal);
+}
+
+/* Command handler: argv[1] = uuid, argv[2] = NAL name.  Returns 0 on success
+ * or wrong argc (usage printed); -1 on bad NAL or failed OBD_IOC_CLOSE_UUID. */
+int jt_obd_close_uuid(int argc, char **argv)
+{
+        int rc, nal;
+        struct obd_ioctl_data data;
+
+        if (argc != 3) {
+                fprintf(stderr, "usage: %s <uuid> <nal>\n", argv[0]);
+                return 0;
+        }
+
+        nal = ptl_name2nal(argv[2]);
+        if (nal == 0) {
+                fprintf (stderr, "Can't parse NAL %s\n", argv[2]);
+                return -1;
+        }
+
+        IOC_INIT(data);
+        data.ioc_inllen1 = strlen(argv[1]) + 1;
+        data.ioc_inlbuf1 = argv[1];
+        data.ioc_nal = nal;
+        IOC_PACK(argv[0], data);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_CLOSE_UUID, buf);
+        if (rc) {
+                fprintf(stderr, "OBD_IOC_CLOSE_UUID failed: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+        return 0;
+}
+
+
+/* Command handler: argv[1] = uuid, or "_all_" to send the ioctl with no uuid.
+ * Returns 0 on success or wrong argc, -1 if OBD_IOC_DEL_UUID fails. */
+int jt_obd_del_uuid(int argc, char **argv)
+{
+        int rc;
+        struct obd_ioctl_data data;
+
+        if (argc != 2) {
+                fprintf(stderr, "usage: %s <uuid>\n", argv[0]);
+                return 0;
+        }
+
+        IOC_INIT(data);
+        /* "_all_" leaves inlbuf1/inllen1 as zeroed by IOC_INIT */
+        if (strcmp (argv[1], "_all_") != 0) {
+                data.ioc_inllen1 = strlen(argv[1]) + 1;
+                data.ioc_inlbuf1 = argv[1];
+        }
+        IOC_PACK(argv[0], data);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_DEL_UUID, buf);
+        if (rc) {
+                fprintf(stderr, "OBD_IOC_DEL_UUID failed: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+        return 0;
+}
+
 static void signal_server(int sig)
 {
         if (sig == SIGINT) {
@@ -1940,6 +2025,8 @@ static void signal_server(int sig)
 int obd_initialize(int argc, char **argv)
 {
         SHMEM_SETUP();
+        register_ioc_dev(OBD_DEV_ID, OBD_DEV_PATH);
+
         return 0;
 }
 
index 911ab5f..3363824 100644 (file)
@@ -26,6 +26,7 @@
 #include <errno.h>
 #include <string.h>
 
+#include <liblustre.h>
 #include "obdiolib.h"
 
 int
index b8c210c..f0e1a97 100644 (file)
@@ -27,7 +27,7 @@
 #define _OBDCTL_H_
 
 int do_disconnect(char *func, int verbose);
-int obd_initialize(int argc, char **argv);
+ int obd_initialize(int argc, char **argv);
 void obd_cleanup(int argc, char **argv);
 
 int jt_opt_device(int argc, char **argv);
@@ -65,5 +65,8 @@ int jt_obd_newconn(int argc, char **argv);
 int jt_obd_failconn(int argc, char **argv);
 int jt_obd_mdc_lookup(int argc, char **argv);
 int jt_get_version(int argc, char **argv);
+int jt_obd_add_uuid(int argc, char **argv);
+int jt_obd_close_uuid(int argc, char **argv);
+int jt_obd_del_uuid(int argc, char **argv);
 
 #endif
index ccee788..65a4cac 100644 (file)
@@ -26,6 +26,7 @@
 #include <errno.h>
 #include <string.h>
 
+#include <liblustre.h>
 #include "obdiolib.h"
 
 int
index ef95055..0404808 100644 (file)
@@ -30,6 +30,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include <liblustre.h>
 #include "obdiolib.h"
 
 void
index 1e23a31..01085b9 100644 (file)
@@ -154,6 +154,7 @@ do_stat (void)
                printf ("\n");
        }
 
+       fflush(stdout); 
        last = timenow();
 }