Whamcloud - gitweb
land b_eq on HEAD
author  rread <rread>
Sat, 14 Feb 2004 03:16:22 +0000 (03:16 +0000)
committer  rread <rread>
Sat, 14 Feb 2004 03:16:22 +0000 (03:16 +0000)
168 files changed:
lnet/archdep.m4
lnet/include/lnet/errno.h
lnet/include/lnet/lib-lnet.h
lnet/include/lnet/lib-nal.h
lnet/include/lnet/lib-p30.h
lnet/include/lnet/lib-types.h
lnet/include/lnet/lnet.h
lnet/include/lnet/p30.h
lnet/include/lnet/types.h
lnet/klnds/gmlnd/gmlnd.h
lnet/klnds/gmlnd/gmlnd_cb.c
lnet/klnds/gmlnd/gmlnd_comm.c
lnet/klnds/iblnd/ibnal_cb.c
lnet/klnds/qswlnd/qswlnd_cb.c
lnet/klnds/scimaclnd/scimacnal_cb.c
lnet/klnds/socklnd/socklnd.c
lnet/klnds/socklnd/socklnd_cb.c
lnet/libcfs/module.c
lnet/lnet/Makefile.am
lnet/lnet/api-eq.c
lnet/lnet/api-errno.c
lnet/lnet/api-ni.c
lnet/lnet/api-wrap.c
lnet/lnet/lib-init.c
lnet/lnet/lib-md.c
lnet/lnet/lib-move.c
lnet/lnet/lib-msg.c
lnet/ulnds/Makefile.am
lnet/ulnds/bridge.h
lnet/ulnds/connection.c
lnet/ulnds/connection.h
lnet/ulnds/procapi.c
lnet/ulnds/procbridge.h
lnet/ulnds/proclib.c
lnet/ulnds/select.c
lnet/ulnds/socklnd/Makefile.am
lnet/ulnds/socklnd/bridge.h
lnet/ulnds/socklnd/connection.c
lnet/ulnds/socklnd/connection.h
lnet/ulnds/socklnd/procapi.c
lnet/ulnds/socklnd/procbridge.h
lnet/ulnds/socklnd/proclib.c
lnet/ulnds/socklnd/select.c
lnet/ulnds/socklnd/tcplnd.c
lnet/ulnds/tcplnd.c
lnet/utils/Makefile.am
lnet/utils/l_ioctl.c
lnet/utils/portals.c
lustre/ChangeLog
lustre/Makefile.am
lustre/configure.in
lustre/include/liblustre.h
lustre/include/linux/lustre_dlm.h
lustre/include/linux/lustre_export.h
lustre/include/linux/lustre_idl.h
lustre/include/linux/lustre_net.h
lustre/include/linux/obd.h
lustre/kernel_patches/patches/2.6.0-test6-mm4.patch
lustre/kernel_patches/patches/bproc-patch-2.4.20
lustre/kernel_patches/patches/ext3-xattr-ptr-arith-fix.patch
lustre/ldlm/Makefile.am
lustre/ldlm/ldlm_lib.c
lustre/ldlm/ldlm_lock.c
lustre/ldlm/ldlm_lockd.c
lustre/liblustre/Makefile.am
lustre/liblustre/dir.c [new file with mode: 0644]
lustre/liblustre/file.c
lustre/liblustre/genlib.sh
lustre/liblustre/libtest.c [deleted file]
lustre/liblustre/llite_lib.c
lustre/liblustre/llite_lib.h
lustre/liblustre/namei.c
lustre/liblustre/rw.c
lustre/liblustre/super.c
lustre/liblustre/tests/.cvsignore [new file with mode: 0644]
lustre/liblustre/tests/Makefile.am [new file with mode: 0644]
lustre/liblustre/tests/echo_test.c [new file with mode: 0644]
lustre/liblustre/tests/recovery_small.c [moved from lustre/liblustre/recovery_small.c with 99% similarity]
lustre/liblustre/tests/replay_ost_single.c [new file with mode: 0644]
lustre/liblustre/tests/replay_single.c [moved from lustre/liblustre/replay_single.c with 99% similarity, mode: 0644]
lustre/liblustre/tests/sanity.c [moved from lustre/liblustre/lltest.c with 81% similarity]
lustre/liblustre/tests/test_common.c [moved from lustre/liblustre/test_common.c with 91% similarity]
lustre/liblustre/tests/test_common.h [moved from lustre/liblustre/test_common.h with 91% similarity]
lustre/liblustre/tests/test_lock_cancel.c [moved from lustre/liblustre/test_lock_cancel.c with 100% similarity]
lustre/llite/llite_lib.c
lustre/lov/Makefile.am
lustre/lvfs/Makefile.am
lustre/mdc/Makefile.am
lustre/mdc/mdc_request.c
lustre/mds/handler.c
lustre/mds/mds_internal.h
lustre/mds/mds_log.c
lustre/mds/mds_open.c
lustre/mds/mds_reint.c
lustre/mds/mds_unlink_open.c
lustre/mgmt/mgmt_svc.c
lustre/obdclass/Makefile.am
lustre/obdclass/class_obd.c
lustre/obdclass/genops.c
lustre/obdclass/llog_lvfs.c
lustre/obdclass/lprocfs_status.c
lustre/obdclass/obd_config.c
lustre/obdecho/Makefile.am
lustre/obdecho/echo.c
lustre/obdecho/echo_client.c
lustre/osc/Makefile.am
lustre/osc/osc_internal.h
lustre/osc/osc_request.c
lustre/ost/ost_handler.c
lustre/portals/archdep.m4
lustre/portals/include/portals/errno.h
lustre/portals/include/portals/lib-nal.h
lustre/portals/include/portals/lib-p30.h
lustre/portals/include/portals/lib-types.h
lustre/portals/include/portals/p30.h
lustre/portals/include/portals/types.h
lustre/portals/knals/gmnal/gmnal.h
lustre/portals/knals/gmnal/gmnal_cb.c
lustre/portals/knals/gmnal/gmnal_comm.c
lustre/portals/knals/ibnal/ibnal_cb.c
lustre/portals/knals/qswnal/qswnal_cb.c
lustre/portals/knals/scimacnal/scimacnal_cb.c
lustre/portals/knals/socknal/socknal.c
lustre/portals/knals/socknal/socknal_cb.c
lustre/portals/libcfs/module.c
lustre/portals/portals/Makefile.am
lustre/portals/portals/api-eq.c
lustre/portals/portals/api-errno.c
lustre/portals/portals/api-ni.c
lustre/portals/portals/api-wrap.c
lustre/portals/portals/lib-init.c
lustre/portals/portals/lib-md.c
lustre/portals/portals/lib-move.c
lustre/portals/portals/lib-msg.c
lustre/portals/unals/Makefile.am
lustre/portals/unals/bridge.h
lustre/portals/unals/connection.c
lustre/portals/unals/connection.h
lustre/portals/unals/procapi.c
lustre/portals/unals/procbridge.h
lustre/portals/unals/proclib.c
lustre/portals/unals/select.c
lustre/portals/unals/tcpnal.c
lustre/portals/utils/Makefile.am
lustre/portals/utils/l_ioctl.c
lustre/portals/utils/portals.c
lustre/ptlbd/rpc.c
lustre/ptlbd/server.c
lustre/ptlrpc/Makefile.am
lustre/ptlrpc/client.c
lustre/ptlrpc/events.c
lustre/ptlrpc/import.c
lustre/ptlrpc/llog_net.c
lustre/ptlrpc/llog_server.c
lustre/ptlrpc/lproc_ptlrpc.c
lustre/ptlrpc/niobuf.c
lustre/ptlrpc/pack_generic.c
lustre/ptlrpc/pinger.c
lustre/ptlrpc/ptlrpc_internal.h
lustre/ptlrpc/ptlrpc_module.c
lustre/ptlrpc/ptlrpcd.c
lustre/ptlrpc/recover.c
lustre/ptlrpc/service.c
lustre/scripts/lustre.spec.in
lustre/tests/conf-sanity.sh
lustre/tests/replay-ost-single.sh
lustre/utils/Makefile.am
lustre/utils/lrun

index e955c33..c06bc8a 100644 (file)
@@ -333,6 +333,7 @@ AC_SUBST(SCIMACNAL)
 CFLAGS="$KCFLAGS"
 CPPFLAGS="$KINCFLAGS $KCPPFLAGS $MFLAGS $enable_zerocopy $enable_affinity $with_quadrics $with_gm $with_scamac $with_ib"
 
+AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib)
 AC_SUBST(MOD_LINK)
 AC_SUBST(LINUX25)
 AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib)
index 817936a..08f084a 100644 (file)
@@ -50,9 +50,8 @@ typedef enum {
         PTL_IOV_TOO_SMALL   = 31,
 
        PTL_EQ_INUSE        = 32,
-       PTL_MD_INUSE        = 33,
 
-        PTL_MAX_ERRNO       = 33
+        PTL_MAX_ERRNO       = 32
 } ptl_err_t;
 /* If you change these, you must update the string table in api-errno.c */
 
index 3582b94..e9e4635 100644 (file)
@@ -19,7 +19,6 @@
 #include <portals/types.h>
 #include <linux/kp30.h>
 #include <portals/p30.h>
-#include <portals/errno.h>
 #include <portals/lib-types.h>
 #include <portals/lib-nal.h>
 #include <portals/lib-dispatch.h>
@@ -42,7 +41,7 @@ do {                                                    \
         nal->cb_sti(nal, flagsp);                       \
 }
 
-#ifdef PTL_USE_DESC_LISTS
+#ifdef PTL_USE_LIB_FREELIST
 
 #define MAX_MES         2048
 #define MAX_MDS         2048
@@ -98,7 +97,7 @@ lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
 }
 
 static inline lib_md_t *
-lib_md_alloc (nal_cb_t *nal)
+lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd)
 {
         /* NEVER called with statelock held */
         unsigned long  flags;
@@ -142,8 +141,20 @@ lib_me_free (nal_cb_t *nal, lib_me_t *me)
 static inline lib_msg_t *
 lib_msg_alloc (nal_cb_t *nal)
 {
-        /* ALWAYS called with statelock held */
-        return ((lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs));
+        /* NEVER called with statelock held */
+        unsigned long  flags;
+        lib_msg_t     *msg;
+        
+        state_lock (nal, &flags);
+        msg = (lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs);
+        state_unlock (nal, &flags);
+
+        if (msg != NULL) {
+                /* NULL pointers, clear flags etc */
+                memset (msg, 0, sizeof (*msg));
+                msg->ack_wmd = PTL_WIRE_HANDLE_NONE;
+        }
+        return(msg);
 }
 
 static inline void
@@ -155,22 +166,13 @@ lib_msg_free (nal_cb_t *nal, lib_msg_t *msg)
 
 #else
 
-extern atomic_t      md_in_use_count;
-extern atomic_t      msg_in_use_count;
-extern atomic_t      me_in_use_count;
-extern atomic_t      eq_in_use_count;
-
 static inline lib_eq_t *
 lib_eq_alloc (nal_cb_t *nal)
 {
         /* NEVER called with statelock held */
         lib_eq_t *eq;
-        PORTAL_ALLOC(eq, sizeof(*eq));
-
-        if (eq == NULL)
-                return (NULL);
 
-        atomic_inc (&eq_in_use_count);
+        PORTAL_ALLOC(eq, sizeof(*eq));
         return (eq);
 }
 
@@ -178,21 +180,34 @@ static inline void
 lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
 {
         /* ALWAYS called with statelock held */
-        atomic_dec (&eq_in_use_count);
         PORTAL_FREE(eq, sizeof(*eq));
 }
 
 static inline lib_md_t *
-lib_md_alloc (nal_cb_t *nal)
+lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd)
 {
         /* NEVER called with statelock held */
         lib_md_t *md;
-        PORTAL_ALLOC(md, sizeof(*md));
-
-        if (md == NULL)
-                return (NULL);
-
-        atomic_inc (&md_in_use_count);
+        int       size;
+        int       niov;
+
+        if ((umd->options & PTL_MD_KIOV) != 0) {
+                niov = umd->niov;
+                size = offsetof(lib_md_t, md_iov.kiov[niov]);
+        } else {
+                niov = ((umd->options & PTL_MD_IOV) != 0) ?
+                       umd->niov : 1;
+                size = offsetof(lib_md_t, md_iov.iov[niov]);
+        }
+
+        PORTAL_ALLOC(md, size);
+
+        if (md != NULL) {
+                /* Set here in case of early free */
+                md->options = umd->options;
+                md->md_niov = niov;
+        }
+        
         return (md);
 }
 
@@ -200,8 +215,14 @@ static inline void
 lib_md_free (nal_cb_t *nal, lib_md_t *md)
 {
         /* ALWAYS called with statelock held */
-        atomic_dec (&md_in_use_count);
-        PORTAL_FREE(md, sizeof(*md));
+        int       size;
+
+        if ((md->options & PTL_MD_KIOV) != 0)
+                size = offsetof(lib_md_t, md_iov.kiov[md->md_niov]);
+        else
+                size = offsetof(lib_md_t, md_iov.iov[md->md_niov]);
+
+        PORTAL_FREE(md, size);
 }
 
 static inline lib_me_t *
@@ -209,12 +230,8 @@ lib_me_alloc (nal_cb_t *nal)
 {
         /* NEVER called with statelock held */
         lib_me_t *me;
-        PORTAL_ALLOC(me, sizeof(*me));
-
-        if (me == NULL)
-                return (NULL);
 
-        atomic_inc (&me_in_use_count);
+        PORTAL_ALLOC(me, sizeof(*me));
         return (me);
 }
 
@@ -222,21 +239,21 @@ static inline void
 lib_me_free(nal_cb_t *nal, lib_me_t *me)
 {
         /* ALWAYS called with statelock held */
-        atomic_dec (&me_in_use_count);
         PORTAL_FREE(me, sizeof(*me));
 }
 
 static inline lib_msg_t *
 lib_msg_alloc(nal_cb_t *nal)
 {
-        /* ALWAYS called with statelock held */
+        /* NEVER called with statelock held */
         lib_msg_t *msg;
-        PORTAL_ALLOC_ATOMIC(msg, sizeof(*msg));
 
-        if (msg == NULL)
-                return (NULL);
-        
-        atomic_inc (&msg_in_use_count);
+        PORTAL_ALLOC(msg, sizeof(*msg));
+        if (msg != NULL) {
+                /* NULL pointers, clear flags etc */
+                memset (msg, 0, sizeof (*msg));
+                msg->ack_wmd = PTL_WIRE_HANDLE_NONE;
+        }
         return (msg);
 }
 
@@ -244,7 +261,6 @@ static inline void
 lib_msg_free(nal_cb_t *nal, lib_msg_t *msg)
 {
         /* ALWAYS called with statelock held */
-        atomic_dec (&msg_in_use_count);
         PORTAL_FREE(msg, sizeof(*msg));
 }
 #endif
@@ -344,26 +360,41 @@ extern char *dispatch_name(int index);
  * Call backs will be made to write events, send acks or
  * replies and so on.
  */
-extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private);
-extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg);
+extern void lib_enq_event_locked (nal_cb_t *nal, void *private,
+                                  lib_eq_t *eq, ptl_event_t *ev);
+extern void lib_finalize (nal_cb_t *nal, void *private, lib_msg_t *msg, 
+                          ptl_err_t status);
+extern void lib_parse (nal_cb_t *nal, ptl_hdr_t *hdr, void *private);
 extern lib_msg_t *lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, 
                                       lib_md_t *getmd);
-extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr);
+extern void print_hdr (nal_cb_t * nal, ptl_hdr_t * hdr);
+
 
 extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov);
-extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len);
-extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *dest, ptl_size_t len);
+extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, 
+                              ptl_size_t offset, ptl_size_t len);
+extern void lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset, 
+                              char *src, ptl_size_t len);
+extern int lib_extract_iov (int dst_niov, struct iovec *dst,
+                            int src_niov, struct iovec *src,
+                            ptl_size_t offset, ptl_size_t len);
 
 extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov);
-extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len);
-extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len);
+extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, 
+                               ptl_size_t offset, ptl_size_t len);
+extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset,
+                               char *src, ptl_size_t len);
+extern int lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, 
+                             int src_niov, ptl_kiov_t *src,
+                             ptl_size_t offset, ptl_size_t len);
+
 extern void lib_assert_wire_constants (void);
 
-extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
-                      ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
-extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
-                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
-                     lib_md_t *md, ptl_size_t offset, ptl_size_t len);
+extern ptl_err_t lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
+                           ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
+extern ptl_err_t lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
+                           ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                           lib_md_t *md, ptl_size_t offset, ptl_size_t len);
 
 extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in,
                                ptl_md_t * md_out);
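
The new lib_md_alloc()/lib_md_free() pair above drops the per-type in-use counters and instead sizes each memory descriptor to fit exactly the number of fragments it describes, computing the allocation size with offsetof over the trailing md_iov union rather than a fixed sizeof. A minimal sketch of that sizing idiom, with hypothetical names (struct frag, md_t, MAX_FRAGS) standing in for the real Portals types; only the offsetof construct mirrors the patch, and like the patch it relies on offsetof accepting a runtime array index (a GCC/Clang extension):

    #include <stddef.h>   /* offsetof */
    #include <stdlib.h>
    #include <string.h>

    #define MAX_FRAGS 16                    /* stand-in for PTL_MD_MAX_IOV */

    /* Hypothetical fragment descriptor standing in for struct iovec / ptl_kiov_t. */
    struct frag { void *addr; size_t len; };

    /* Descriptor ending in a union of fragment arrays, like lib_md_t's md_iov. */
    typedef struct {
            unsigned int options;
            int          md_niov;
            union {
                    struct frag iov[MAX_FRAGS];
            } md_iov;
    } md_t;

    /* Allocate only as many trailing fragments as this descriptor will use:
     * the same offsetof-into-the-array idiom the patch uses in lib_md_alloc(). */
    static md_t *md_alloc(int niov)
    {
            size_t size = offsetof(md_t, md_iov.iov[niov]);
            md_t  *md   = malloc(size);

            if (md != NULL) {
                    memset(md, 0, size);
                    md->md_niov = niov;  /* the free path recomputes 'size' from this */
            }
            return md;
    }

    static void md_free(md_t *md)
    {
            /* With plain free() the size is not needed; the patch recomputes it
             * because PORTAL_FREE must be told how much was allocated. */
            free(md);
    }
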
index 4052c0c..0bf557e 100644 (file)
@@ -18,47 +18,60 @@ struct nal_cb_t {
        lib_ni_t ni;
        void *nal_data;
        /*
-        * send:  Sends a preformatted header and user data to a
-        * specified remote process.
-        * Can overwrite iov.
+        * send: Sends a preformatted header and payload data to a
+        * specified remote process. The payload is scattered over 'niov'
+        * fragments described by iov, starting at 'offset' for 'mlen'
+        * bytes.  
+        * NB the NAL may NOT overwrite iov.  
+        * PTL_OK on success => NAL has committed to send and will call
+        * lib_finalize on completion
         */
-       int (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie, 
-                       ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
-                       unsigned int niov, struct iovec *iov, size_t mlen);
+       ptl_err_t (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie, 
+                             ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
+                             unsigned int niov, struct iovec *iov, 
+                             size_t offset, size_t mlen);
 
        /* as send, but with a set of page fragments (NULL if not supported) */
-       int (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, 
-                             ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
-                             unsigned int niov, ptl_kiov_t *iov, size_t mlen);
+       ptl_err_t (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, 
+                                   ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
+                                   unsigned int niov, ptl_kiov_t *iov, 
+                                   size_t offset, size_t mlen);
        /*
-        * recv: Receives an incoming message from a remote process
-        * Type of iov depends on options.  Can overwrite iov.
+        * recv: Receives an incoming message from a remote process.  The
+        * payload is to be received into the scattered buffer of 'niov'
+        * fragments described by iov, starting at 'offset' for 'mlen'
+        * bytes.  Payload bytes after 'mlen' up to 'rlen' are to be
+        * discarded.  
+        * NB the NAL may NOT overwrite iov.
+        * PTL_OK on success => NAL has committed to receive and will call
+        * lib_finalize on completion
         */
-       int (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
-                       unsigned int niov, struct iovec *iov, size_t mlen
-                       size_t rlen);
+       ptl_err_t (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+                             unsigned int niov, struct iovec *iov
+                             size_t offset, size_t mlen, size_t rlen);
 
        /* as recv, but with a set of page fragments (NULL if not supported) */
-       int (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
-                             unsigned int niov, ptl_kiov_t *iov, size_t mlen
-                             size_t rlen);
+       ptl_err_t (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+                                   unsigned int niov, ptl_kiov_t *iov
+                                   size_t offset, size_t mlen, size_t rlen);
        /*
         * read: Reads a block of data from a specified user address
         */
-       int (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr,
-                       user_ptr src_addr, size_t len);
+       ptl_err_t (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr,
+                             user_ptr src_addr, size_t len);
 
        /*
         * write: Writes a block of data into a specified user address
         */
-       int (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr,
-                        void *src_addr, size_t len);
+       ptl_err_t (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr,
+                              void *src_addr, size_t len);
 
        /*
         * callback: Calls an event callback
+        * NULL => lib calls eq's callback (if any) directly.
         */
-       int (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq,
-                        ptl_event_t *ev);
+       void (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq,
+                            ptl_event_t *ev);
 
        /*
         *  malloc: Acquire a block of memory in a system independent
@@ -74,14 +87,14 @@ struct nal_cb_t {
         * type of *iov depends on options.
         * Set to NULL if not required.
         */
-       int (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, 
-                      void **addrkey);
+       ptl_err_t (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, 
+                            void **addrkey);
        void (*cb_unmap) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, 
                          void **addrkey);
 
        /* as (un)map, but with a set of page fragments */
-       int (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, 
-                            void **addrkey);
+       ptl_err_t (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, 
+                                  void **addrkey);
        void (*cb_unmap_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, 
                          void **addrkey);
 
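
The reworked cb_recv contract documented above (the payload lands in 'niov' fragments starting at 'offset' for 'mlen' bytes, trailing bytes up to 'rlen' are discarded, the NAL may not overwrite iov, and a PTL_OK return commits the NAL to calling lib_finalize) is the pattern the NAL changes further down follow, e.g. kqswnal_recvmsg. A minimal sketch of a contiguous-buffer receive path written against that contract; the demo_rx_t descriptor, its fields, and the userspace includes are assumptions for illustration, while the cb_recv signature and the lib_finalize(nal, private, cookie, status) call mirror the headers in this patch:

    #include <string.h>
    #include <sys/uio.h>            /* struct iovec (userspace build assumed) */
    #include <portals/lib-p30.h>    /* nal_cb_t, lib_msg_t, ptl_err_t, lib_finalize() */

    /* Hypothetical per-receive state: 'private' is assumed to carry the
     * already-received wire payload in one contiguous buffer. */
    typedef struct {
            char   *rx_buffer;
            size_t  rx_nob;
    } demo_rx_t;

    static ptl_err_t
    demo_cb_recv(nal_cb_t *nal, void *private, lib_msg_t *cookie,
                 unsigned int niov, struct iovec *iov,
                 size_t offset, size_t mlen, size_t rlen)
    {
            demo_rx_t *rx   = (demo_rx_t *)private;
            char      *src  = rx->rx_buffer;
            size_t     left = mlen;

            if (rx->rx_nob < rlen)          /* sender claimed more than arrived */
                    return PTL_FAIL;

            /* Skip whole fragments that lie entirely before 'offset'. */
            while (offset >= iov->iov_len) {
                    offset -= iov->iov_len;
                    iov++;
                    niov--;
            }

            /* Copy 'mlen' bytes into the fragments; bytes mlen..rlen are dropped. */
            while (left > 0) {
                    size_t frag = iov->iov_len - offset;

                    if (frag > left)
                            frag = left;
                    memcpy((char *)iov->iov_base + offset, src, frag);
                    src    += frag;
                    left   -= frag;
                    offset  = 0;
                    iov++;
                    niov--;
            }

            /* Receive complete: report the outcome to the library. */
            lib_finalize(nal, private, cookie, PTL_OK);
            return PTL_OK;
    }
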
index 3582b94..e9e4635 100644 (file)
@@ -19,7 +19,6 @@
 #include <portals/types.h>
 #include <linux/kp30.h>
 #include <portals/p30.h>
-#include <portals/errno.h>
 #include <portals/lib-types.h>
 #include <portals/lib-nal.h>
 #include <portals/lib-dispatch.h>
@@ -42,7 +41,7 @@ do {                                                    \
         nal->cb_sti(nal, flagsp);                       \
 }
 
-#ifdef PTL_USE_DESC_LISTS
+#ifdef PTL_USE_LIB_FREELIST
 
 #define MAX_MES         2048
 #define MAX_MDS         2048
@@ -98,7 +97,7 @@ lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
 }
 
 static inline lib_md_t *
-lib_md_alloc (nal_cb_t *nal)
+lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd)
 {
         /* NEVER called with statelock held */
         unsigned long  flags;
@@ -142,8 +141,20 @@ lib_me_free (nal_cb_t *nal, lib_me_t *me)
 static inline lib_msg_t *
 lib_msg_alloc (nal_cb_t *nal)
 {
-        /* ALWAYS called with statelock held */
-        return ((lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs));
+        /* NEVER called with statelock held */
+        unsigned long  flags;
+        lib_msg_t     *msg;
+        
+        state_lock (nal, &flags);
+        msg = (lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs);
+        state_unlock (nal, &flags);
+
+        if (msg != NULL) {
+                /* NULL pointers, clear flags etc */
+                memset (msg, 0, sizeof (*msg));
+                msg->ack_wmd = PTL_WIRE_HANDLE_NONE;
+        }
+        return(msg);
 }
 
 static inline void
@@ -155,22 +166,13 @@ lib_msg_free (nal_cb_t *nal, lib_msg_t *msg)
 
 #else
 
-extern atomic_t      md_in_use_count;
-extern atomic_t      msg_in_use_count;
-extern atomic_t      me_in_use_count;
-extern atomic_t      eq_in_use_count;
-
 static inline lib_eq_t *
 lib_eq_alloc (nal_cb_t *nal)
 {
         /* NEVER called with statelock held */
         lib_eq_t *eq;
-        PORTAL_ALLOC(eq, sizeof(*eq));
-
-        if (eq == NULL)
-                return (NULL);
 
-        atomic_inc (&eq_in_use_count);
+        PORTAL_ALLOC(eq, sizeof(*eq));
         return (eq);
 }
 
@@ -178,21 +180,34 @@ static inline void
 lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
 {
         /* ALWAYS called with statelock held */
-        atomic_dec (&eq_in_use_count);
         PORTAL_FREE(eq, sizeof(*eq));
 }
 
 static inline lib_md_t *
-lib_md_alloc (nal_cb_t *nal)
+lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd)
 {
         /* NEVER called with statelock held */
         lib_md_t *md;
-        PORTAL_ALLOC(md, sizeof(*md));
-
-        if (md == NULL)
-                return (NULL);
-
-        atomic_inc (&md_in_use_count);
+        int       size;
+        int       niov;
+
+        if ((umd->options & PTL_MD_KIOV) != 0) {
+                niov = umd->niov;
+                size = offsetof(lib_md_t, md_iov.kiov[niov]);
+        } else {
+                niov = ((umd->options & PTL_MD_IOV) != 0) ?
+                       umd->niov : 1;
+                size = offsetof(lib_md_t, md_iov.iov[niov]);
+        }
+
+        PORTAL_ALLOC(md, size);
+
+        if (md != NULL) {
+                /* Set here in case of early free */
+                md->options = umd->options;
+                md->md_niov = niov;
+        }
+        
         return (md);
 }
 
@@ -200,8 +215,14 @@ static inline void
 lib_md_free (nal_cb_t *nal, lib_md_t *md)
 {
         /* ALWAYS called with statelock held */
-        atomic_dec (&md_in_use_count);
-        PORTAL_FREE(md, sizeof(*md));
+        int       size;
+
+        if ((md->options & PTL_MD_KIOV) != 0)
+                size = offsetof(lib_md_t, md_iov.kiov[md->md_niov]);
+        else
+                size = offsetof(lib_md_t, md_iov.iov[md->md_niov]);
+
+        PORTAL_FREE(md, size);
 }
 
 static inline lib_me_t *
@@ -209,12 +230,8 @@ lib_me_alloc (nal_cb_t *nal)
 {
         /* NEVER called with statelock held */
         lib_me_t *me;
-        PORTAL_ALLOC(me, sizeof(*me));
-
-        if (me == NULL)
-                return (NULL);
 
-        atomic_inc (&me_in_use_count);
+        PORTAL_ALLOC(me, sizeof(*me));
         return (me);
 }
 
@@ -222,21 +239,21 @@ static inline void
 lib_me_free(nal_cb_t *nal, lib_me_t *me)
 {
         /* ALWAYS called with statelock held */
-        atomic_dec (&me_in_use_count);
         PORTAL_FREE(me, sizeof(*me));
 }
 
 static inline lib_msg_t *
 lib_msg_alloc(nal_cb_t *nal)
 {
-        /* ALWAYS called with statelock held */
+        /* NEVER called with statelock held */
         lib_msg_t *msg;
-        PORTAL_ALLOC_ATOMIC(msg, sizeof(*msg));
 
-        if (msg == NULL)
-                return (NULL);
-        
-        atomic_inc (&msg_in_use_count);
+        PORTAL_ALLOC(msg, sizeof(*msg));
+        if (msg != NULL) {
+                /* NULL pointers, clear flags etc */
+                memset (msg, 0, sizeof (*msg));
+                msg->ack_wmd = PTL_WIRE_HANDLE_NONE;
+        }
         return (msg);
 }
 
@@ -244,7 +261,6 @@ static inline void
 lib_msg_free(nal_cb_t *nal, lib_msg_t *msg)
 {
         /* ALWAYS called with statelock held */
-        atomic_dec (&msg_in_use_count);
         PORTAL_FREE(msg, sizeof(*msg));
 }
 #endif
@@ -344,26 +360,41 @@ extern char *dispatch_name(int index);
  * Call backs will be made to write events, send acks or
  * replies and so on.
  */
-extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private);
-extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg);
+extern void lib_enq_event_locked (nal_cb_t *nal, void *private,
+                                  lib_eq_t *eq, ptl_event_t *ev);
+extern void lib_finalize (nal_cb_t *nal, void *private, lib_msg_t *msg, 
+                          ptl_err_t status);
+extern void lib_parse (nal_cb_t *nal, ptl_hdr_t *hdr, void *private);
 extern lib_msg_t *lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, 
                                       lib_md_t *getmd);
-extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr);
+extern void print_hdr (nal_cb_t * nal, ptl_hdr_t * hdr);
+
 
 extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov);
-extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len);
-extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *dest, ptl_size_t len);
+extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, 
+                              ptl_size_t offset, ptl_size_t len);
+extern void lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset, 
+                              char *src, ptl_size_t len);
+extern int lib_extract_iov (int dst_niov, struct iovec *dst,
+                            int src_niov, struct iovec *src,
+                            ptl_size_t offset, ptl_size_t len);
 
 extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov);
-extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len);
-extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len);
+extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, 
+                               ptl_size_t offset, ptl_size_t len);
+extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset,
+                               char *src, ptl_size_t len);
+extern int lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, 
+                             int src_niov, ptl_kiov_t *src,
+                             ptl_size_t offset, ptl_size_t len);
+
 extern void lib_assert_wire_constants (void);
 
-extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
-                      ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
-extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
-                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
-                     lib_md_t *md, ptl_size_t offset, ptl_size_t len);
+extern ptl_err_t lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
+                           ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
+extern ptl_err_t lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
+                           ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                           lib_md_t *md, ptl_size_t offset, ptl_size_t len);
 
 extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in,
                                ptl_md_t * md_out);
index d9e3c11..904204b 100644 (file)
@@ -16,7 +16,7 @@
 # include <linux/smp_lock.h>
 # include <linux/types.h>
 #else
-# define PTL_USE_DESC_LISTS
+# define PTL_USE_LIB_FREELIST
 # include <sys/types.h>
 #endif
 
@@ -139,16 +139,9 @@ typedef struct {
 
 struct lib_msg_t {
         struct list_head  msg_list;
-        int               send_ack;
         lib_md_t         *md;
-        ptl_nid_t         nid;
-        ptl_pid_t         pid;
-        ptl_event_t       ev;
         ptl_handle_wire_t ack_wmd;
-        union {
-                struct iovec  iov[PTL_MD_MAX_IOV];
-                ptl_kiov_t    kiov[PTL_MD_MAX_IOV];
-        } msg_iov;
+        ptl_event_t       ev;
 };
 
 struct lib_ptl_t {
@@ -212,9 +205,8 @@ struct lib_md_t {
 };
 
 #define PTL_MD_FLAG_UNLINK            (1 << 0)
-#define PTL_MD_FLAG_AUTO_UNLINKED     (1 << 1)
 
-#ifdef PTL_USE_DESC_LISTS
+#ifdef PTL_USE_LIB_FREELIST
 typedef struct
 {
         void             *fl_objs;             /* single contiguous array of objects */
@@ -262,7 +254,7 @@ typedef struct {
         
         struct list_head ni_test_peers;
         
-#ifdef PTL_USE_DESC_LISTS
+#ifdef PTL_USE_LIB_FREELIST
         lib_freelist_t   ni_free_mes;
         lib_freelist_t   ni_free_msgs;
         lib_freelist_t   ni_free_mds;
index a4ea39b..8b1495e 100644 (file)
@@ -21,7 +21,6 @@
 #include <portals/types.h>
 #include <portals/nal.h>
 #include <portals/api.h>
-#include <portals/errno.h>
 #include <portals/nalids.h>
 
 extern int __p30_initialized;  /* for libraries & test codes  */
index a4ea39b..8b1495e 100644 (file)
@@ -21,7 +21,6 @@
 #include <portals/types.h>
 #include <portals/nal.h>
 #include <portals/api.h>
-#include <portals/errno.h>
 #include <portals/nalids.h>
 
 extern int __p30_initialized;  /* for libraries & test codes  */
index e4ccebf..7ffe797 100644 (file)
@@ -17,6 +17,8 @@ typedef u_int64_t __u64;
 # define do_gettimeofday(tv) gettimeofday(tv, NULL)
 #endif
 
+#include <portals/errno.h>
+
 typedef __u64 ptl_nid_t;
 typedef __u32 ptl_pid_t;
 typedef __u32 ptl_pt_index_t;
@@ -97,7 +99,8 @@ typedef enum {
         PTL_EVENT_PUT,
         PTL_EVENT_REPLY,
         PTL_EVENT_ACK,
-        PTL_EVENT_SENT
+        PTL_EVENT_SENT,
+       PTL_EVENT_UNLINK,
 } ptl_event_kind_t;
 
 #define PTL_SEQ_BASETYPE       long
@@ -112,15 +115,19 @@ typedef unsigned PTL_SEQ_BASETYPE ptl_seq_t;
 #pragma pack(push, 4)
 #endif
 typedef struct {
-        ptl_event_kind_t type;
-        ptl_process_id_t initiator;
-        ptl_pt_index_t portal;
-        ptl_match_bits_t match_bits;
-        ptl_size_t rlength, mlength, offset;
-        ptl_handle_me_t unlinked_me;
-        ptl_md_t mem_desc;
-        ptl_hdr_data_t hdr_data;
-        struct timeval arrival_time;
+        ptl_event_kind_t   type;
+       ptl_err_t          status;
+       int                unlinked;
+        ptl_process_id_t   initiator;
+        ptl_pt_index_t     portal;
+        ptl_match_bits_t   match_bits;
+        ptl_size_t         rlength;
+       ptl_size_t         mlength;
+       ptl_size_t         offset;
+        ptl_md_t           mem_desc;
+        ptl_hdr_data_t     hdr_data;
+        struct timeval     arrival_time;
+
         volatile ptl_seq_t sequence;
 } ptl_event_t;
 #ifdef __CYGWIN__
index 53757ab..cdde5b7 100644 (file)
@@ -353,8 +353,6 @@ int gmnal_cb_read(nal_cb_t *, void *private, void *, user_ptr, size_t);
 
 int gmnal_cb_write(nal_cb_t *, void *private, user_ptr, void *, size_t);
 
-int gmnal_cb_callback(nal_cb_t *, void *, lib_eq_t *, ptl_event_t *);
-
 void *gmnal_cb_malloc(nal_cb_t *, size_t);
 
 void gmnal_cb_free(nal_cb_t *, void *, size_t);
@@ -384,7 +382,7 @@ void  gmnal_fini(void);
                                a->cb_recv_pages = gmnal_cb_recv_pages; \
                                a->cb_read = gmnal_cb_read; \
                                a->cb_write = gmnal_cb_write; \
-                               a->cb_callback = gmnal_cb_callback; \
+                               a->cb_callback = NULL; \
                                a->cb_malloc = gmnal_cb_malloc; \
                                a->cb_free = gmnal_cb_free; \
                                a->cb_map = NULL; \
index 6ae91db..e055242 100644 (file)
@@ -126,7 +126,6 @@ int gmnal_cb_send(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
                                niov, iov, len);
        } else {
                CDEBUG(D_ERROR, "Large message send it is not supported\n");
-               lib_finalize(nal_cb, private, cookie);
                return(PTL_FAIL);
                gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, pid, 
                                niov, iov, len);
@@ -200,18 +199,6 @@ int gmnal_cb_write(nal_cb_t *nal_cb, void *private, user_ptr dst,
        return(PTL_OK);
 }
 
-int gmnal_cb_callback(nal_cb_t *nal_cb, void *private, lib_eq_t *eq, 
-                      ptl_event_t *ev)
-{
-
-       if (eq->event_callback != NULL) {
-               CDEBUG(D_INFO, "found callback\n");
-               eq->event_callback(ev);
-       }
-       
-       return(PTL_OK);
-}
-
 void *gmnal_cb_malloc(nal_cb_t *nal_cb, size_t len)
 {
        void *ptr = NULL;
index 4171df6..a0d3530 100644 (file)
@@ -321,7 +321,6 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
 
        if (!private) {
                CDEBUG(D_ERROR, "gmnal_small_rx no context\n");
-               lib_finalize(nal_cb, private, cookie);
                return(PTL_FAIL);
        }
 
@@ -343,10 +342,8 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
         *      let portals library know receive is complete
         */
        CDEBUG(D_PORTALS, "calling lib_finalize\n");
-       if (lib_finalize(nal_cb, private, cookie) != PTL_OK) {
-               /* TO DO what to do with failed lib_finalise? */
-               CDEBUG(D_INFO, "lib_finalize failed\n");
-       }
+       lib_finalize(nal_cb, private, cookie, PTL_OK);
+
        /*
         *      return buffer so it can be used again
         */
@@ -590,10 +587,8 @@ gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status)
                return;
        }
        gmnal_return_stxd(nal_data, stxd);
-       if (lib_finalize(nal_cb, stxd, cookie) != PTL_OK) {
-               CDEBUG(D_INFO, "Call to lib_finalize failed for stxd [%p]\n", 
-                      stxd);
-       }
+       lib_finalize(nal_cb, stxd, cookie, PTL_OK);
+
        return;
 }
 
@@ -817,7 +812,6 @@ gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
 
        if (!srxd) {
                CDEBUG(D_ERROR, "gmnal_large_rx no context\n");
-               lib_finalize(nal_cb, private, cookie);
                return(PTL_FAIL);
        }
 
@@ -1114,10 +1108,7 @@ gmnal_remote_get_callback(gm_port_t *gm_port, void *context,
         *      Let our client application proceed
         */     
        CDEBUG(D_ERROR, "final callback context[%p]\n", srxd);
-       if (lib_finalize(nal_cb, srxd, srxd->cookie) != PTL_OK) {
-               CDEBUG(D_INFO, "Call to lib_finalize failed for srxd [%p]\n", 
-                      srxd);
-       }
+       lib_finalize(nal_cb, srxd, srxd->cookie, PTL_OK);
 
        /*
         *      send an ack to the sender to let him know we got the data
@@ -1282,10 +1273,7 @@ gmnal_large_tx_ack_received(gmnal_data_t *nal_data, gmnal_srxd_t *srxd)
 
        CDEBUG(D_INFO, "gmnal_large_tx_ack_received stxd [%p]\n", stxd);
 
-       if (lib_finalize(nal_cb, stxd, stxd->cookie) != PTL_OK) {
-               CDEBUG(D_INFO, "Call to lib_finalize failed for stxd [%p]\n", 
-                      stxd);
-       }
+       lib_finalize(nal_cb, stxd, stxd->cookie, PTL_OK);
 
        /*
         *      extract the iovec from the stxd, deregister the memory.
index 2c07cc4..0688062 100644 (file)
@@ -306,7 +306,7 @@ kibnal_send(nal_cb_t        *nal,
           if(buf_length > MAX_MSG_SIZE) { 
              CERROR("kibnal_send:request exceeds Transmit data size (%d).\n",
                       MAX_MSG_SIZE);
-             rc = -1;
+             rc = PTL_FAIL;
              return rc;
           }
           else {
@@ -363,7 +363,7 @@ kibnal_send(nal_cb_t        *nal,
 
         PROF_FINISH(kibnal_send); // time stapm of send operation 
 
-        rc = 1;
+        rc = PTL_OK;
 
         return rc; 
 }
@@ -386,7 +386,7 @@ int kibnal_send_pages(nal_cb_t * nal,
                       ptl_kiov_t *iov, 
                       size_t mlen)
 {
-   int rc = 1;
+   int rc = PTL_FAIL;
 
    CDEBUG(D_NET, "kibnal_send_pages\n");
 
@@ -420,7 +420,7 @@ void kibnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
 //
 // do you need this 
 //
-int kibnal_callback(nal_cb_t * nal, 
+void kibnal_callback(nal_cb_t * nal, 
                            void *private, 
                            lib_eq_t *eq,
                            ptl_event_t *ev)
@@ -507,7 +507,7 @@ kibnal_recv_pages(nal_cb_t * nal,
 {
 
   CDEBUG(D_NET, "recv_pages not implemented\n");
-  return PTL_OK;
+  return PTL_FAIL;
        
 }
 
@@ -526,11 +526,12 @@ kibnal_recv(nal_cb_t     *nal,
         CDEBUG(D_NET,"kibnal_recv: mlen=%d, rlen=%d\n", mlen, rlen);
 
         /* What was actually received must be >= what sender claims to
-         * have sent.  This is an LASSERT, since lib-move doesn't
-         * check cb return code yet. */
-        LASSERT (krx->krx_len >= sizeof (ptl_hdr_t) + rlen);
+         * have sent. */
         LASSERT (mlen <= rlen);
 
+        if (krx->krx_len < sizeof (ptl_hdr_t) + rlen)
+                return (PTL_FAIL);
+
         PROF_START(kibnal_recv);
 
         if(mlen != 0) {
@@ -542,12 +543,12 @@ kibnal_recv(nal_cb_t     *nal,
 
         PROF_START(lib_finalize);
         
-        lib_finalize(nal, private, cookie);
+        lib_finalize(nal, private, cookie, PTL_OK);
         
         PROF_FINISH(lib_finalize);
         PROF_FINISH(kibnal_recv);
 
-        return rlen;
+        return PTL_OK;
 }
 
 //
index 96749cd..4c2bd6a 100644 (file)
@@ -33,7 +33,7 @@ EP_STATUSBLK  kqswnal_rpc_failed;
  *  LIB functions follow
  *
  */
-static int
+static ptl_err_t
 kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr,
              size_t len)
 {
@@ -41,10 +41,10 @@ kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr,
                 nal->ni.nid, len, src_addr, dst_addr );
         memcpy( dst_addr, src_addr, len );
 
-        return (0);
+        return (PTL_OK);
 }
 
-static int
+static ptl_err_t
 kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr,
               size_t len)
 {
@@ -52,7 +52,7 @@ kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr,
                 nal->ni.nid, len, src_addr, dst_addr );
         memcpy( dst_addr, src_addr, len );
 
-        return (0);
+        return (PTL_OK);
 }
 
 static void *
@@ -157,13 +157,12 @@ kqswnal_unmap_tx (kqswnal_tx_t *ktx)
         elan3_dvma_unload(kqswnal_data.kqn_ep->DmaState,
                           kqswnal_data.kqn_eptxdmahandle,
                           ktx->ktx_basepage, ktx->ktx_nmappedpages);
-
 #endif
         ktx->ktx_nmappedpages = 0;
 }
 
 int
-kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov)
+kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_t *kiov)
 {
         int       nfrags    = ktx->ktx_nfrag;
         int       nmapped   = ktx->ktx_nmappedpages;
@@ -188,8 +187,16 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov)
         LASSERT (niov > 0);
         LASSERT (nob > 0);
 
+        /* skip complete frags before 'offset' */
+        while (offset >= kiov->kiov_len) {
+                offset -= kiov->kiov_len;
+                kiov++;
+                niov--;
+                LASSERT (niov > 0);
+        }
+
         do {
-                int  fraglen = kiov->kiov_len;
+                int  fraglen = kiov->kiov_len - offset;
 
                 /* nob exactly spans the iovs */
                 LASSERT (fraglen <= nob);
@@ -212,7 +219,7 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov)
                 /* XXX this is really crap, but we'll have to kmap until
                  * EKC has a page (rather than vaddr) mapping interface */
 
-                ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+                ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset;
 
                 CDEBUG(D_NET,
                        "%p[%d] loading %p for %d, page %d, %d total\n",
@@ -257,6 +264,7 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov)
                 kiov++;
                 niov--;
                 nob -= fraglen;
+                offset = 0;
 
                 /* iov must not run out before end of data */
                 LASSERT (nob == 0 || niov > 0);
@@ -271,7 +279,8 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov)
 }
 
 int
-kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov)
+kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob, 
+                    int niov, struct iovec *iov)
 {
         int       nfrags    = ktx->ktx_nfrag;
         int       nmapped   = ktx->ktx_nmappedpages;
@@ -295,8 +304,16 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov)
         LASSERT (niov > 0);
         LASSERT (nob > 0);
 
+        /* skip complete frags before offset */
+        while (offset >= iov->iov_len) {
+                offset -= iov->iov_len;
+                iov++;
+                niov--;
+                LASSERT (niov > 0);
+        }
+        
         do {
-                int  fraglen = iov->iov_len;
+                int  fraglen = iov->iov_len - offset;
                 long npages  = kqswnal_pages_spanned (iov->iov_base, fraglen);
 
                 /* nob exactly spans the iovs */
@@ -317,12 +334,12 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov)
 
                 CDEBUG(D_NET,
                        "%p[%d] loading %p for %d, pages %d for %ld, %d total\n",
-                        ktx, nfrags, iov->iov_base, fraglen, basepage, npages,
-                        nmapped);
+                       ktx, nfrags, iov->iov_base + offset, fraglen, 
+                       basepage, npages, nmapped);
 
 #if MULTIRAIL_EKC
                 ep_dvma_load(kqswnal_data.kqn_ep, NULL,
-                             iov->iov_base, fraglen,
+                             iov->iov_base + offset, fraglen,
                              kqswnal_data.kqn_ep_tx_nmh, basepage,
                              &railmask, &ktx->ktx_frags[nfrags]);
 
@@ -336,7 +353,7 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov)
 #else
                 elan3_dvma_kaddr_load (kqswnal_data.kqn_ep->DmaState,
                                        kqswnal_data.kqn_eptxdmahandle,
-                                       iov->iov_base, fraglen,
+                                       iov->iov_base + offset, fraglen,
                                        basepage, &ktx->ktx_frags[nfrags].Base);
 
                 if (nfrags > 0 &&                /* previous frag mapped */
@@ -357,6 +374,7 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov)
                 iov++;
                 niov--;
                 nob -= fraglen;
+                offset = 0;
 
                 /* iov must not run out before end of data */
                 LASSERT (nob == 0 || niov > 0);
@@ -483,7 +501,7 @@ void
 kqswnal_tx_done (kqswnal_tx_t *ktx, int error)
 {
         lib_msg_t     *msg;
-        lib_msg_t     *repmsg;
+        lib_msg_t     *repmsg = NULL;
 
         switch (ktx->ktx_state) {
         case KTX_FORWARDING:       /* router asked me to forward this packet */
@@ -493,21 +511,29 @@ kqswnal_tx_done (kqswnal_tx_t *ktx, int error)
 
         case KTX_SENDING:          /* packet sourced locally */
                 lib_finalize (&kqswnal_lib, ktx->ktx_args[0],
-                              (lib_msg_t *)ktx->ktx_args[1]);
+                              (lib_msg_t *)ktx->ktx_args[1],
+                              (error == 0) ? PTL_OK : 
+                              (error == -ENOMEM) ? PTL_NOSPACE : PTL_FAIL);
                 break;
 
         case KTX_GETTING:          /* Peer has DMA-ed direct? */
                 msg = (lib_msg_t *)ktx->ktx_args[1];
-                repmsg = NULL;
 
-                if (error == 0) 
+                if (error == 0) {
                         repmsg = lib_fake_reply_msg (&kqswnal_lib, 
                                                      ktx->ktx_nid, msg->md);
+                        if (repmsg == NULL)
+                                error = -ENOMEM;
+                }
                 
-                lib_finalize (&kqswnal_lib, ktx->ktx_args[0], msg);
-
-                if (repmsg != NULL) 
-                        lib_finalize (&kqswnal_lib, NULL, repmsg);
+                if (error == 0) {
+                        lib_finalize (&kqswnal_lib, ktx->ktx_args[0], 
+                                      msg, PTL_OK);
+                        lib_finalize (&kqswnal_lib, NULL, repmsg, PTL_OK);
+                } else {
+                        lib_finalize (&kqswnal_lib, ktx->ktx_args[0], msg,
+                                      (error == -ENOMEM) ? PTL_NOSPACE : PTL_FAIL);
+                }
                 break;
 
         default:
@@ -533,7 +559,7 @@ kqswnal_txhandler(EP_TXD *txd, void *arg, int status)
                         ktx->ktx_nid, status);
 
                 kqswnal_notify_peer_down(ktx);
-                status = -EIO;
+                status = -EHOSTDOWN;
 
         } else if (ktx->ktx_state == KTX_GETTING) {
                 /* RPC completed OK; what did our peer put in the status
@@ -745,7 +771,8 @@ kqswnal_eiovs2datav (int ndv, EP_DATAVEC *dv,
 
 int
 kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag, 
-                   struct iovec *iov, ptl_kiov_t *kiov, int nob)
+                   struct iovec *iov, ptl_kiov_t *kiov, 
+                   int offset, int nob)
 {
         kqswnal_rx_t       *krx = (kqswnal_rx_t *)ktx->ktx_args[0];
         char               *buffer = (char *)page_address(krx->krx_pages[0]);
@@ -779,9 +806,9 @@ kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag,
         /* Map the source data... */
         ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0;
         if (kiov != NULL)
-                rc = kqswnal_map_tx_kiov (ktx, nob, nfrag, kiov);
+                rc = kqswnal_map_tx_kiov (ktx, offset, nob, nfrag, kiov);
         else
-                rc = kqswnal_map_tx_iov (ktx, nob, nfrag, iov);
+                rc = kqswnal_map_tx_iov (ktx, offset, nob, nfrag, iov);
 
         if (rc != 0) {
                 CERROR ("Can't map source data: %d\n", rc);
@@ -846,7 +873,7 @@ kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag,
         return (-ECONNABORTED);
 }
 
-static int
+static ptl_err_t
 kqswnal_sendmsg (nal_cb_t     *nal,
                  void         *private,
                  lib_msg_t    *libmsg,
@@ -857,6 +884,7 @@ kqswnal_sendmsg (nal_cb_t     *nal,
                  unsigned int  payload_niov,
                  struct iovec *payload_iov,
                  ptl_kiov_t   *payload_kiov,
+                 size_t        payload_offset,
                  size_t        payload_nob)
 {
         kqswnal_tx_t      *ktx;
@@ -865,6 +893,7 @@ kqswnal_sendmsg (nal_cb_t     *nal,
 #if KQSW_CHECKSUM
         int                i;
         kqsw_csum_t        csum;
+        int                sumoff;
         int                sumnob;
 #endif
         
@@ -928,9 +957,9 @@ kqswnal_sendmsg (nal_cb_t     *nal,
                 }
 
                 /* peer expects RPC completion with GET data */
-                rc = kqswnal_dma_reply (ktx,
-                                        payload_niov, payload_iov, 
-                                        payload_kiov, payload_nob);
+                rc = kqswnal_dma_reply (ktx, payload_niov, 
+                                        payload_iov, payload_kiov, 
+                                        payload_offset, payload_nob);
                 if (rc == 0)
                         return (PTL_OK);
                 
@@ -945,22 +974,39 @@ kqswnal_sendmsg (nal_cb_t     *nal,
 #if KQSW_CHECKSUM
         csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr));
         memcpy (ktx->ktx_buffer + sizeof (*hdr), &csum, sizeof (csum));
-        for (csum = 0, i = 0, sumnob = payload_nob; sumnob > 0; i++) {
+        for (csum = 0, i = 0, sumoff = payload_offset, sumnob = payload_nob; sumnob > 0; i++) {
+                LASSERT(i < niov);
                 if (payload_kiov != NULL) {
                         ptl_kiov_t *kiov = &payload_kiov[i];
-                        char       *addr = ((char *)kmap (kiov->kiov_page)) +
-                                           kiov->kiov_offset;
-                        
-                        csum = kqsw_csum (csum, addr, MIN (sumnob, kiov->kiov_len));
-                        sumnob -= kiov->kiov_len;
+
+                        if (sumoff >= kiov->kiov_len) {
+                                sumoff -= kiov->kiov_len;
+                        } else {
+                                char *addr = ((char *)kmap (kiov->kiov_page)) +
+                                             kiov->kiov_offset + sumoff;
+                                int   fragnob = kiov->kiov_len - sumoff;
+
+                                csum = kqsw_csum(csum, addr, MIN(sumnob, fragnob));
+                                sumnob -= fragnob;
+                                sumoff = 0;
+                                kunmap(kiov->kiov_page);
+                        }
                 } else {
                         struct iovec *iov = &payload_iov[i];
 
-                        csum = kqsw_csum (csum, iov->iov_base, MIN (sumnob, kiov->iov_len));
-                        sumnob -= iov->iov_len;
+                        if (sumoff > iov->iov_len) {
+                                sumoff -= iov->iov_len;
+                        } else {
+                                char *addr = iov->iov_base + sumoff;
+                                int   fragnob = iov->iov_len - sumoff;
+                                
+                                csum = kqsw_csum(csum, addr, MIN(sumnob, fragnob));
+                                sumnob -= fragnob;
+                                sumoff = 0;
+                        }
                 }
         }
-        memcpy(ktx->ktx_buffer +sizeof(*hdr) +sizeof(csum), &csum,sizeof(csum));
+        memcpy(ktx->ktx_buffer + sizeof(*hdr) + sizeof(csum), &csum, sizeof(csum));
 #endif
         
         if (kqswnal_data.kqn_optimized_gets &&
@@ -987,10 +1033,10 @@ kqswnal_sendmsg (nal_cb_t     *nal,
                 ktx->ktx_state = KTX_GETTING;
 
                 if ((libmsg->md->options & PTL_MD_KIOV) != 0) 
-                        rc = kqswnal_map_tx_kiov (ktx, md->length,
+                        rc = kqswnal_map_tx_kiov (ktx, 0, md->length,
                                                   md->md_niov, md->md_iov.kiov);
                 else
-                        rc = kqswnal_map_tx_iov (ktx, md->length,
+                        rc = kqswnal_map_tx_iov (ktx, 0, md->length,
                                                  md->md_niov, md->md_iov.iov);
 
                 if (rc < 0) {
@@ -1033,10 +1079,12 @@ kqswnal_sendmsg (nal_cb_t     *nal,
                 if (payload_nob > 0) {
                         if (payload_kiov != NULL)
                                 lib_copy_kiov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE,
-                                                   payload_niov, payload_kiov, payload_nob);
+                                                   payload_niov, payload_kiov, 
+                                                   payload_offset, payload_nob);
                         else
                                 lib_copy_iov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE,
-                                                  payload_niov, payload_iov, payload_nob);
+                                                  payload_niov, payload_iov, 
+                                                  payload_offset, payload_nob);
                 }
         } else {
 
@@ -1052,10 +1100,10 @@ kqswnal_sendmsg (nal_cb_t     *nal,
                 ktx->ktx_frags[0].Len = KQSW_HDR_SIZE;
 #endif
                 if (payload_kiov != NULL)
-                        rc = kqswnal_map_tx_kiov (ktx, payload_nob, 
+                        rc = kqswnal_map_tx_kiov (ktx, payload_offset, payload_nob, 
                                                   payload_niov, payload_kiov);
                 else
-                        rc = kqswnal_map_tx_iov (ktx, payload_nob,
+                        rc = kqswnal_map_tx_iov (ktx, payload_offset, payload_nob,
                                                  payload_niov, payload_iov);
                 if (rc != 0) {
                         kqswnal_put_idle_tx (ktx);
@@ -1078,7 +1126,7 @@ kqswnal_sendmsg (nal_cb_t     *nal,
         return (PTL_OK);
 }
 
-static int
+static ptl_err_t
 kqswnal_send (nal_cb_t     *nal,
               void         *private,
               lib_msg_t    *libmsg,
@@ -1088,13 +1136,15 @@ kqswnal_send (nal_cb_t     *nal,
               ptl_pid_t     pid,
               unsigned int  payload_niov,
               struct iovec *payload_iov,
+              size_t        payload_offset,
               size_t        payload_nob)
 {
         return (kqswnal_sendmsg (nal, private, libmsg, hdr, type, nid, pid,
-                                 payload_niov, payload_iov, NULL, payload_nob));
+                                 payload_niov, payload_iov, NULL, 
+                                 payload_offset, payload_nob));
 }
 
-static int
+static ptl_err_t
 kqswnal_send_pages (nal_cb_t     *nal,
                     void         *private,
                     lib_msg_t    *libmsg,
@@ -1104,10 +1154,12 @@ kqswnal_send_pages (nal_cb_t     *nal,
                     ptl_pid_t     pid,
                     unsigned int  payload_niov,
                     ptl_kiov_t   *payload_kiov,
+                    size_t        payload_offset,
                     size_t        payload_nob)
 {
         return (kqswnal_sendmsg (nal, private, libmsg, hdr, type, nid, pid,
-                                 payload_niov, NULL, payload_kiov, payload_nob));
+                                 payload_niov, NULL, payload_kiov, 
+                                 payload_offset, payload_nob));
 }
 
 void
@@ -1161,7 +1213,7 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
             nob <= KQSW_TX_BUFFER_SIZE) 
         {
                 /* send from ktx's pre-mapped contiguous buffer? */
-                lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, nob);
+                lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, 0, nob);
 #if MULTIRAIL_EKC
                 ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer,
                               0, nob);
@@ -1176,7 +1228,7 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
         {
                 /* zero copy */
                 ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0;
-                rc = kqswnal_map_tx_iov (ktx, nob, niov, iov);
+                rc = kqswnal_map_tx_iov (ktx, 0, nob, niov, iov);
                 if (rc != 0)
                         goto failed;
 
@@ -1231,7 +1283,8 @@ kqswnal_dma_reply_complete (EP_RXD *rxd)
         krx->krx_rpc_reply_needed = 0;
         kqswnal_rx_done (krx);
 
-        lib_finalize (&kqswnal_lib, NULL, msg);
+        lib_finalize (&kqswnal_lib, NULL, msg,
+                      (status == EP_SUCCESS) ? PTL_OK : PTL_FAIL);
         kqswnal_put_idle_tx (ktx);
 }
 
@@ -1461,13 +1514,14 @@ kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr)
 }
 #endif
 
-static int
+static ptl_err_t
 kqswnal_recvmsg (nal_cb_t     *nal,
                  void         *private,
                  lib_msg_t    *libmsg,
                  unsigned int  niov,
                  struct iovec *iov,
                  ptl_kiov_t   *kiov,
+                 size_t        offset,
                  size_t        mlen,
                  size_t        rlen)
 {
@@ -1498,10 +1552,13 @@ kqswnal_recvmsg (nal_cb_t     *nal,
 #endif
         CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", mlen, rlen);
 
-        /* What was actually received must be >= payload.
-         * This is an LASSERT, as lib_finalize() doesn't have a completion status. */
-        LASSERT (krx->krx_nob >= KQSW_HDR_SIZE + mlen);
+        /* What was actually received must be >= payload. */
         LASSERT (mlen <= rlen);
+        if (krx->krx_nob < KQSW_HDR_SIZE + mlen) {
+                CERROR("Bad message size: have %d, need %d + %d\n",
+                       krx->krx_nob, KQSW_HDR_SIZE, mlen);
+                return (PTL_FAIL);
+        }
 
         /* It must be OK to kmap() if required */
         LASSERT (kiov == NULL || !in_interrupt ());
@@ -1516,20 +1573,37 @@ kqswnal_recvmsg (nal_cb_t     *nal,
                 page_nob = PAGE_SIZE - KQSW_HDR_SIZE;
 
                 LASSERT (niov > 0);
+                
                 if (kiov != NULL) {
-                        iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
-                        iov_nob = kiov->kiov_len;
+                        /* skip complete frags */
+                        while (offset >= kiov->kiov_len) {
+                                offset -= kiov->kiov_len;
+                                kiov++;
+                                niov--;
+                                LASSERT (niov > 0);
+                        }
+                        iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset;
+                        iov_nob = kiov->kiov_len - offset;
                 } else {
-                        iov_ptr = iov->iov_base;
-                        iov_nob = iov->iov_len;
+                        /* skip complete frags */
+                        while (offset >= iov->iov_len) {
+                                offset -= iov->iov_len;
+                                iov++;
+                                niov--;
+                                LASSERT (niov > 0);
+                        }
+                        iov_ptr = iov->iov_base + offset;
+                        iov_nob = iov->iov_len - offset;
                 }
-
+                
                 for (;;)
                 {
-                        /* We expect the iov to exactly match mlen */
-                        LASSERT (iov_nob <= mlen);
-                        
-                        frag = MIN (page_nob, iov_nob);
+                        frag = mlen;
+                        if (frag > page_nob)
+                                frag = page_nob;
+                        if (frag > iov_nob)
+                                frag = iov_nob;
+
                         memcpy (iov_ptr, page_ptr, frag);
 #if KQSW_CHECKSUM
                         payload_csum = kqsw_csum (payload_csum, iov_ptr, frag);
@@ -1588,33 +1662,39 @@ kqswnal_recvmsg (nal_cb_t     *nal,
                        "csum_nob %d\n",
                         hdr_csum, payload_csum, csum_frags, csum_nob);
 #endif
-        lib_finalize(nal, private, libmsg);
+        lib_finalize(nal, private, libmsg, PTL_OK);
 
-        return (rlen);
+        return (PTL_OK);
 }
 
-static int
+static ptl_err_t
 kqswnal_recv(nal_cb_t     *nal,
              void         *private,
              lib_msg_t    *libmsg,
              unsigned int  niov,
              struct iovec *iov,
+             size_t        offset,
              size_t        mlen,
              size_t        rlen)
 {
-        return (kqswnal_recvmsg (nal, private, libmsg, niov, iov, NULL, mlen, rlen));
+        return (kqswnal_recvmsg(nal, private, libmsg, 
+                                niov, iov, NULL, 
+                                offset, mlen, rlen));
 }
 
-static int
+static ptl_err_t
 kqswnal_recv_pages (nal_cb_t     *nal,
                     void         *private,
                     lib_msg_t    *libmsg,
                     unsigned int  niov,
                     ptl_kiov_t   *kiov,
+                    size_t        offset,
                     size_t        mlen,
                     size_t        rlen)
 {
-        return (kqswnal_recvmsg (nal, private, libmsg, niov, NULL, kiov, mlen, rlen));
+        return (kqswnal_recvmsg(nal, private, libmsg, 
+                                niov, NULL, kiov, 
+                                offset, mlen, rlen));
 }
 
 int
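
The qswnal receive path above now takes a byte offset into the fragment list: whole fragments before the offset are skipped first, then data is copied fragment by fragment until mlen is consumed. A minimal user-space sketch of that walk, over plain POSIX iovecs rather than the kernel types (illustrative only, not the Lustre source):

    #include <assert.h>
    #include <string.h>
    #include <sys/uio.h>

    /* Copy 'len' bytes starting 'offset' bytes into an iovec array. */
    static void copy_from_iov(char *dst, int niov, const struct iovec *iov,
                              size_t offset, size_t len)
    {
            size_t nob;

            if (len == 0)
                    return;

            assert(niov > 0);
            while (offset >= iov->iov_len) {     /* skip complete fragments */
                    offset -= iov->iov_len;
                    iov++;
                    niov--;
                    assert(niov > 0);
            }

            do {                                 /* copy, spanning fragments */
                    assert(niov > 0);
                    nob = iov->iov_len - offset;
                    if (nob > len)
                            nob = len;
                    memcpy(dst, (char *)iov->iov_base + offset, nob);
                    dst += nob;
                    len -= nob;
                    iov++;
                    niov--;
                    offset = 0;
            } while (len > 0);
    }
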
index b31c2ea..52afb98 100644 (file)
@@ -176,7 +176,8 @@ kscimacnal_txrelease(mac_mblk_t *msg, mac_msg_status_t status, void *context)
                         break;
         }
 
-        lib_finalize(ktx->ktx_nal, ktx->ktx_private, ktx->ktx_cookie);
+        lib_finalize(ktx->ktx_nal, ktx->ktx_private, ktx->ktx_cookie,
+                     (err == 0) ? PTL_OK : PTL_FAIL);
 
         PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
 }
@@ -225,14 +226,14 @@ kscimacnal_sendmsg(nal_cb_t        *nal,
         if (buf_len > mac_get_mtusize(ksci->ksci_machandle)) {
                 CERROR("kscimacnal:request exceeds TX MTU size (%ld).\n",
                                 mac_get_mtusize(ksci->ksci_machandle));
-                return -EINVAL;
+                return PTL_FAIL;
         }
 
 
         /* save transaction info for later finalize and cleanup */
         PORTAL_ALLOC(ktx, (sizeof(kscimacnal_tx_t)));
         if (!ktx) {
-                return -ENOMEM;
+                return PTL_NOSPACE;
         }
 
         ktx->ktx_nmapped = 0; /* Start with no mapped pages :) */
@@ -247,7 +248,7 @@ kscimacnal_sendmsg(nal_cb_t        *nal,
                         kscimacnal_txrelease, ktx);
         if (!msg) {
                 PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
-                return -ENOMEM;
+                return PTL_NOSPACE;
         }
         mac_put_mblk(msg, sizeof(ptl_hdr_t));
         lastblk=msg;
@@ -284,7 +285,7 @@ kscimacnal_sendmsg(nal_cb_t        *nal,
                 if(!newblk) {
                         mac_free_msg(msg);
                         PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
-                        return -ENOMEM;
+                        return PTL_NOSPACE;
                 }
                 mac_put_mblk(newblk, nob);
                 mac_link_mblk(lastblk, newblk);
@@ -315,10 +316,10 @@ kscimacnal_sendmsg(nal_cb_t        *nal,
                 CERROR("kscimacnal: mac_send() failed, rc=%d\n", rc);
                 mac_free_msg(msg);
                 PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
-                return rc;
+                return PTL_FAIL;
         }
 
-        return 0;
+        return PTL_OK;
 }
 
 
@@ -463,12 +464,15 @@ kscimacnal_recvmsg(nal_cb_t     *nal,
                         krx->msg, mlen, rlen, niov);
 
         /* What was actually received must be >= what sender claims to have
-         * sent.  This is an LASSERT, since lib-move doesn't check cb return
-         * code yet. Also, rlen seems to be negative when mlen==0 so don't
-         * assert on that.
-         */
-        LASSERT (mlen==0 || mac_msg_size(krx->msg) >= sizeof(ptl_hdr_t)+rlen);
-        LASSERT (mlen==0 || mlen <= rlen);
+         * sent. */
+        LASSERT (mlen <= rlen); /* something is wrong if this isn't true */
+        if (mac_msg_size(krx->msg) < sizeof(ptl_hdr_t)+mlen) {
+                /* We didn't receive everything lib thinks we did */
+                CERROR("Bad message size: have %d, need %d + %d\n",
+                       mac_msg_size(krx->msg), sizeof(ptl_hdr_t), mlen);
+                return (PTL_FAIL);
+        }
+
         /* It must be OK to kmap() if required */
         LASSERT (kiov == NULL || !in_interrupt ());
         /* Either all pages or all vaddrs */
@@ -545,12 +549,12 @@ kscimacnal_recvmsg(nal_cb_t     *nal,
         CDEBUG(D_NET, "Calling lib_finalize.\n");
 
         PROF_START(lib_finalize);
-        lib_finalize(nal, private, cookie);
+        lib_finalize(nal, private, cookie, PTL_OK);
         PROF_FINISH(lib_finalize);
 
         CDEBUG(D_NET, "Done.\n");
 
-        return rlen;
+        return PTL_OK;
 }
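
With the send callback now returning ptl_err_t, the scimacnal path above maps its internal failures onto Portals codes: allocation failures become PTL_NOSPACE, everything else PTL_FAIL. A hedged sketch of that boundary translation; the enum below is a local stand-in, the real ptl_err_t values come from the portals errno header:

    #include <errno.h>

    /* Stand-in for the real ptl_err_t; only the mapping pattern matters. */
    typedef enum { PTL_OK = 0, PTL_FAIL, PTL_NOSPACE } ptl_err_t;

    static ptl_err_t errno2ptl(int rc)
    {
            if (rc == 0)
                    return PTL_OK;
            if (rc == -ENOMEM)
                    return PTL_NOSPACE;
            return PTL_FAIL;            /* -EINVAL, driver errors, ... */
    }
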
 
 
index 9ae1c87..c47dcb4 100644 (file)
@@ -993,15 +993,11 @@ ksocknal_destroy_conn (ksock_conn_t *conn)
         /* complete current receive if any */
         switch (conn->ksnc_rx_state) {
         case SOCKNAL_RX_BODY:
-#if 0
-                lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie);
-#else
-                CERROR ("Refusing to complete a partial receive from "
-                        LPX64", ip %d.%d.%d.%d:%d\n", conn->ksnc_peer->ksnp_nid,
-                        HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
-                CERROR ("This may hang communications and "
-                        "prevent modules from unloading\n");
-#endif
+                CERROR("Completing partial receive from "LPX64
+                       ", ip %d.%d.%d.%d:%d, with error\n",
+                       conn->ksnc_peer->ksnp_nid,
+                       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+                lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_FAIL);
                 break;
         case SOCKNAL_RX_BODY_FWD:
                 ksocknal_fmb_callback (conn->ksnc_cookie, -ECONNABORTED);
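
The teardown above now completes a half-received message with a failure status instead of refusing to touch it, so the waiting layer sees an event rather than a hang. A toy sketch of that policy; finalize() stands in for lib_finalize() with its new status argument:

    #include <stdio.h>

    enum status { ST_OK = 0, ST_FAIL = 1 };

    struct rx { int body_in_flight; void *cookie; };

    /* stand-in for lib_finalize(nal, private, msg, status) */
    static void finalize(void *cookie, enum status st)
    {
            printf("msg %p completed, status %d\n", cookie, st);
    }

    static void connection_teardown(struct rx *rx)
    {
            if (rx->body_in_flight)     /* a payload receive was interrupted */
                    finalize(rx->cookie, ST_FAIL);
    }
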
index 82d4c64..3ecead1 100644 (file)
@@ -29,7 +29,7 @@
  *  LIB functions follow
  *
  */
-int
+ptl_err_t
 ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr,
               user_ptr src_addr, size_t len)
 {
@@ -37,10 +37,10 @@ ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr,
                nal->ni.nid, (long)len, src_addr, dst_addr);
 
         memcpy( dst_addr, src_addr, len );
-        return 0;
+        return PTL_OK;
 }
 
-int
+ptl_err_t
 ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr,
                void *src_addr, size_t len)
 {
@@ -48,20 +48,7 @@ ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr,
                nal->ni.nid, (long)len, src_addr, dst_addr);
 
         memcpy( dst_addr, src_addr, len );
-        return 0;
-}
-
-int
-ksocknal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq,
-                         ptl_event_t *ev)
-{
-        CDEBUG(D_NET, LPX64": callback eq %p ev %p\n",
-               nal->ni.nid, eq, ev);
-
-        if (eq->event_callback != NULL)
-                eq->event_callback(ev);
-
-        return 0;
+        return PTL_OK;
 }
 
 void *
@@ -617,7 +604,8 @@ ksocknal_tx_done (ksock_tx_t *tx, int asynch)
 
         if (tx->tx_isfwd) {             /* was a forwarded packet? */
                 kpr_fwd_done (&ksocknal_data.ksnd_router,
-                              KSOCK_TX_2_KPR_FWD_DESC (tx), 0);
+                              KSOCK_TX_2_KPR_FWD_DESC (tx), 
+                              (tx->tx_resid == 0) ? 0 : -ECONNABORTED);
                 EXIT;
                 return;
         }
@@ -625,7 +613,8 @@ ksocknal_tx_done (ksock_tx_t *tx, int asynch)
         /* local send */
         ltx = KSOCK_TX_2_KSOCK_LTX (tx);
 
-        lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie);
+        lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie,
+                      (tx->tx_resid == 0) ? PTL_OK : PTL_FAIL);
 
         ksocknal_free_ltx (ltx);
         EXIT;
@@ -694,17 +683,17 @@ ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
         LASSERT (rc < 0);
 
         if (!conn->ksnc_closing)
-                CERROR ("[%p] Error %d on write to "LPX64
-                        " ip %d.%d.%d.%d:%d\n",conn, rc, 
-                        conn->ksnc_peer->ksnp_nid,
-                        HIPQUAD(conn->ksnc_ipaddr),
-                        conn->ksnc_port);
+                CERROR("[%p] Error %d on write to "LPX64
+                       " ip %d.%d.%d.%d:%d\n", conn, rc,
+                       conn->ksnc_peer->ksnp_nid,
+                       HIPQUAD(conn->ksnc_ipaddr),
+                       conn->ksnc_port);
 
         ksocknal_close_conn_and_siblings (conn, rc);
         ksocknal_tx_launched (tx);
-        
+
         return (rc);
-} 
+}
 
 void
 ksocknal_launch_autoconnect_locked (ksock_route_t *route)
@@ -742,21 +731,21 @@ ksocknal_find_target_peer_locked (ksock_tx_t *tx, ptl_nid_t nid)
         ptl_nid_t     target_nid;
         int           rc;
         ksock_peer_t *peer = ksocknal_find_peer_locked (nid);
-        
+
         if (peer != NULL)
                 return (peer);
-        
+
         if (tx->tx_isfwd) {
                 CERROR ("Can't send packet to "LPX64
-                        " %s: routed target is not a peer\n", 
+                       " %s: routed target is not a peer\n",
                         nid, portals_nid2str(SOCKNAL, nid, ipbuf));
                 return (NULL);
         }
-        
+
         rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, tx->tx_nob,
                          &target_nid);
         if (rc != 0) {
-                CERROR ("Can't route to "LPX64" %s: router error %d\n", 
+                CERROR ("Can't route to "LPX64" %s: router error %d\n",
                         nid, portals_nid2str(SOCKNAL, nid, ipbuf), rc);
                 return (NULL);
         }
@@ -1018,7 +1007,7 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid)
         return (-EHOSTUNREACH);
 }
 
-int
+ptl_err_t
 ksocknal_sendmsg(nal_cb_t     *nal, 
                  void         *private, 
                  lib_msg_t    *cookie,
@@ -1029,6 +1018,7 @@ ksocknal_sendmsg(nal_cb_t     *nal,
                  unsigned int  payload_niov, 
                  struct iovec *payload_iov, 
                  ptl_kiov_t   *payload_kiov,
+                 size_t        payload_offset,
                  size_t        payload_nob)
 {
         ksock_ltx_t  *ltx;
@@ -1091,20 +1081,19 @@ ksocknal_sendmsg(nal_cb_t     *nal,
                 ltx->ltx_tx.tx_kiov  = NULL;
                 ltx->ltx_tx.tx_nkiov = 0;
 
-                ltx->ltx_tx.tx_niov = 1 + payload_niov;
-
-                memcpy(ltx->ltx_iov + 1, payload_iov,
-                       payload_niov * sizeof (*payload_iov));
-
+                ltx->ltx_tx.tx_niov = 
+                        1 + lib_extract_iov(payload_niov, &ltx->ltx_iov[1],
+                                            payload_niov, payload_iov,
+                                            payload_offset, payload_nob);
         } else {
                 /* payload is all pages */
-                ltx->ltx_tx.tx_kiov = ltx->ltx_kiov;
-                ltx->ltx_tx.tx_nkiov = payload_niov;
-
                 ltx->ltx_tx.tx_niov = 1;
 
-                memcpy(ltx->ltx_kiov, payload_kiov, 
-                       payload_niov * sizeof (*payload_kiov));
+                ltx->ltx_tx.tx_kiov = ltx->ltx_kiov;
+                ltx->ltx_tx.tx_nkiov =
+                        lib_extract_kiov(payload_niov, ltx->ltx_kiov,
+                                         payload_niov, payload_kiov,
+                                         payload_offset, payload_nob);
         }
 
         rc = ksocknal_launch_packet(&ltx->ltx_tx, nid);
@@ -1115,28 +1104,28 @@ ksocknal_sendmsg(nal_cb_t     *nal,
         return (PTL_FAIL);
 }
 
-int
+ptl_err_t
 ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie,
                ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
                unsigned int payload_niov, struct iovec *payload_iov,
-               size_t payload_len)
+               size_t payload_offset, size_t payload_len)
 {
         return (ksocknal_sendmsg(nal, private, cookie,
                                  hdr, type, nid, pid,
                                  payload_niov, payload_iov, NULL,
-                                 payload_len));
+                                 payload_offset, payload_len));
 }
 
-int
+ptl_err_t
 ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie, 
                      ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
                      unsigned int payload_niov, ptl_kiov_t *payload_kiov, 
-                     size_t payload_len)
+                     size_t payload_offset, size_t payload_len)
 {
         return (ksocknal_sendmsg(nal, private, cookie,
                                  hdr, type, nid, pid,
                                  payload_niov, NULL, payload_kiov,
-                                 payload_len));
+                                 payload_offset, payload_len));
 }
 
 void
@@ -1208,7 +1197,7 @@ ksocknal_fmb_callback (void *arg, int error)
 
         /* drop peer ref taken on init */
         ksocknal_put_peer (fmb->fmb_peer);
-        
+
         spin_lock_irqsave (&fmp->fmp_lock, flags);
 
         list_add (&fmb->fmb_list, &fmp->fmp_idle_fmbs);
@@ -1591,7 +1580,7 @@ ksocknal_process_receive (ksock_conn_t *conn)
 
         case SOCKNAL_RX_BODY:
                 /* payload all received */
-                lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie);
+                lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_OK);
                 /* Fall through */
 
         case SOCKNAL_RX_SLOP:
@@ -1627,9 +1616,10 @@ ksocknal_process_receive (ksock_conn_t *conn)
         return (-EINVAL);                       /* keep gcc happy */
 }
 
-int
+ptl_err_t
 ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg,
-               unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen)
+               unsigned int niov, struct iovec *iov, 
+               size_t offset, size_t mlen, size_t rlen)
 {
         ksock_conn_t *conn = (ksock_conn_t *)private;
 
@@ -1642,20 +1632,22 @@ ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg,
 
         conn->ksnc_rx_nkiov = 0;
         conn->ksnc_rx_kiov = NULL;
-        conn->ksnc_rx_niov = niov;
         conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov;
-        memcpy (conn->ksnc_rx_iov, iov, niov * sizeof (*iov));
+        conn->ksnc_rx_niov =
+                lib_extract_iov(PTL_MD_MAX_IOV, conn->ksnc_rx_iov,
+                                niov, iov, offset, mlen);
 
         LASSERT (mlen == 
                  lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
                  lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
 
-        return (rlen);
+        return (PTL_OK);
 }
 
-int
+ptl_err_t
 ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg,
-                     unsigned int niov, ptl_kiov_t *kiov, size_t mlen, size_t rlen)
+                     unsigned int niov, ptl_kiov_t *kiov, 
+                     size_t offset, size_t mlen, size_t rlen)
 {
         ksock_conn_t *conn = (ksock_conn_t *)private;
 
@@ -1668,15 +1660,16 @@ ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg,
 
         conn->ksnc_rx_niov = 0;
         conn->ksnc_rx_iov  = NULL;
-        conn->ksnc_rx_nkiov = niov;
         conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov;
-        memcpy (conn->ksnc_rx_kiov, kiov, niov * sizeof (*kiov));
+        conn->ksnc_rx_nkiov = 
+                lib_extract_kiov(PTL_MD_MAX_IOV, conn->ksnc_rx_kiov,
+                                 niov, kiov, offset, mlen);
 
         LASSERT (mlen == 
                  lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
                  lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
 
-        return (rlen);
+        return (PTL_OK);
 }
 
 int ksocknal_scheduler (void *arg)
@@ -2064,7 +2057,7 @@ ksocknal_hello (struct socket *sock, ptl_nid_t *nid, int *type, __u64 *incarnati
                         rc, *nid, portals_nid2str(SOCKNAL, *nid, ipbuf));
                 return (rc);
         }
-        
+
         if (hmv->magic != __le32_to_cpu (PORTALS_PROTO_MAGIC)) {
                 CERROR ("Bad magic %#08x (%#08x expected) from "LPX64" %s\n",
                         __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC, *nid,
@@ -2118,7 +2111,7 @@ ksocknal_hello (struct socket *sock, ptl_nid_t *nid, int *type, __u64 *incarnati
         } else if (*nid != __le64_to_cpu (hdr.src_nid)) {
                 CERROR ("Connected to nid "LPX64" %s, but expecting "LPX64" %s\n",
                         __le64_to_cpu (hdr.src_nid),
-                        portals_nid2str(SOCKNAL, 
+                        portals_nid2str(SOCKNAL,
                                         __le64_to_cpu(hdr.src_nid),
                                         ipbuf),
                         *nid, portals_nid2str(SOCKNAL, *nid, ipbuf));
@@ -2139,7 +2132,7 @@ ksocknal_hello (struct socket *sock, ptl_nid_t *nid, int *type, __u64 *incarnati
                         *type = SOCKNAL_CONN_BULK_IN;
                         break;
                 default:
-                        CERROR ("Unexpected type %d from "LPX64" %s\n", 
+                        CERROR ("Unexpected type %d from "LPX64" %s\n",
                                 *type, *nid,
                                 portals_nid2str(SOCKNAL, *nid, ipbuf));
                         return (-EPROTO);
@@ -2346,8 +2339,8 @@ ksocknal_connect_peer (ksock_route_t *route, int type)
         if (rc != 0) {
                 CERROR ("Error %d connecting to "LPX64" %s\n", rc,
                         route->ksnr_peer->ksnp_nid,
-                        portals_nid2str(SOCKNAL, 
-                                        route->ksnr_peer->ksnp_nid, 
+                        portals_nid2str(SOCKNAL,
+                                        route->ksnr_peer->ksnp_nid,
                                         ipbuf));
                 goto out;
         }
@@ -2432,7 +2425,7 @@ ksocknal_autoconnect (ksock_route_t *route)
         while (!list_empty (&zombies)) {
                 char ipbuf[PTL_NALFMT_SIZE];
                 tx = list_entry (zombies.next, ksock_tx_t, tx_list);
-                
+
                 CERROR ("Deleting packet type %d len %d ("LPX64" %s->"LPX64" %s)\n",
                         NTOH__u32 (tx->tx_hdr->type),
                         NTOH__u32 (tx->tx_hdr->payload_length),
@@ -2719,7 +2712,6 @@ nal_cb_t ksocknal_lib = {
         cb_recv_pages:   ksocknal_recv_pages,
         cb_read:         ksocknal_read,
         cb_write:        ksocknal_write,
-        cb_callback:     ksocknal_callback,
         cb_malloc:       ksocknal_malloc,
         cb_free:         ksocknal_free,
         cb_printf:       ksocknal_printf,
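
Both the send and receive setup in this file now carve an (offset, length) window out of the caller's fragment list via lib_extract_iov()/lib_extract_kiov() instead of copying the whole array. Below is an illustrative stand-in for the iovec case, including the destination-capacity check and the byte-count helper the receive path keeps asserting against; it is a sketch, not the library implementation:

    #include <assert.h>
    #include <stddef.h>
    #include <sys/uio.h>

    /* Fill 'dst' with the subset of 'src' covering (offset, len); return the
     * number of destination fragments used.  Not destructive to 'src'. */
    static int iov_subset(int dst_max, struct iovec *dst,
                          int src_niov, const struct iovec *src,
                          size_t offset, size_t len)
    {
            int n = 0;

            if (len == 0)
                    return 0;

            assert(src_niov > 0);
            while (offset >= src->iov_len) {     /* skip whole fragments */
                    offset -= src->iov_len;
                    src++;
                    src_niov--;
                    assert(src_niov > 0);
            }

            while (len > 0) {
                    size_t frag = src->iov_len - offset;

                    assert(n < dst_max);
                    assert(src_niov > 0);
                    if (frag > len)
                            frag = len;
                    dst[n].iov_base = (char *)src->iov_base + offset;
                    dst[n].iov_len  = frag;
                    len -= frag;
                    n++;
                    src++;
                    src_niov--;
                    offset = 0;
            }
            return n;
    }

    /* Total bytes described by an iovec array, as lib_iov_nob() reports. */
    static size_t iov_nob(int niov, const struct iovec *iov)
    {
            size_t nob = 0;

            while (niov-- > 0)
                    nob += (iov++)->iov_len;
            return nob;
    }
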
index 2768c8d..2f5a852 100644 (file)
@@ -812,9 +812,11 @@ EXPORT_SYMBOL(PtlMDBind);
 EXPORT_SYMBOL(lib_iov_nob);
 EXPORT_SYMBOL(lib_copy_iov2buf);
 EXPORT_SYMBOL(lib_copy_buf2iov);
+EXPORT_SYMBOL(lib_extract_iov);
 EXPORT_SYMBOL(lib_kiov_nob);
 EXPORT_SYMBOL(lib_copy_kiov2buf);
 EXPORT_SYMBOL(lib_copy_buf2kiov);
+EXPORT_SYMBOL(lib_extract_kiov);
 EXPORT_SYMBOL(lib_finalize);
 EXPORT_SYMBOL(lib_parse);
 EXPORT_SYMBOL(lib_fake_reply_msg);
index 8c03749..d17db61 100644 (file)
@@ -6,5 +6,9 @@
 
 CPPFLAGS=
 INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include
-lib_LIBRARIES= libportals.a
+noinst_LIBRARIES= libportals.a
 libportals_a_SOURCES= api-eq.c api-init.c api-me.c api-errno.c api-ni.c api-wrap.c lib-dispatch.c lib-init.c lib-me.c lib-msg.c lib-eq.c lib-md.c lib-move.c lib-ni.c lib-pid.c
+
+if LIBLUSTRE
+libportals_a_CFLAGS= -fPIC
+endif
index 9bc9c36..964b9d8 100644 (file)
@@ -81,12 +81,6 @@ int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t * ev)
 
         *ev = *new_event;
 
-        /* Set the unlinked_me interface number if there is one to pass
-         * back, since the NAL hasn't a clue what it is and therefore can't
-         * set it. */
-        if (!PtlHandleEqual (ev->unlinked_me, PTL_HANDLE_NONE))
-                ev->unlinked_me.nal_idx = eventq.nal_idx;
-        
         /* ensure event is delivered correctly despite possible 
            races with lib_finalize */
         if (eq->sequence != new_event->sequence) {
@@ -119,6 +113,7 @@ int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out)
 }
 
 #ifndef __KERNEL__
+#if 0
 static jmp_buf eq_jumpbuf;
 
 static void eq_timeout(int signal)
@@ -162,6 +157,46 @@ int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out,
 
         return rc;
 }
+#else
+#include <errno.h>
 
-#endif
+/* FIXME
+ * The timeout here needs a trick with tcpnal; definitely unclean, but OK
+ * for the moment.
+ */
+
+/* global variables defined by tcpnal */
+extern int __tcpnal_eqwait_timeout_value;
+extern int __tcpnal_eqwait_timedout;
+
+int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out,
+                      int timeout)
+{
+        int rc;
 
+        if (!timeout)
+                return PtlEQWait(eventq_in, event_out);
+
+        __tcpnal_eqwait_timeout_value = timeout;
+
+        while ((rc = PtlEQGet(eventq_in, event_out)) == PTL_EQ_EMPTY) {
+                nal_t *nal = ptl_hndl2nal(&eventq_in);
+                
+                if (nal->yield)
+                        nal->yield(nal);
+
+                if (__tcpnal_eqwait_timedout) {
+                        if (__tcpnal_eqwait_timedout != ETIMEDOUT)
+                                printf("Warning: yield returned error %d\n",
+                                        __tcpnal_eqwait_timedout);
+                        rc = PTL_EQ_EMPTY;
+                        break;
+                }
+        }
+
+        __tcpnal_eqwait_timeout_value = 0;
+
+        return rc;
+}
+#endif
+#endif /* __KERNEL__ */
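
In user space the timed wait above degenerates into a poll loop: try a non-blocking get, yield to the NAL between attempts, and give up once a timeout flag set on the tcpnal side fires. A self-contained sketch of the same shape, substituting a wall-clock deadline for the tcpnal globals; poll_once() and do_yield() are hypothetical stand-ins for PtlEQGet() and nal->yield():

    #include <stdio.h>
    #include <time.h>

    static int have_event;              /* set elsewhere when an event lands */

    static int poll_once(void)          /* 1 == event consumed, 0 == empty */
    {
            if (!have_event)
                    return 0;
            have_event = 0;
            return 1;
    }

    static void do_yield(void)          /* let other work make progress */
    {
            struct timespec ts = { 0, 1000000 };    /* 1 ms */
            nanosleep(&ts, NULL);
    }

    /* Returns 1 if an event arrived before the deadline, 0 on timeout. */
    static int wait_event_timeout(int timeout_ms)
    {
            time_t deadline = time(NULL) + (timeout_ms + 999) / 1000;

            while (!poll_once()) {
                    if (time(NULL) >= deadline)
                            return 0;
                    do_yield();
            }
            return 1;
    }

    int main(void)
    {
            printf("got event: %d\n", wait_event_timeout(100));
            return 0;
    }
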
index 026c93b..b5e7aa1 100644 (file)
@@ -50,6 +50,5 @@ const char *ptl_err_str[] = {
         "PTL_IOV_TOO_SMALL",
 
         "PTL_EQ_INUSE",
-        "PTL_MD_INUSE"
 };
 /* If you change these, you must update the number table in portals/errno.h */
index b2e069e..18eea91 100644 (file)
@@ -125,7 +125,7 @@ int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size,
                 if (ptl_interfaces[i] == nal) {
                         nal->refct++;
                         handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | i;
-                        fprintf(stderr, "Returning existing NAL (%d)\n", i);
+                        CDEBUG(D_OTHER, "Returning existing NAL (%d)\n", i);
                         ptl_ni_init_mutex_exit ();
                         return PTL_OK;
                 }
index e54707f..d23a6aa 100644 (file)
@@ -32,7 +32,7 @@ static int do_forward(ptl_handle_any_t any_h, int cmd, void *argbuf,
         nal_t *nal;
 
         if (!ptl_init) {
-                fprintf(stderr, "PtlGetId: Not initialized\n");
+                CERROR("Not initialized\n");
                 return PTL_NOINIT;
         }
 
@@ -262,7 +262,7 @@ static int validate_md(ptl_handle_any_t current_in, ptl_md_t md_in)
         int i;
 
         if (!ptl_init) {
-                fprintf(stderr, "PtlMDAttach/Bind/Update: Not initialized\n");
+                CERROR("PtlMDAttach/Bind/Update: Not initialized\n");
                 return PTL_NOINIT;
         }
 
index 0765498..d4d8860 100644 (file)
 # include <sys/time.h>
 #endif
 
-#ifndef PTL_USE_DESC_LISTS
-static int ptl_slab_users;
-
-atomic_t md_in_use_count = ATOMIC_INIT(0);
-atomic_t msg_in_use_count = ATOMIC_INIT(0);
-atomic_t me_in_use_count = ATOMIC_INIT(0);
-atomic_t eq_in_use_count = ATOMIC_INIT(0);
+#ifndef PTL_USE_LIB_FREELIST
 
 int
 kportal_descriptor_setup (nal_cb_t *nal)
 {
-        ptl_slab_users++;
-        RETURN(PTL_OK);
+        return PTL_OK;
 }
 
 void
 kportal_descriptor_cleanup (nal_cb_t *nal)
 {
-        if (--ptl_slab_users != 0)
-                return;
-
-        LASSERT (atomic_read (&md_in_use_count) == 0);
-        LASSERT (atomic_read (&me_in_use_count) == 0);
-        LASSERT (atomic_read (&eq_in_use_count) == 0);
-        LASSERT (atomic_read (&msg_in_use_count) == 0);
 }
 #else
 
index be6949c..a1ed583 100644 (file)
@@ -83,7 +83,7 @@ static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private,
         int           rc;
         int           i;
 
-        /* NB we are passes an allocated, but uninitialised/active md.
+        /* NB we are passed an allocated, but uninitialised/active md.
          * if we return success, caller may lib_md_unlink() it.
          * otherwise caller may only lib_md_free() it.
          */
@@ -94,9 +94,10 @@ static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private,
                         return PTL_INV_EQ;
         }
 
-        if ((md->options & PTL_MD_IOV) != 0 &&  /* discontiguous MD */
-            md->niov > PTL_MD_MAX_IOV)          /* too many fragments */
-                return PTL_IOV_TOO_MANY;
+        /* Must check this _before_ allocation.  Also, note that non-iov
+         * MDs must set md_niov to 0. */
+        LASSERT((md->options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0 ||
+                md->niov <= PTL_MD_MAX_IOV);
 
         if ((md->options & max_size_opts) != 0 && /* max size used */
             (md->max_size < 0 || md->max_size > md->length)) // illegal max_size
@@ -239,7 +240,11 @@ int do_PtlMDAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
         lib_md_t *md;
         unsigned long flags;
 
-        md = lib_md_alloc (nal);
+        if ((args->md_in.options & (PTL_MD_KIOV | PTL_MD_IOV)) != 0 &&
+            args->md_in.niov > PTL_MD_MAX_IOV) /* too many fragments */
+                return (ret->rc = PTL_IOV_TOO_MANY);
+
+        md = lib_md_alloc(nal, &args->md_in);
         if (md == NULL)
                 return (ret->rc = PTL_NOSPACE);
 
@@ -287,7 +292,11 @@ int do_PtlMDBind(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
         lib_md_t *md;
         unsigned long flags;
 
-        md = lib_md_alloc (nal);
+        if ((args->md_in.options & (PTL_MD_KIOV | PTL_MD_IOV)) != 0 &&
+            args->md_in.niov > PTL_MD_MAX_IOV) /* too many fragments */
+                return (ret->rc = PTL_IOV_TOO_MANY);
+
+        md = lib_md_alloc(nal, &args->md_in);
         if (md == NULL)
                 return (ret->rc = PTL_NOSPACE);
 
@@ -311,34 +320,43 @@ int do_PtlMDBind(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
 
 int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
 {
-        PtlMDUnlink_in *args = v_args;
+        PtlMDUnlink_in  *args = v_args;
         PtlMDUnlink_out *ret = v_ret;
-
-        lib_md_t *md;
-        unsigned long flags;
+        ptl_event_t      ev;
+        lib_md_t        *md;
+        unsigned long    flags;
 
         state_lock(nal, &flags);
 
         md = ptl_handle2md(&args->md_in, nal);
         if (md == NULL) {
-                ret->rc = PTL_INV_MD;
-        } else if (md->pending != 0) {           /* being filled/spilled */
-                ret->rc = PTL_MD_INUSE;
-        } else {
-                /* Callers attempting to unlink a busy MD which will get
-                 * unlinked once the net op completes should see INUSE,
-                 * before completion and INV_MD thereafter.  LASSERT we've
-                 * got that right... */
-                LASSERT ((md->md_flags & PTL_MD_FLAG_UNLINK) == 0);
-
-                lib_md_deconstruct(nal, md, &ret->status_out);
-                lib_md_unlink(nal, md);
-                ret->rc = PTL_OK;
+                state_unlock(nal, &flags);
+                return (ret->rc = PTL_INV_MD);
+        }
+
+        /* If the MD is busy, lib_md_unlink just marks it for deletion, and
+         * when the NAL is done, the completion event flags that the MD was
+         * unlinked.  Otherwise, we enqueue an event now... */
+
+        if (md->eq != NULL &&
+            md->pending == 0) {
+                memset(&ev, 0, sizeof(ev));
+
+                ev.type = PTL_EVENT_UNLINK;
+                ev.status = PTL_OK;
+                ev.unlinked = 1;
+                lib_md_deconstruct(nal, md, &ev.mem_desc);
+                
+                lib_enq_event_locked(nal, private, md->eq, &ev);
         }
 
+        lib_md_deconstruct(nal, md, &ret->status_out);
+        lib_md_unlink(nal, md);
+        ret->rc = PTL_OK;
+
         state_unlock(nal, &flags);
 
-        return (ret->rc);
+        return (PTL_OK);
 }
 
 int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args,
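
do_PtlMDUnlink() above now posts a synthetic UNLINK event itself when the MD is idle and has an event queue; a busy MD is merely marked, and its final completion event carries the unlinked flag instead. A simplified sketch of that decision, with stand-in types and an illustrative event-type value:

    #include <stdio.h>

    struct eq;                              /* opaque event queue */
    struct md { struct eq *eq; int pending; };

    struct event { int type; int unlinked; };
    #define EV_UNLINK 5                     /* illustrative value */

    static void enqueue(struct eq *eq, struct event *ev)
    {
            (void)eq;
            printf("event type %d, unlinked=%d\n", ev->type, ev->unlinked);
    }

    static void md_unlink(struct md *md)
    {
            if (md->eq != NULL && md->pending == 0) {
                    struct event ev = { EV_UNLINK, 1 };
                    enqueue(md->eq, &ev);
            }
            /* ...then deconstruct and unlink the MD unconditionally... */
    }
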
@@ -379,6 +397,23 @@ int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args,
                 goto out;
         }
 
+        /* XXX fttb, the new MD must be the same type wrt fragmentation */
+        if (((new->options ^ md->options) & 
+             (PTL_MD_IOV | PTL_MD_KIOV)) != 0) {
+                ret->rc = PTL_INV_MD;
+                goto out;
+        }
+
+        if (new->niov > md->md_niov) {
+                ret->rc = PTL_IOV_TOO_MANY;
+                goto out;
+        } 
+
+        if (new->niov < md->md_niov) {
+                ret->rc = PTL_IOV_TOO_SMALL;
+                goto out;
+        }
+
         if (!PtlHandleEqual (args->testq_in, PTL_EQ_NONE)) {
                 test_eq = ptl_handle2eq(&args->testq_in, nal);
                 if (test_eq == NULL) {
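
The update path above adds a guard that the new descriptor uses the same fragmentation kind as the old one: XOR-ing the two option words and masking with the IOV/KIOV bits isolates exactly the bits on which they differ. A tiny worked example (flag values are illustrative, not the real PTL_MD_* constants):

    #include <stdio.h>

    #define MD_IOV   0x01                   /* illustrative, not PTL_MD_IOV */
    #define MD_KIOV  0x02                   /* illustrative, not PTL_MD_KIOV */

    /* 1 if both option words agree on the fragmentation bits, else 0 */
    static int same_frag_kind(unsigned old_opts, unsigned new_opts)
    {
            return ((old_opts ^ new_opts) & (MD_IOV | MD_KIOV)) == 0;
    }

    int main(void)
    {
            printf("%d\n", same_frag_kind(MD_IOV, MD_IOV | 0x10));   /* 1 */
            printf("%d\n", same_frag_kind(MD_IOV, MD_KIOV));         /* 0 */
            return 0;
    }
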
index d844a7a..ecd543c 100644 (file)
@@ -258,55 +258,78 @@ lib_iov_nob (int niov, struct iovec *iov)
 }
 
 void
-lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len)
+lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, 
+                  ptl_size_t offset, ptl_size_t len)
 {
         ptl_size_t nob;
 
-        while (len > 0)
-        {
+        if (len == 0)
+                return;
+        
+        /* skip complete frags before 'offset' */
+        LASSERT (niov > 0);
+        while (offset >= iov->iov_len) {
+                offset -= iov->iov_len;
+                iov++;
+                niov--;
+                LASSERT (niov > 0);
+        }
+                
+        do {
                 LASSERT (niov > 0);
-                nob = MIN (iov->iov_len, len);
-                memcpy (dest, iov->iov_base, nob);
+                nob = MIN (iov->iov_len - offset, len);
+                memcpy (dest, iov->iov_base + offset, nob);
 
                 len -= nob;
                 dest += nob;
                 niov--;
                 iov++;
-        }
+                offset = 0;
+        } while (len > 0);
 }
 
 void
-lib_copy_buf2iov (int niov, struct iovec *iov, char *src, ptl_size_t len)
+lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset, 
+                  char *src, ptl_size_t len)
 {
         ptl_size_t nob;
 
-        while (len > 0)
-        {
+        if (len == 0)
+                return;
+
+        /* skip complete frags before 'offset' */
+        LASSERT (niov > 0);
+        while (offset >= iov->iov_len) {
+                offset -= iov->iov_len;
+                iov++;
+                niov--;
                 LASSERT (niov > 0);
-                nob = MIN (iov->iov_len, len);
-                memcpy (iov->iov_base, src, nob);
+        }
+        
+        do {
+                LASSERT (niov > 0);
+                nob = MIN (iov->iov_len - offset, len);
+                memcpy (iov->iov_base + offset, src, nob);
                 
                 len -= nob;
                 src += nob;
                 niov--;
                 iov++;
-        }
+                offset = 0;
+        } while (len > 0);
 }
 
-static int
-lib_extract_iov (struct iovec *dst, lib_md_t *md,
+int
+lib_extract_iov (int dst_niov, struct iovec *dst,
+                 int src_niov, struct iovec *src,
                  ptl_size_t offset, ptl_size_t len)
 {
         /* Initialise 'dst' to the subset of 'src' starting at 'offset',
          * for exactly 'len' bytes, and return the number of entries.
          * NB not destructive to 'src' */
-        int             src_niov = md->md_niov;  
-        struct iovec   *src = md->md_iov.iov;
         ptl_size_t      frag_len;
-        int             dst_niov;
+        int             niov;
 
-        LASSERT (offset + len <= md->length);
-        
         if (len == 0)                           /* no data => */
                 return (0);                     /* no frags */
 
@@ -318,17 +341,17 @@ lib_extract_iov (struct iovec *dst, lib_md_t *md,
                 LASSERT (src_niov > 0);
         }
 
-        dst_niov = 1;
+        niov = 1;
         for (;;) {
                 LASSERT (src_niov > 0);
-                LASSERT (dst_niov <= PTL_MD_MAX_IOV);
+                LASSERT (niov <= dst_niov);
                 
                 frag_len = src->iov_len - offset;
                 dst->iov_base = ((char *)src->iov_base) + offset;
 
                 if (len <= frag_len) {
                         dst->iov_len = len;
-                        return (dst_niov);
+                        return (niov);
                 }
                 
                 dst->iov_len = frag_len;
@@ -336,7 +359,7 @@ lib_extract_iov (struct iovec *dst, lib_md_t *md,
                 len -= frag_len;
                 dst++;
                 src++;
-                dst_niov++;
+                niov++;
                 src_niov--;
                 offset = 0;
         }
@@ -351,19 +374,22 @@ lib_kiov_nob (int niov, ptl_kiov_t *kiov)
 }
 
 void
-lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len)
+lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, 
+                   ptl_size_t offset, ptl_size_t len)
 {
         LASSERT (0);
 }
 
 void
-lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *dest, ptl_size_t len)
+lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset,
+                   char *src, ptl_size_t len)
 {
         LASSERT (0);
 }
 
-static int
-lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
+int
+lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, 
+                  int src_niov, ptl_kiov_t *src,
                   ptl_size_t offset, ptl_size_t len)
 {
         LASSERT (0);
@@ -383,18 +409,30 @@ lib_kiov_nob (int niov, ptl_kiov_t *kiov)
 }
 
 void
-lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len)
+lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, 
+                   ptl_size_t offset, ptl_size_t len)
 {
         ptl_size_t  nob;
         char       *addr;
+
+        if (len == 0)
+                return;
         
         LASSERT (!in_interrupt ());
-        while (len > 0)
-        {
+
+        LASSERT (niov > 0);
+        while (offset >= kiov->kiov_len) {
+                offset -= kiov->kiov_len;
+                kiov++;
+                niov--;
+                LASSERT (niov > 0);
+        }
+        
+        do {
                 LASSERT (niov > 0);
-                nob = MIN (kiov->kiov_len, len);
+                nob = MIN (kiov->kiov_len - offset, len);
                 
-                addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+                addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset;
                 memcpy (dest, addr, nob);
                 kunmap (kiov->kiov_page);
                 
@@ -402,22 +440,35 @@ lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len)
                 dest += nob;
                 niov--;
                 kiov++;
-        }
+                offset = 0;
+        } while (len > 0);
 }
 
 void
-lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *src, ptl_size_t len)
+lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset,
+                   char *src, ptl_size_t len)
 {
         ptl_size_t  nob;
         char       *addr;
 
+        if (len == 0)
+                return;
+
         LASSERT (!in_interrupt ());
-        while (len > 0)
-        {
+
+        LASSERT (niov > 0);
+        while (offset >= kiov->kiov_len) {
+                offset -= kiov->kiov_len;
+                kiov++;
+                niov--;
+                LASSERT (niov > 0);
+        }
+        
+        do {
                 LASSERT (niov > 0);
-                nob = MIN (kiov->kiov_len, len);
+                nob = MIN (kiov->kiov_len - offset, len);
                 
-                addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+                addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset;
                 memcpy (addr, src, nob);
                 kunmap (kiov->kiov_page);
                 
@@ -425,23 +476,21 @@ lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *src, ptl_size_t len)
                 src += nob;
                 niov--;
                 kiov++;
-        }
+                offset = 0;
+        } while (len > 0);
 }
 
-static int
-lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
+int
+lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, 
+                  int src_niov, ptl_kiov_t *src,
                   ptl_size_t offset, ptl_size_t len)
 {
         /* Initialise 'dst' to the subset of 'src' starting at 'offset',
          * for exactly 'len' bytes, and return the number of entries.
          * NB not destructive to 'src' */
-        int             src_niov = md->md_niov;  
-        ptl_kiov_t     *src = md->md_iov.kiov;
         ptl_size_t      frag_len;
-        int             dst_niov;
+        int             niov;
 
-        LASSERT (offset + len <= md->length);
-        
         if (len == 0)                           /* no data => */
                 return (0);                     /* no frags */
 
@@ -453,10 +502,10 @@ lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
                 LASSERT (src_niov > 0);
         }
 
-        dst_niov = 1;
+        niov = 1;
         for (;;) {
                 LASSERT (src_niov > 0);
-                LASSERT (dst_niov <= PTL_MD_MAX_IOV);
+                LASSERT (niov <= dst_niov);
                 
                 frag_len = src->kiov_len - offset;
                 dst->kiov_page = src->kiov_page;
@@ -465,7 +514,7 @@ lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
                 if (len <= frag_len) {
                         dst->kiov_len = len;
                         LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE);
-                        return (dst_niov);
+                        return (niov);
                 }
 
                 dst->kiov_len = frag_len;
@@ -474,73 +523,66 @@ lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
                 len -= frag_len;
                 dst++;
                 src++;
-                dst_niov++;
+                niov++;
                 src_niov--;
                 offset = 0;
         }
 }
 #endif
 
-void
+ptl_err_t
 lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
           ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen)
 {
-        int   niov;
-
         if (mlen == 0)
-                nal->cb_recv (nal, private, msg, 0, NULL, 0, rlen);
-        else if ((md->options & PTL_MD_KIOV) == 0) {
-                niov = lib_extract_iov (msg->msg_iov.iov, md, offset, mlen);
-                nal->cb_recv (nal, private, msg,
-                              niov, msg->msg_iov.iov, mlen, rlen);
-        } else {
-                niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, mlen);
-                nal->cb_recv_pages (nal, private, msg, 
-                                    niov, msg->msg_iov.kiov, mlen, rlen);
-        }
+                return (nal->cb_recv(nal, private, msg,
+                                     0, NULL,
+                                     offset, mlen, rlen));
+
+        if ((md->options & PTL_MD_KIOV) == 0)
+                return (nal->cb_recv(nal, private, msg,
+                                     md->md_niov, md->md_iov.iov, 
+                                     offset, mlen, rlen));
+
+        return (nal->cb_recv_pages(nal, private, msg, 
+                                   md->md_niov, md->md_iov.kiov,
+                                   offset, mlen, rlen));
 }
 
-int
+ptl_err_t
 lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
           ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
           lib_md_t *md, ptl_size_t offset, ptl_size_t len) 
 {
-        int   niov;
-
         if (len == 0)
-                return (nal->cb_send (nal, private, msg, 
-                                      hdr, type, nid, pid,
-                                      0, NULL, 0));
+                return (nal->cb_send(nal, private, msg,
+                                     hdr, type, nid, pid,
+                                     0, NULL,
+                                     offset, len));
         
-        if ((md->options & PTL_MD_KIOV) == 0) {
-                niov = lib_extract_iov (msg->msg_iov.iov, md, offset, len);
-                return (nal->cb_send (nal, private, msg, 
-                                      hdr, type, nid, pid,
-                                      niov, msg->msg_iov.iov, len));
-        }
-
-        niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, len);
-        return (nal->cb_send_pages (nal, private, msg, 
-                                    hdr, type, nid, pid,
-                                    niov, msg->msg_iov.kiov, len));
+        if ((md->options & PTL_MD_KIOV) == 0)
+                return (nal->cb_send(nal, private, msg, 
+                                     hdr, type, nid, pid,
+                                     md->md_niov, md->md_iov.iov,
+                                     offset, len));
+
+        return (nal->cb_send_pages(nal, private, msg, 
+                                   hdr, type, nid, pid,
+                                   md->md_niov, md->md_iov.kiov,
+                                   offset, len));
 }
 
-static lib_msg_t *
-get_new_msg (nal_cb_t *nal, lib_md_t *md)
+static void
+lib_commit_md (nal_cb_t *nal, lib_md_t *md, lib_msg_t *msg)
 {
         /* ALWAYS called holding the state_lock */
         lib_counters_t *counters = &nal->ni.counters;
-        lib_msg_t      *msg      = lib_msg_alloc (nal);
-
-        if (msg == NULL)
-                return (NULL);
-
-        memset (msg, 0, sizeof (*msg));
-
-        msg->send_ack = 0;
 
+        /* Here, we commit the MD to a network OP by marking it busy and
+         * decrementing its threshold.  Come what may, the network "owns"
+         * the MD until a call to lib_finalize() signals completion. */
         msg->md = md;
-        do_gettimeofday(&msg->ev.arrival_time);
+         
         md->pending++;
         if (md->threshold != PTL_MD_THRESH_INF) {
                 LASSERT (md->threshold > 0);
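
lib_commit_md() above replaces get_new_msg(): the caller now supplies the message, and committing it to a network operation means bumping the MD's pending count and consuming one unit of threshold unless the threshold is infinite. A hedged sketch of just that bookkeeping, with a stand-in sentinel for PTL_MD_THRESH_INF:

    #include <assert.h>

    #define THRESH_INF (-1)                 /* stand-in for PTL_MD_THRESH_INF */

    struct md { int pending; int threshold; };

    /* Pin the MD for one network op; the NAL owns it until finalize. */
    static void commit_md(struct md *md)
    {
            md->pending++;
            if (md->threshold != THRESH_INF) {
                    assert(md->threshold > 0);
                    md->threshold--;
            }
    }
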
@@ -552,8 +594,24 @@ get_new_msg (nal_cb_t *nal, lib_md_t *md)
                 counters->msgs_max = counters->msgs_alloc;
 
         list_add (&msg->msg_list, &nal->ni.ni_active_msgs);
+}
 
-        return (msg);
+static void
+lib_drop_message (nal_cb_t *nal, void *private, ptl_hdr_t *hdr)
+{
+        unsigned long flags;
+
+        /* CAVEAT EMPTOR: this only drops messages that we've not committed
+         * to receive (init_msg() not called) and therefore can't cause an
+         * event. */
+        
+        state_lock(nal, &flags);
+        nal->ni.counters.drop_count++;
+        nal->ni.counters.drop_length += hdr->payload_length;
+        state_unlock(nal, &flags);
+
+        /* NULL msg => if NAL calls lib_finalize it will be a noop */
+        (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length);
 }
 
 /*
@@ -563,17 +621,18 @@ get_new_msg (nal_cb_t *nal, lib_md_t *md)
  * of long messages.
  *
  */
-static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+static ptl_err_t
+parse_put(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
 {
         lib_ni_t        *ni = &nal->ni;
         ptl_size_t       mlength = 0;
         ptl_size_t       offset = 0;
         int              unlink = 0;
+        ptl_err_t        rc;
         lib_me_t        *me;
         lib_md_t        *md;
-        lib_msg_t       *msg;
         unsigned long    flags;
-
+                
         /* Convert put fields to host byte order */
         hdr->msg.put.match_bits = NTOH__u64 (hdr->msg.put.match_bits);
         hdr->msg.put.ptl_index = NTOH__u32 (hdr->msg.put.ptl_index);
@@ -586,8 +645,10 @@ static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
                          hdr->payload_length, hdr->msg.put.offset,
                          hdr->msg.put.match_bits,
                          &mlength, &offset, &unlink);
-        if (me == NULL)
-                goto drop;
+        if (me == NULL) {
+                state_unlock(nal, &flags);
+                return (PTL_FAIL);
+        }
 
         md = me->md;
         CDEBUG(D_NET, "Incoming put index %x from "LPU64"/%u of length %d/%d "
@@ -595,69 +656,46 @@ static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
                hdr->src_nid, hdr->src_pid, mlength, hdr->payload_length, 
                md->md_lh.lh_cookie, md->md_niov, offset);
 
-        msg = get_new_msg (nal, md);
-        if (msg == NULL) {
-                CERROR(LPU64": Dropping PUT from "LPU64": can't allocate msg\n",
-                       ni->nid, hdr->src_nid);
-                goto drop;
-        }
+        lib_commit_md(nal, md, msg);
+
+        msg->ev.type = PTL_EVENT_PUT;
+        msg->ev.initiator.nid = hdr->src_nid;
+        msg->ev.initiator.pid = hdr->src_pid;
+        msg->ev.portal = hdr->msg.put.ptl_index;
+        msg->ev.match_bits = hdr->msg.put.match_bits;
+        msg->ev.rlength = hdr->payload_length;
+        msg->ev.mlength = mlength;
+        msg->ev.offset = offset;
+        msg->ev.hdr_data = hdr->msg.put.hdr_data;
+
+        lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
 
         if (!ptl_is_wire_handle_none(&hdr->msg.put.ack_wmd) &&
             !(md->options & PTL_MD_ACK_DISABLE)) {
-                msg->send_ack = 1;
                 msg->ack_wmd = hdr->msg.put.ack_wmd;
-                msg->nid = hdr->src_nid;
-                msg->pid = hdr->src_pid;
-                msg->ev.match_bits = hdr->msg.put.match_bits;
-        }
-
-        if (md->eq) {
-                msg->ev.type = PTL_EVENT_PUT;
-                msg->ev.initiator.nid = hdr->src_nid;
-                msg->ev.initiator.pid = hdr->src_pid;
-                msg->ev.portal = hdr->msg.put.ptl_index;
-                msg->ev.match_bits = hdr->msg.put.match_bits;
-                msg->ev.rlength = hdr->payload_length;
-                msg->ev.mlength = mlength;
-                msg->ev.offset = offset;
-                msg->ev.hdr_data = hdr->msg.put.hdr_data;
-
-                /* NB if this match has exhausted the MD, we can't be sure
-                 * that this event will the the last one associated with
-                 * this MD in the event queue (another message already
-                 * matching this ME/MD could end up being last).  So we
-                 * remember the ME handle anyway and check again when we're
-                 * allocating our slot in the event queue.
-                 */
-                ptl_me2handle (&msg->ev.unlinked_me, me);
-
-                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
         }
 
         ni->counters.recv_count++;
         ni->counters.recv_length += mlength;
 
-        /* only unlink after MD's pending count has been bumped
-         * in get_new_msg() otherwise lib_me_unlink() will nuke it */
-        if (unlink) {
-                md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED;
+        /* only unlink after MD's pending count has been bumped in
+         * lib_commit_md() otherwise lib_me_unlink() will nuke it */
+        if (unlink)
                 lib_me_unlink (nal, me);
-        }
 
         state_unlock(nal, &flags);
 
-        lib_recv (nal, private, msg, md, offset, mlength, hdr->payload_length);
-        return 0;
+        rc = lib_recv(nal, private, msg, md, offset, mlength,
+                      hdr->payload_length);
+        if (rc != PTL_OK)
+                CERROR(LPU64": error on receiving PUT from "LPU64": %d\n",
+                       ni->nid, hdr->src_nid, rc);
 
- drop:
-        nal->ni.counters.drop_count++;
-        nal->ni.counters.drop_length += hdr->payload_length;
-        state_unlock (nal, &flags);
-        lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
-        return -1;
+        return (rc);
 }
 
-static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+static ptl_err_t
+parse_get(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
 {
         lib_ni_t        *ni = &nal->ni;
         ptl_size_t       mlength = 0;
@@ -665,7 +703,6 @@ static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
         int              unlink = 0;
         lib_me_t        *me;
         lib_md_t        *md;
-        lib_msg_t       *msg;
         ptl_hdr_t        reply;
         unsigned long    flags;
         int              rc;
@@ -683,8 +720,10 @@ static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
                          hdr->msg.get.sink_length, hdr->msg.get.src_offset,
                          hdr->msg.get.match_bits,
                          &mlength, &offset, &unlink);
-        if (me == NULL)
-                goto drop;
+        if (me == NULL) {
+                state_unlock(nal, &flags);
+                return (PTL_FAIL);
+        }
 
         md = me->md;
         CDEBUG(D_NET, "Incoming get index %d from "LPU64".%u of length %d/%d "
@@ -692,45 +731,27 @@ static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
                hdr->src_nid, hdr->src_pid, mlength, hdr->payload_length, 
                md->md_lh.lh_cookie, md->md_niov, offset);
 
-        msg = get_new_msg (nal, md);
-        if (msg == NULL) {
-                CERROR(LPU64": Dropping GET from "LPU64": can't allocate msg\n",
-                       ni->nid, hdr->src_nid);
-                goto drop;
-        }
+        lib_commit_md(nal, md, msg);
 
-        if (md->eq) {
-                msg->ev.type = PTL_EVENT_GET;
-                msg->ev.initiator.nid = hdr->src_nid;
-                msg->ev.initiator.pid = hdr->src_pid;
-                msg->ev.portal = hdr->msg.get.ptl_index;
-                msg->ev.match_bits = hdr->msg.get.match_bits;
-                msg->ev.rlength = hdr->payload_length;
-                msg->ev.mlength = mlength;
-                msg->ev.offset = offset;
-                msg->ev.hdr_data = 0;
-
-                /* NB if this match has exhausted the MD, we can't be sure
-                 * that this event will the the last one associated with
-                 * this MD in the event queue (another message already
-                 * matching this ME/MD could end up being last).  So we
-                 * remember the ME handle anyway and check again when we're
-                 * allocating our slot in the event queue.
-                 */
-                ptl_me2handle (&msg->ev.unlinked_me, me);
-
-                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
-        }
+        msg->ev.type = PTL_EVENT_GET;
+        msg->ev.initiator.nid = hdr->src_nid;
+        msg->ev.initiator.pid = hdr->src_pid;
+        msg->ev.portal = hdr->msg.get.ptl_index;
+        msg->ev.match_bits = hdr->msg.get.match_bits;
+        msg->ev.rlength = hdr->payload_length;
+        msg->ev.mlength = mlength;
+        msg->ev.offset = offset;
+        msg->ev.hdr_data = 0;
+
+        lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
 
         ni->counters.send_count++;
         ni->counters.send_length += mlength;
 
-        /* only unlink after MD's refcount has been bumped
-         * in get_new_msg() otherwise lib_me_unlink() will nuke it */
-        if (unlink) {
-                md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED;
+        /* only unlink after MD's refcount has been bumped in
+         * lib_commit_md() otherwise lib_me_unlink() will nuke it */
+        if (unlink)
                 lib_me_unlink (nal, me);
-        }
 
         state_unlock(nal, &flags);
 
@@ -749,36 +770,25 @@ static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
 
         rc = lib_send (nal, private, msg, &reply, PTL_MSG_REPLY, 
                        hdr->src_nid, hdr->src_pid, md, offset, mlength);
-        if (rc != PTL_OK) {
-                CERROR(LPU64": Dropping GET from "LPU64": send REPLY failed\n",
-                       ni->nid, hdr->src_nid);
-                /* Hmm, this will create a GET event and make believe
-                 * the reply completed, which it kind of did, only the
-                 * source won't get her reply */
-                lib_finalize (nal, private, msg);
-                state_lock (nal, &flags);
-                goto drop;
-        }
+        if (rc != PTL_OK)
+                CERROR(LPU64": Unable to send REPLY for GET from "LPU64": %d\n",
+                       ni->nid, hdr->src_nid, rc);
+
+        /* Discard any junk after the hdr */
+        (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length);
 
-        /* Complete the incoming message */
-        lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
         return (rc);
- drop:
-        ni->counters.drop_count++;
-        ni->counters.drop_length += hdr->msg.get.sink_length;
-        state_unlock(nal, &flags);
-        lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
-        return -1;
 }
 
-static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+static ptl_err_t
+parse_reply(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
 {
         lib_ni_t        *ni = &nal->ni;
         lib_md_t        *md;
         int              rlength;
         int              length;
-        lib_msg_t       *msg;
         unsigned long    flags;
+        ptl_err_t        rc;
 
         state_lock(nal, &flags);
 
@@ -790,7 +800,9 @@ static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
                         md == NULL ? "invalid" : "inactive",
                         hdr->msg.reply.dst_wmd.wh_interface_cookie,
                         hdr->msg.reply.dst_wmd.wh_object_cookie);
-                goto drop;
+
+                state_unlock(nal, &flags);
+                return (PTL_FAIL);
         }
 
         LASSERT (md->offset == 0);
@@ -804,7 +816,8 @@ static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
                                 ni->nid, hdr->src_nid, length,
                                 hdr->msg.reply.dst_wmd.wh_object_cookie,
                                 md->length);
-                        goto drop;
+                        state_unlock(nal, &flags);
+                        return (PTL_FAIL);
                 }
                 length = md->length;
         }
@@ -813,46 +826,36 @@ static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
                hdr->src_nid, length, rlength, 
                hdr->msg.reply.dst_wmd.wh_object_cookie);
 
-        msg = get_new_msg (nal, md);
-        if (msg == NULL) {
-                CERROR(LPU64": Dropping REPLY from "LPU64": can't "
-                       "allocate msg\n", ni->nid, hdr->src_nid);
-                goto drop;
-        }
+        lib_commit_md(nal, md, msg);
 
-        if (md->eq) {
-                msg->ev.type = PTL_EVENT_REPLY;
-                msg->ev.initiator.nid = hdr->src_nid;
-                msg->ev.initiator.pid = hdr->src_pid;
-                msg->ev.rlength = rlength;
-                msg->ev.mlength = length;
-                msg->ev.offset = 0;
+        msg->ev.type = PTL_EVENT_REPLY;
+        msg->ev.initiator.nid = hdr->src_nid;
+        msg->ev.initiator.pid = hdr->src_pid;
+        msg->ev.rlength = rlength;
+        msg->ev.mlength = length;
+        msg->ev.offset = 0;
 
-                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
-        }
+        lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
 
         ni->counters.recv_count++;
         ni->counters.recv_length += length;
 
         state_unlock(nal, &flags);
 
-        lib_recv (nal, private, msg, md, 0, length, rlength);
-        return 0;
+        rc = lib_recv(nal, private, msg, md, 0, length, rlength);
+        if (rc != PTL_OK)
+                CERROR(LPU64": error on receiving REPLY from "LPU64": %d\n",
+                       ni->nid, hdr->src_nid, rc);
 
- drop:
-        nal->ni.counters.drop_count++;
-        nal->ni.counters.drop_length += hdr->payload_length;
-        state_unlock (nal, &flags);
-        lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
-        return -1;
+        return (rc);
 }
 
-static int parse_ack(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+static ptl_err_t
+parse_ack(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
 {
-        lib_ni_t *ni = &nal->ni;
-        lib_md_t *md;
-        lib_msg_t *msg = NULL;
-        unsigned long flags;
+        lib_ni_t      *ni = &nal->ni;
+        lib_md_t      *md;
+        unsigned long  flags;
 
         /* Convert ack fields to host byte order */
         hdr->msg.ack.match_bits = NTOH__u64 (hdr->msg.ack.match_bits);
@@ -868,40 +871,37 @@ static int parse_ack(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
                        (md == NULL) ? "invalid" : "inactive",
                        hdr->msg.ack.dst_wmd.wh_interface_cookie,
                        hdr->msg.ack.dst_wmd.wh_object_cookie);
-                goto drop;
+
+                state_unlock(nal, &flags);
+                return (PTL_FAIL);
         }
 
         CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n",
                ni->nid, hdr->src_nid, 
                hdr->msg.ack.dst_wmd.wh_object_cookie);
 
-        msg = get_new_msg (nal, md);
-        if (msg == NULL) {
-                CERROR(LPU64": Dropping ACK from "LPU64": can't allocate msg\n",
-                       ni->nid, hdr->src_nid);
-                goto drop;
-        }
+        lib_commit_md(nal, md, msg);
 
-        if (md->eq) {
-                msg->ev.type = PTL_EVENT_ACK;
-                msg->ev.initiator.nid = hdr->src_nid;
-                msg->ev.initiator.pid = hdr->src_pid;
-                msg->ev.mlength = hdr->msg.ack.mlength;
-                msg->ev.match_bits = hdr->msg.ack.match_bits;
+        msg->ev.type = PTL_EVENT_ACK;
+        msg->ev.initiator.nid = hdr->src_nid;
+        msg->ev.initiator.pid = hdr->src_pid;
+        msg->ev.mlength = hdr->msg.ack.mlength;
+        msg->ev.match_bits = hdr->msg.ack.match_bits;
 
-                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
-        }
+        lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
 
         ni->counters.recv_count++;
-        state_unlock(nal, &flags);
-        lib_recv (nal, private, msg, NULL, 0, 0, hdr->payload_length);
-        return 0;
 
- drop:
-        nal->ni.counters.drop_count++;
-        state_unlock (nal, &flags);
-        lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
-        return -1;
+        state_unlock(nal, &flags);
+        
+        /* We have received and matched up the ack OK, create the
+         * completion event now... */
+        lib_finalize(nal, private, msg, PTL_OK);
+
+        /* ...and now discard any junk after the hdr */
+        (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length);
+       return (PTL_OK);
 }
 
 static char *
@@ -983,10 +983,13 @@ void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr)
 }                               /* end of print_hdr() */
 
 
-int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+void 
+lib_parse(nal_cb_t *nal, ptl_hdr_t *hdr, void *private)
 {
         unsigned long  flags;
-
+        ptl_err_t      rc;
+        lib_msg_t     *msg;
+        
         /* convert common fields to host byte order */
         hdr->dest_nid = NTOH__u64 (hdr->dest_nid);
         hdr->src_nid = NTOH__u64 (hdr->src_nid);
@@ -1008,22 +1011,16 @@ int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
                         nal->ni.nid, mv->magic, 
                         mv->version_major, mv->version_minor,
                         hdr->src_nid);
-                lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
-                return (-1);
+                lib_drop_message(nal, private, hdr);
+                return;
         }
         
         if (hdr->dest_nid != nal->ni.nid) {
                 CERROR(LPU64": Dropping %s message from "LPU64" to "LPU64
                        " (not me)\n", nal->ni.nid, hdr_type_string (hdr),
                        hdr->src_nid, hdr->dest_nid);
-
-                state_lock (nal, &flags);
-                nal->ni.counters.drop_count++;
-                nal->ni.counters.drop_length += hdr->payload_length;
-                state_unlock (nal, &flags);
-
-                lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
-                return (-1);
+                lib_drop_message(nal, private, hdr);
+                return;
         }
 
         if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
@@ -1033,34 +1030,59 @@ int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
                        ": simulated failure\n",
                        nal->ni.nid, hdr_type_string (hdr), 
                        hdr->src_nid);
-                lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
-                return (-1);
+                lib_drop_message(nal, private, hdr);
+                return;
         }
-        
+
+        msg = lib_msg_alloc(nal);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping incoming %s from "LPU64
+                       ": can't allocate a lib_msg_t\n",
+                       nal->ni.nid, hdr_type_string (hdr), 
+                       hdr->src_nid);
+                lib_drop_message(nal, private, hdr);
+                return;
+        }
+
+        do_gettimeofday(&msg->ev.arrival_time);
+
         switch (hdr->type) {
         case PTL_MSG_ACK:
-                return (parse_ack(nal, hdr, private));
+                rc = parse_ack(nal, hdr, private, msg);
+                break;
         case PTL_MSG_PUT:
-                return (parse_put(nal, hdr, private));
+                rc = parse_put(nal, hdr, private, msg);
                 break;
         case PTL_MSG_GET:
-                return (parse_get(nal, hdr, private));
+                rc = parse_get(nal, hdr, private, msg);
                 break;
         case PTL_MSG_REPLY:
-                return (parse_reply(nal, hdr, private));
+                rc = parse_reply(nal, hdr, private, msg);
                 break;
         default:
                 CERROR(LPU64": Dropping <unknown> message from "LPU64
                        ": Bad type=0x%x\n",  nal->ni.nid, hdr->src_nid,
                        hdr->type);
-
-                lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
-                return (-1);
+                rc = PTL_FAIL;
+                break;
+        }
+                
+        if (rc != PTL_OK) {
+                if (msg->md != NULL) {
+                        /* committed... */
+                        lib_finalize(nal, private, msg, rc);
+                } else {
+                        state_lock(nal, &flags);
+                        lib_msg_free(nal, msg); /* expects state_lock held */
+                        state_unlock(nal, &flags);
+
+                        lib_drop_message(nal, private, hdr);
+                }
         }
 }
 
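For context, a minimal sketch of how a NAL receive path hands an incoming header to the reworked, void-returning lib_parse(); read_wire_hdr() is a hypothetical NAL-private helper, everything else comes from <portals/lib-p30.h>:

    #include <portals/lib-p30.h>

    extern int read_wire_hdr(void *private, ptl_hdr_t *hdr); /* hypothetical */

    void
    mynal_handle_incoming(nal_cb_t *nal, void *private)
    {
            ptl_hdr_t hdr;

            if (read_wire_hdr(private, &hdr) != 0)
                    return;         /* transport error handled by the NAL */

            /* on a match lib_parse() pulls the payload via lib_recv();
             * on any failure it calls lib_drop_message() itself, so the
             * caller no longer inspects a return code */
            lib_parse(nal, &hdr, private);
    }
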
-
-int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+int 
+do_PtlPut(nal_cb_t *nal, void *private, void *v_args, void *v_ret)
 {
         /*
          * Incoming:
@@ -1075,16 +1097,15 @@ int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
          * Outgoing:
          */
 
-        PtlPut_in *args = v_args;
-        PtlPut_out *ret = v_ret;
-        ptl_hdr_t hdr;
-
-        lib_ni_t *ni = &nal->ni;
-        lib_md_t *md;
-        lib_msg_t *msg = NULL;
+        PtlPut_in        *args = v_args;
         ptl_process_id_t *id = &args->target_in;
-        unsigned long flags;
-        int           rc;
+        PtlPut_out       *ret = v_ret;
+        lib_ni_t         *ni = &nal->ni;
+        lib_msg_t        *msg;
+        ptl_hdr_t         hdr;
+        lib_md_t         *md;
+        unsigned long     flags;
+        int               rc;
         
         if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
             fail_peer (nal, id->nid, 1))           /* shall we now? */
@@ -1093,13 +1114,22 @@ int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
                        nal->ni.nid, id->nid);
                 return (ret->rc = PTL_INV_PROC);
         }
-        
-        ret->rc = PTL_OK;
+
+        msg = lib_msg_alloc(nal);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping PUT to "LPU64": ENOMEM on lib_msg_t\n",
+                       ni->nid, id->nid);
+                return (ret->rc = PTL_NOSPACE);
+        }
+
         state_lock(nal, &flags);
+
         md = ptl_handle2md(&args->md_in, nal);
-        if (md == NULL || !md->threshold) {
+        if (md == NULL || md->threshold == 0) {
+                lib_msg_free(nal, msg);
                 state_unlock(nal, &flags);
-                return ret->rc = PTL_INV_MD;
+        
+                return (ret->rc = PTL_INV_MD);
         }
 
         CDEBUG(D_NET, "PtlPut -> %Lu: %lu\n", (unsigned long long)id->nid,
@@ -1126,57 +1156,39 @@ int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
         hdr.msg.put.offset = HTON__u32 (args->offset_in);
         hdr.msg.put.hdr_data = args->hdr_data_in;
 
+        lib_commit_md(nal, md, msg);
+        
+        msg->ev.type = PTL_EVENT_SENT;
+        msg->ev.initiator.nid = ni->nid;
+        msg->ev.initiator.pid = ni->pid;
+        msg->ev.portal = args->portal_in;
+        msg->ev.match_bits = args->match_bits_in;
+        msg->ev.rlength = md->length;
+        msg->ev.mlength = md->length;
+        msg->ev.offset = args->offset_in;
+        msg->ev.hdr_data = args->hdr_data_in;
+
+        lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+
         ni->counters.send_count++;
         ni->counters.send_length += md->length;
 
-        msg = get_new_msg (nal, md);
-        if (msg == NULL) {
-                CERROR("BAD: could not allocate msg!\n");
-                state_unlock(nal, &flags);
-                return ret->rc = PTL_NOSPACE;
-        }
-
-        /*
-         * If this memory descriptor has an event queue associated with
-         * it we need to allocate a message state object and record the
-         * information about this operation that will be recorded into
-         * event queue once the message has been completed.
-         *
-         * NB. We're now committed to the GET, since we just marked the MD
-         * busy.  Callers who observe this (by getting PTL_MD_INUSE from
-         * PtlMDUnlink()) expect a completion event to tell them when the
-         * MD becomes idle. 
-         */
-        if (md->eq) {
-                msg->ev.type = PTL_EVENT_SENT;
-                msg->ev.initiator.nid = ni->nid;
-                msg->ev.initiator.pid = ni->pid;
-                msg->ev.portal = args->portal_in;
-                msg->ev.match_bits = args->match_bits_in;
-                msg->ev.rlength = md->length;
-                msg->ev.mlength = md->length;
-                msg->ev.offset = args->offset_in;
-                msg->ev.hdr_data = args->hdr_data_in;
-
-                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
-        }
-
         state_unlock(nal, &flags);
         
         rc = lib_send (nal, private, msg, &hdr, PTL_MSG_PUT,
                        id->nid, id->pid, md, 0, md->length);
         if (rc != PTL_OK) {
-                /* get_new_msg() committed us to sending by decrementing
-                 * md->threshold, so we have to act like we did send, but
-                 * the network dropped it. */
-                lib_finalize (nal, private, msg);
+                CERROR(LPU64": error sending PUT to "LPU64": %d\n",
+                       ni->nid, id->nid, rc);
+                lib_finalize (nal, private, msg, rc);
         }
         
+        /* completion will be signalled by an event */
         return ret->rc = PTL_OK;
 }
 
-lib_msg_t * lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, 
-                                lib_md_t *getmd)
+lib_msg_t * 
+lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, lib_md_t *getmd)
 {
         /* The NAL can DMA direct to the GET md (i.e. no REPLY msg).  This
          * returns a msg the NAL can pass to lib_finalize() so that a REPLY
@@ -1188,39 +1200,38 @@ lib_msg_t * lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid,
          * lib_finalize() of the original GET. */
 
         lib_ni_t        *ni = &nal->ni;
-        lib_msg_t       *msg;
+        lib_msg_t       *msg = lib_msg_alloc(nal);
         unsigned long    flags;
 
         state_lock(nal, &flags);
 
         LASSERT (getmd->pending > 0);
 
+        if (msg == NULL) {
+                CERROR ("Dropping REPLY from "LPU64": can't allocate msg\n",
+                        peer_nid);
+                goto drop;
+        }
+
         if (getmd->threshold == 0) {
                 CERROR ("Dropping REPLY from "LPU64" for inactive MD %p\n",
                         peer_nid, getmd);
-                goto drop;
+                goto drop_msg;
         }
 
         LASSERT (getmd->offset == 0);
 
         CDEBUG(D_NET, "Reply from "LPU64" md %p\n", peer_nid, getmd);
 
-        msg = get_new_msg (nal, getmd);
-        if (msg == NULL) {
-                CERROR("Dropping REPLY from "LPU64" md %p: can't allocate msg\n", 
-                       peer_nid, getmd);
-                goto drop;
-        }
+        lib_commit_md (nal, getmd, msg);
 
-        if (getmd->eq) {
-                msg->ev.type = PTL_EVENT_REPLY;
-                msg->ev.initiator.nid = peer_nid;
-                msg->ev.initiator.pid = 0;      /* XXX FIXME!!! */
-                msg->ev.rlength = msg->ev.mlength = getmd->length;
-                msg->ev.offset = 0;
+        msg->ev.type = PTL_EVENT_REPLY;
+        msg->ev.initiator.nid = peer_nid;
+        msg->ev.initiator.pid = 0;      /* XXX FIXME!!! */
+        msg->ev.rlength = msg->ev.mlength = getmd->length;
+        msg->ev.offset = 0;
 
-                lib_md_deconstruct(nal, getmd, &msg->ev.mem_desc);
-        }
+        lib_md_deconstruct(nal, getmd, &msg->ev.mem_desc);
 
         ni->counters.recv_count++;
         ni->counters.recv_length += getmd->length;
@@ -1228,7 +1239,9 @@ lib_msg_t * lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid,
         state_unlock(nal, &flags);
 
         return msg;
-        
+
+ drop_msg:
+        lib_msg_free(nal, msg);
  drop:
         nal->ni.counters.drop_count++;
         nal->ni.counters.drop_length += getmd->length;
@@ -1238,7 +1251,8 @@ lib_msg_t * lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid,
         return NULL;
 }
 
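A hedged sketch of the optimized-GET path described above, where the NAL DMAs REPLY data straight into the GET MD and then manufactures the REPLY event itself; the dma-completion hook and its 'ok' flag are assumptions, only lib_fake_reply_msg() and the four-argument lib_finalize() are library calls:

    static void
    mynal_get_dma_complete(nal_cb_t *nal, void *private, ptl_nid_t peer_nid,
                           lib_md_t *getmd, int ok)
    {
            lib_msg_t *msg = lib_fake_reply_msg(nal, peer_nid, getmd);

            if (msg == NULL)            /* already counted as a drop */
                    return;

            /* generate the REPLY event on the local GET md */
            lib_finalize(nal, private, msg, ok ? PTL_OK : PTL_FAIL);
    }
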
-int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+int 
+do_PtlGet(nal_cb_t *nal, void *private, void *v_args, void *v_ret)
 {
         /*
          * Incoming:
@@ -1252,15 +1266,15 @@ int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
          * Outgoing:
          */
 
-        PtlGet_in *args = v_args;
-        PtlGet_out *ret = v_ret;
-        ptl_hdr_t hdr;
-        lib_msg_t *msg = NULL;
-        lib_ni_t *ni = &nal->ni;
+        PtlGet_in        *args = v_args;
         ptl_process_id_t *id = &args->target_in;
-        lib_md_t *md;
-        unsigned long flags;
-        int           rc;
+        PtlGet_out       *ret = v_ret;
+        lib_ni_t         *ni = &nal->ni;
+        lib_msg_t        *msg;
+        ptl_hdr_t         hdr;
+        lib_md_t         *md;
+        unsigned long     flags;
+        int               rc;
         
         if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
             fail_peer (nal, id->nid, 1))           /* shall we now? */
@@ -1269,16 +1283,24 @@ int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
                        nal->ni.nid, id->nid);
                 return (ret->rc = PTL_INV_PROC);
         }
-        
+
+        msg = lib_msg_alloc(nal);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping GET to "LPU64": ENOMEM on lib_msg_t\n",
+                       ni->nid, id->nid);
+                return (ret->rc = PTL_NOSPACE);
+        }
+
         state_lock(nal, &flags);
+
         md = ptl_handle2md(&args->md_in, nal);
         if (md == NULL || !md->threshold) {
+                lib_msg_free(nal, msg);
                 state_unlock(nal, &flags);
+
                 return ret->rc = PTL_INV_MD;
         }
 
-        LASSERT (md->offset == 0);
-
         CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid,
                (unsigned long)id->pid);
 
@@ -1299,51 +1321,33 @@ int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
         hdr.msg.get.src_offset = HTON__u32 (args->offset_in);
         hdr.msg.get.sink_length = HTON__u32 (md->length);
 
-        ni->counters.send_count++;
+        lib_commit_md(nal, md, msg);
 
-        msg = get_new_msg (nal, md);
-        if (msg == NULL) {
-                CERROR("do_PtlGet: BAD - could not allocate cookie!\n");
-                state_unlock(nal, &flags);
-                return ret->rc = PTL_NOSPACE;
-        }
+        msg->ev.type = PTL_EVENT_SENT;
+        msg->ev.initiator.nid = ni->nid;
+        msg->ev.initiator.pid = ni->pid;
+        msg->ev.portal = args->portal_in;
+        msg->ev.match_bits = args->match_bits_in;
+        msg->ev.rlength = md->length;
+        msg->ev.mlength = md->length;
+        msg->ev.offset = args->offset_in;
+        msg->ev.hdr_data = 0;
 
-        /*
-         * If this memory descriptor has an event queue associated with
-         * it we must allocate a message state object that will record
-         * the information to be filled in once the message has been
-         * completed.  More information is in the do_PtlPut() comments.
-         *
-         * NB. We're now committed to the GET, since we just marked the MD
-         * busy.  Callers who observe this (by getting PTL_MD_INUSE from
-         * PtlMDUnlink()) expect a completion event to tell them when the
-         * MD becomes idle. 
-         */
-        if (md->eq) {
-                msg->ev.type = PTL_EVENT_SENT;
-                msg->ev.initiator.nid = ni->nid;
-                msg->ev.initiator.pid = ni->pid;
-                msg->ev.portal = args->portal_in;
-                msg->ev.match_bits = args->match_bits_in;
-                msg->ev.rlength = md->length;
-                msg->ev.mlength = md->length;
-                msg->ev.offset = args->offset_in;
-                msg->ev.hdr_data = 0;
-
-                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
-        }
+        lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+
+        ni->counters.send_count++;
 
         state_unlock(nal, &flags);
 
         rc = lib_send (nal, private, msg, &hdr, PTL_MSG_GET,
                        id->nid, id->pid, NULL, 0, 0);
         if (rc != PTL_OK) {
-                /* get_new_msg() committed us to sending by decrementing
-                 * md->threshold, so we have to act like we did send, but
-                 * the network dropped it. */
-                lib_finalize (nal, private, msg);
+                CERROR(LPU64": error sending GET to "LPU64": %d\n",
+                       ni->nid, id->nid, rc);
+                lib_finalize (nal, private, msg, rc);
         }
         
+        /* completion will be signalled by an event */
         return ret->rc = PTL_OK;
 }
 
index 9840ff5..04c69b1 100644 (file)
 
 #include <portals/lib-p30.h>
 
-int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg)
+void
+lib_enq_event_locked (nal_cb_t *nal, void *private, 
+                      lib_eq_t *eq, ptl_event_t *ev)
 {
-        lib_md_t     *md;
-        lib_eq_t     *eq;
+        ptl_event_t  *eq_slot;
         int           rc;
+        
+        ev->sequence = eq->sequence++; /* Allocate the next queue slot */
+
+        /* size must be a power of 2 to handle a wrapped sequence # */
+        LASSERT (eq->size != 0 &&
+                 eq->size == LOWEST_BIT_SET (eq->size));
+        eq_slot = eq->base + (ev->sequence & (eq->size - 1));
+
+        /* Copy the event into the allocated slot, ensuring all the rest of
+         * the event's contents have been copied _before_ the sequence
+         * number gets updated.  A process 'getting' an event waits on
+         * the next queue slot's sequence to be 'new'.  When it is, _all_
+         * other event fields had better be consistent.  I assert
+         * 'sequence' is the last member, so I only need a 2 stage copy. */
+
+        LASSERT(sizeof (ptl_event_t) ==
+                offsetof(ptl_event_t, sequence) + sizeof(ev->sequence));
+
+        rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev,
+                            offsetof (ptl_event_t, sequence));
+        LASSERT (rc == PTL_OK);
+
+#ifdef __KERNEL__
+        barrier();
+#endif
+        /* Updating the sequence number is what makes the event 'new'.  NB if
+         * the cb_write below isn't atomic, this could cause a race with
+         * PtlEQGet */
+        rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence,
+                           (void *)&ev->sequence,sizeof (ev->sequence));
+        LASSERT (rc == PTL_OK);
+
+#ifdef __KERNEL__
+        barrier();
+#endif
+
+        if (nal->cb_callback != NULL)
+                nal->cb_callback(nal, private, eq, ev);
+        else if (eq->event_callback != NULL)
+                eq->event_callback(ev);
+}
+
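The two-stage copy above makes the slot's sequence number the publication flag: once it holds the value the consumer expects, the rest of the slot is consistent. A consumer-side sketch, illustrative only and not the real PtlEQGet(), using the same eq->base/eq->size and event sequence fields as above:

    static int
    eq_event_ready(lib_eq_t *eq, unsigned long next_seq)
    {
            /* eq->size is a power of 2, so masking handles sequence wrap */
            ptl_event_t *slot = eq->base + (next_seq & (eq->size - 1));

            /* the producer writes 'sequence' last (after a barrier), so
             * when it matches, the other fields are already consistent */
            return (slot->sequence == next_seq);
    }
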
+void 
+lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status)
+{
+        lib_md_t     *md;
+        int           unlink;
         unsigned long flags;
+        int           rc;
+        ptl_hdr_t     ack;
 
         /* ni went down while processing this message */
-        if (nal->ni.up == 0) {
-                return -1;
-        }
+        if (nal->ni.up == 0)
+                return;
 
         if (msg == NULL)
-                return 0;
+                return;
 
-        rc = 0;
-        if (msg->send_ack) {
-                ptl_hdr_t ack;
+        /* Only send an ACK if the PUT completed successfully */
+        if (status == PTL_OK &&
+            !ptl_is_wire_handle_none(&msg->ack_wmd)) {
 
-                LASSERT (!ptl_is_wire_handle_none (&msg->ack_wmd));
+                LASSERT(msg->ev.type == PTL_EVENT_PUT);
 
                 memset (&ack, 0, sizeof (ack));
                 ack.type     = HTON__u32 (PTL_MSG_ACK);
-                ack.dest_nid = HTON__u64 (msg->nid);
+                ack.dest_nid = HTON__u64 (msg->ev.initiator.nid);
                 ack.src_nid  = HTON__u64 (nal->ni.nid);
-                ack.dest_pid = HTON__u32 (msg->pid);
+                ack.dest_pid = HTON__u32 (msg->ev.initiator.pid);
                 ack.src_pid  = HTON__u32 (nal->ni.pid);
                 ack.payload_length = 0;
 
@@ -66,92 +115,35 @@ int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg)
                 ack.msg.ack.mlength = HTON__u32 (msg->ev.mlength);
 
                 rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK,
-                               msg->nid, msg->pid, NULL, 0, 0);
-                /* If this send fails, there's nothing else to clean up */
+                               msg->ev.initiator.nid, msg->ev.initiator.pid, 
+                               NULL, 0, 0);
+                if (rc != PTL_OK) {
+                        /* send failed: there's nothing else to clean up. */
+                        CERROR("Error %d sending ACK to "LPX64"\n", 
+                               rc, msg->ev.initiator.nid);
+                }
         }
 
         md = msg->md;
-        LASSERT (md->pending > 0);  /* I've not dropped my ref yet */
-        eq = md->eq;
 
         state_lock(nal, &flags);
 
-        if (eq != NULL) {
-                ptl_event_t  *ev = &msg->ev;
-                ptl_event_t  *eq_slot;
-
-                /* I have to hold the lock while I bump the sequence number
-                 * and copy the event into the queue.  If not, and I was
-                 * interrupted after bumping the sequence number, other
-                 * events could fill the queue, including the slot I just
-                 * allocated to this event.  On resuming, I would overwrite
-                 * a more 'recent' event with old event state, and
-                 * processes taking events off the queue would not detect
-                 * overflow correctly.
-                 */
-
-                ev->sequence = eq->sequence++;/* Allocate the next queue slot */
-
-                /* size must be a power of 2 to handle a wrapped sequence # */
-                LASSERT (eq->size != 0 &&
-                         eq->size == LOWEST_BIT_SET (eq->size));
-                eq_slot = eq->base + (ev->sequence & (eq->size - 1));
-
-                /* Invalidate unlinked_me unless this is the last
-                 * event for an auto-unlinked MD.  Note that if md was
-                 * auto-unlinked, md->pending can only decrease
-                 */
-                if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 || /* not auto-unlinked */
-                    md->pending != 1)                       /* not last ref */
-                        ev->unlinked_me = PTL_HANDLE_NONE;
-
-                /* Copy the event into the allocated slot, ensuring all the
-                 * rest of the event's contents have been copied _before_
-                 * the sequence number gets updated.  A processes 'getting'
-                 * an event waits on the next queue slot's sequence to be
-                 * 'new'.  When it is, _all_ other event fields had better
-                 * be consistent.  I assert 'sequence' is the last member,
-                 * so I only need a 2 stage copy.
-                 */
-                LASSERT(sizeof (ptl_event_t) ==
-                        offsetof(ptl_event_t, sequence) + sizeof(ev->sequence));
-
-                rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev,
-                                    offsetof (ptl_event_t, sequence));
-                LASSERT (rc == 0);
-
-#ifdef __KERNEL__
-                barrier();
-#endif
-                /* Updating the sequence number is what makes the event 'new' */
-
-                /* cb_write is not necessarily atomic, so this could
-                   cause a race with PtlEQGet */
-                rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence,
-                                   (void *)&ev->sequence,sizeof (ev->sequence));
-                LASSERT (rc == 0);
+        /* Now it's safe to drop my caller's ref */
+        md->pending--;
+        LASSERT (md->pending >= 0);
 
-#ifdef __KERNEL__
-                barrier();
-#endif
+        /* Should I unlink this MD? */
+        unlink = (md->pending == 0 &&           /* No other refs */
+                  (md->threshold == 0 ||        /* All ops done */
+                   md->md_flags & PTL_MD_FLAG_UNLINK) != 0); /* black spot */
 
-                /* I must also ensure that (a) callbacks are made in the
-                 * same order as the events land in the queue, and (b) the
-                 * callback occurs before the event can be removed from the
-                 * queue, so I can't drop the lock during the callback. */
-                if (nal->cb_callback != NULL)
-                        nal->cb_callback(nal, private, eq, ev);
-                else  if (eq->event_callback != NULL)
-                        (void)((eq->event_callback) (ev));
-        }
+        msg->ev.status = status;
+        msg->ev.unlinked = unlink;
 
-        LASSERT ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 ||
-                 (md->md_flags & PTL_MD_FLAG_UNLINK) != 0);
+        if (md->eq != NULL)
+                lib_enq_event_locked(nal, private, md->eq, &msg->ev);
 
-        md->pending--;
-        if (md->pending == 0 && /* no more outstanding operations on this md */
-            (md->threshold == 0 ||              /* done its business */
-             (md->md_flags & PTL_MD_FLAG_UNLINK) != 0)) /* marked for death */
+        if (unlink)
                 lib_md_unlink(nal, md);
 
         list_del (&msg->msg_list);
@@ -159,6 +151,4 @@ int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg)
         lib_msg_free(nal, msg);
 
         state_unlock(nal, &flags);
-
-        return rc;
 }
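With the status argument added here, a NAL reports the success or failure of a send/receive directly to lib_finalize(), which records it in ev.status (and ev.unlinked) before queueing the event. A minimal sketch of that contract from the NAL side; mynal_rx_done() and its 'error' flag are assumptions:

    static void
    mynal_rx_done(nal_cb_t *nal, void *private, lib_msg_t *cookie, int error)
    {
            /* a failed completion still produces an event; it just carries
             * a non-PTL_OK status for the application to inspect */
            lib_finalize(nal, private, cookie, error ? PTL_FAIL : PTL_OK);
    }
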
index dc427b0..6035ca1 100644 (file)
@@ -1,5 +1,9 @@
 CPPFLAGS=
 INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir)
-lib_LIBRARIES = libtcpnal.a
+noinst_LIBRARIES = libtcpnal.a
 pkginclude_HEADERS =  pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h
 libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
+
+if LIBLUSTRE
+libtcpnal_a_CFLAGS = -fPIC
+endif
index 0b4940f..9a90ab8 100644 (file)
@@ -6,6 +6,9 @@
  *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
  */
 
+#ifndef TCPNAL_PROCBRIDGE_H
+#define TCPNAL_PROCBRIDGE_H
+
 #include <portals/lib-p30.h>
 
 typedef struct bridge {
@@ -27,3 +30,5 @@ nal_t *bridge_init(ptl_interface_t nal,
 
 typedef int (*nal_initialize)(bridge);
 extern nal_initialize nal_table[PTL_IFACE_MAX];
+
+#endif
index 29e75be..ca6999a 100644 (file)
@@ -309,7 +309,8 @@ tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation)
  */
 connection force_tcp_connection(manager m,
                                 unsigned int ip,
-                                unsigned short port)
+                                unsigned short port,
+                                procbridge pb)
 {
     connection conn;
     struct sockaddr_in addr;
@@ -357,6 +358,10 @@ connection force_tcp_connection(manager m,
             exit(-1);
 
         conn = allocate_connection(m, ip, port, fd);
+
+        /* let nal thread know this event right away */
+        if (conn)
+                procbridge_wakeup_nal(pb);
     }
 
     pthread_mutex_unlock(&m->conn_lock);
index fb1eaab..343ffa6 100644 (file)
@@ -7,6 +7,7 @@
  */
 
 #include <table.h>
+#include <procbridge.h>
 
 typedef struct manager {
     table connections;
@@ -26,7 +27,8 @@ typedef struct connection {
     manager m;
 } *connection;
 
-connection force_tcp_connection(manager m, unsigned int ip, unsigned int short);
+connection force_tcp_connection(manager m, unsigned int ip, unsigned int short,
+                                procbridge pb);
 manager init_connections(unsigned short, int (*f)(void *, void *), void *);
 void remove_connection(void *arg);
 void shutdown_connections(manager m);
index 2a3fbd8..bddfe9a 100644 (file)
 #include <stdlib.h>
 #include <unistd.h>
 #include <string.h>
+#ifndef __CYGWIN__
+#include <syscall.h>
+#endif
+#include <sys/socket.h>
 #include <procbridge.h>
 #include <pqtimer.h>
 #include <dispatch.h>
 #include <errno.h>
 
 
+/* XXX CFS workaround: give the nal thread a chance to wake up
+ * from waiting in select
+ */
+static int procbridge_notifier_handler(void *arg)
+{
+    static char buf[8];
+    procbridge p = (procbridge) arg;
+
+    syscall(SYS_read, p->notifier[1], buf, sizeof(buf));
+    return 1;
+}
+
+void procbridge_wakeup_nal(procbridge p)
+{
+    static char buf[8];
+    syscall(SYS_write, p->notifier[0], buf, sizeof(buf));
+}
+
 /* Function: forward
  * Arguments: nal_t *nal: pointer to my top-side nal structure
  *            id: the command to pass to the lower layer
@@ -79,6 +101,7 @@ static int procbridge_shutdown(nal_t *n, int ni)
     procbridge p=(procbridge)b->local;
 
     p->nal_flags |= NAL_FLAG_STOPPING;
+    procbridge_wakeup_nal(p);
 
     do {
         pthread_mutex_lock(&p->mutex);
@@ -104,6 +127,12 @@ static int procbridge_validate(nal_t *nal, void *base, size_t extent)
 }
 
 
+/* FIXME cfs temporary workaround! FIXME
+ * global time out value
+ */
+int __tcpnal_eqwait_timeout_value = 0;
+int __tcpnal_eqwait_timedout = 0;
+
 /* Function: yield
  * Arguments:  pid:
  *
@@ -118,7 +147,19 @@ static void procbridge_yield(nal_t *n)
     procbridge p=(procbridge)b->local;
 
     pthread_mutex_lock(&p->mutex);
-    pthread_cond_wait(&p->cond,&p->mutex);
+    if (!__tcpnal_eqwait_timeout_value) {
+        pthread_cond_wait(&p->cond,&p->mutex);
+    } else {
+        struct timeval now;
+        struct timespec timeout;
+
+        gettimeofday(&now, NULL);
+        timeout.tv_sec = now.tv_sec + __tcpnal_eqwait_timeout_value;
+        timeout.tv_nsec = now.tv_usec * 1000;
+
+        __tcpnal_eqwait_timedout =
+                pthread_cond_timedwait(&p->cond, &p->mutex, &timeout);
+    }
     pthread_mutex_unlock(&p->mutex);
 }
 
@@ -194,6 +235,19 @@ nal_t *procbridge_interface(int num_interface,
     p->nal_flags = 0;
     pthread_mutex_init(&p->nal_cb_lock, 0);
 
+    /* initialize notifier */
+    if (socketpair(AF_UNIX, SOCK_STREAM, 0, p->notifier)) {
+        perror("socketpair failed");
+        return NULL;
+    }
+
+    if (!register_io_handler(p->notifier[1], READ_HANDLER,
+                procbridge_notifier_handler, p)) {
+        perror("failed to register notifier handler");
+        return NULL;
+    }
+
+    /* create nal thread */
     if (pthread_create(&p->t, NULL, nal_thread, &args)) {
         perror("nal_init: pthread_create");
         return(NULL);
index 317e22f..965f83d 100644 (file)
@@ -25,6 +25,9 @@ typedef struct procbridge {
     pthread_cond_t cond;
     pthread_mutex_t mutex;
 
+    /* socket pair used to notify nal thread */
+    int notifier[2];
+
     int nal_flags;
 
     pthread_mutex_t nal_cb_lock;
@@ -51,5 +54,6 @@ extern nal_t *procbridge_interface(int num_interface,
                                    ptl_pt_index_t ptl_size,
                                    ptl_ac_index_t acl_size,
                                    ptl_pid_t requested_pid);
+extern void procbridge_wakeup_nal(procbridge p);
 
 #endif
index 2627253..2a5ba0d 100644 (file)
 /* the following functions are stubs to satisfy the nal definition
    without doing anything particularly useful */
 
-static int nal_write(nal_cb_t *nal,
-                     void *private,
-                     user_ptr dst_addr,
-                     void *src_addr,
-                     size_t len)
+static ptl_err_t nal_write(nal_cb_t *nal,
+                           void *private,
+                           user_ptr dst_addr,
+                           void *src_addr,
+                           size_t len)
 {
     memcpy(dst_addr, src_addr, len);
-    return 0;
+    return PTL_OK;
 }
 
-static int nal_read(nal_cb_t * nal,
-                    void *private,
-                   void *dst_addr,
-                   user_ptr src_addr,
-                   size_t len)
+static ptl_err_t nal_read(nal_cb_t * nal,
+                          void *private,
+                          void *dst_addr,
+                          user_ptr src_addr,
+                          size_t len)
 {
        memcpy(dst_addr, src_addr, len);
-       return 0;
+       return PTL_OK;
 }
 
 static void *nal_malloc(nal_cb_t *nal,
index fe24efc..c4ccae1 100644 (file)
@@ -126,15 +126,6 @@ void select_timer_block(when until)
         timeout_pointer=&timeout;
     } else timeout_pointer=0;
 
-
-    /* FIXME
-     * temporarily add timer for endless waiting problem.
-     * FIXME
-     */
-    timeout.tv_sec = 1;
-    timeout.tv_usec = 0;
-    timeout_pointer=&timeout;
-
     FD_ZERO(&fds[0]);
     FD_ZERO(&fds[1]);
     FD_ZERO(&fds[2]);
index dc427b0..6035ca1 100644 (file)
@@ -1,5 +1,9 @@
 CPPFLAGS=
 INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir)
-lib_LIBRARIES = libtcpnal.a
+noinst_LIBRARIES = libtcpnal.a
 pkginclude_HEADERS =  pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h
 libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
+
+if LIBLUSTRE
+libtcpnal_a_CFLAGS = -fPIC
+endif
index 0b4940f..9a90ab8 100644 (file)
@@ -6,6 +6,9 @@
  *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
  */
 
+#ifndef TCPNAL_PROCBRIDGE_H
+#define TCPNAL_PROCBRIDGE_H
+
 #include <portals/lib-p30.h>
 
 typedef struct bridge {
@@ -27,3 +30,5 @@ nal_t *bridge_init(ptl_interface_t nal,
 
 typedef int (*nal_initialize)(bridge);
 extern nal_initialize nal_table[PTL_IFACE_MAX];
+
+#endif
index 29e75be..ca6999a 100644 (file)
@@ -309,7 +309,8 @@ tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation)
  */
 connection force_tcp_connection(manager m,
                                 unsigned int ip,
-                                unsigned short port)
+                                unsigned short port,
+                                procbridge pb)
 {
     connection conn;
     struct sockaddr_in addr;
@@ -357,6 +358,10 @@ connection force_tcp_connection(manager m,
             exit(-1);
 
         conn = allocate_connection(m, ip, port, fd);
+
+        /* let nal thread know this event right away */
+        if (conn)
+                procbridge_wakeup_nal(pb);
     }
 
     pthread_mutex_unlock(&m->conn_lock);
index fb1eaab..343ffa6 100644 (file)
@@ -7,6 +7,7 @@
  */
 
 #include <table.h>
+#include <procbridge.h>
 
 typedef struct manager {
     table connections;
@@ -26,7 +27,8 @@ typedef struct connection {
     manager m;
 } *connection;
 
-connection force_tcp_connection(manager m, unsigned int ip, unsigned int short);
+connection force_tcp_connection(manager m, unsigned int ip, unsigned int short,
+                                procbridge pb);
 manager init_connections(unsigned short, int (*f)(void *, void *), void *);
 void remove_connection(void *arg);
 void shutdown_connections(manager m);
index 2a3fbd8..bddfe9a 100644 (file)
 #include <stdlib.h>
 #include <unistd.h>
 #include <string.h>
+#ifndef __CYGWIN__
+#include <syscall.h>
+#endif
+#include <sys/socket.h>
 #include <procbridge.h>
 #include <pqtimer.h>
 #include <dispatch.h>
 #include <errno.h>
 
 
+/* XXX CFS workaround: give the nal thread a chance to wake up
+ * from waiting in select
+ */
+static int procbridge_notifier_handler(void *arg)
+{
+    static char buf[8];
+    procbridge p = (procbridge) arg;
+
+    syscall(SYS_read, p->notifier[1], buf, sizeof(buf));
+    return 1;
+}
+
+void procbridge_wakeup_nal(procbridge p)
+{
+    static char buf[8];
+    syscall(SYS_write, p->notifier[0], buf, sizeof(buf));
+}
+
 /* Function: forward
  * Arguments: nal_t *nal: pointer to my top-side nal structure
  *            id: the command to pass to the lower layer
@@ -79,6 +101,7 @@ static int procbridge_shutdown(nal_t *n, int ni)
     procbridge p=(procbridge)b->local;
 
     p->nal_flags |= NAL_FLAG_STOPPING;
+    procbridge_wakeup_nal(p);
 
     do {
         pthread_mutex_lock(&p->mutex);
@@ -104,6 +127,12 @@ static int procbridge_validate(nal_t *nal, void *base, size_t extent)
 }
 
 
+/* FIXME cfs temporary workaround! FIXME
+ * global time out value
+ */
+int __tcpnal_eqwait_timeout_value = 0;
+int __tcpnal_eqwait_timedout = 0;
+
 /* Function: yield
  * Arguments:  pid:
  *
@@ -118,7 +147,19 @@ static void procbridge_yield(nal_t *n)
     procbridge p=(procbridge)b->local;
 
     pthread_mutex_lock(&p->mutex);
-    pthread_cond_wait(&p->cond,&p->mutex);
+    if (!__tcpnal_eqwait_timeout_value) {
+        pthread_cond_wait(&p->cond,&p->mutex);
+    } else {
+        struct timeval now;
+        struct timespec timeout;
+
+        gettimeofday(&now, NULL);
+        timeout.tv_sec = now.tv_sec + __tcpnal_eqwait_timeout_value;
+        timeout.tv_nsec = now.tv_usec * 1000;
+
+        __tcpnal_eqwait_timedout =
+                pthread_cond_timedwait(&p->cond, &p->mutex, &timeout);
+    }
     pthread_mutex_unlock(&p->mutex);
 }
 
@@ -194,6 +235,19 @@ nal_t *procbridge_interface(int num_interface,
     p->nal_flags = 0;
     pthread_mutex_init(&p->nal_cb_lock, 0);
 
+    /* initialize notifier */
+    if (socketpair(AF_UNIX, SOCK_STREAM, 0, p->notifier)) {
+        perror("socketpair failed");
+        return NULL;
+    }
+
+    if (!register_io_handler(p->notifier[1], READ_HANDLER,
+                procbridge_notifier_handler, p)) {
+        perror("failed to register notifier handler");
+        return NULL;
+    }
+
+    /* create nal thread */
     if (pthread_create(&p->t, NULL, nal_thread, &args)) {
         perror("nal_init: pthread_create");
         return(NULL);
index 317e22f..965f83d 100644 (file)
@@ -25,6 +25,9 @@ typedef struct procbridge {
     pthread_cond_t cond;
     pthread_mutex_t mutex;
 
+    /* socket pair used to notify nal thread */
+    int notifier[2];
+
     int nal_flags;
 
     pthread_mutex_t nal_cb_lock;
@@ -51,5 +54,6 @@ extern nal_t *procbridge_interface(int num_interface,
                                    ptl_pt_index_t ptl_size,
                                    ptl_ac_index_t acl_size,
                                    ptl_pid_t requested_pid);
+extern void procbridge_wakeup_nal(procbridge p);
 
 #endif
index 2627253..2a5ba0d 100644 (file)
 /* the following functions are stubs to satisfy the nal definition
    without doing anything particularly useful */
 
-static int nal_write(nal_cb_t *nal,
-                     void *private,
-                     user_ptr dst_addr,
-                     void *src_addr,
-                     size_t len)
+static ptl_err_t nal_write(nal_cb_t *nal,
+                           void *private,
+                           user_ptr dst_addr,
+                           void *src_addr,
+                           size_t len)
 {
     memcpy(dst_addr, src_addr, len);
-    return 0;
+    return PTL_OK;
 }
 
-static int nal_read(nal_cb_t * nal,
-                    void *private,
-                   void *dst_addr,
-                   user_ptr src_addr,
-                   size_t len)
+static ptl_err_t nal_read(nal_cb_t * nal,
+                          void *private,
+                          void *dst_addr,
+                          user_ptr src_addr,
+                          size_t len)
 {
        memcpy(dst_addr, src_addr, len);
-       return 0;
+       return PTL_OK;
 }
 
 static void *nal_malloc(nal_cb_t *nal,
index fe24efc..c4ccae1 100644 (file)
@@ -126,15 +126,6 @@ void select_timer_block(when until)
         timeout_pointer=&timeout;
     } else timeout_pointer=0;
 
-
-    /* FIXME
-     * temporarily add timer for endless waiting problem.
-     * FIXME
-     */
-    timeout.tv_sec = 1;
-    timeout.tv_usec = 0;
-    timeout_pointer=&timeout;
-
     FD_ZERO(&fds[0]);
     FD_ZERO(&fds[1]);
     FD_ZERO(&fds[2]);
index 1041d1d..0c47f42 100644 (file)
  *
  * sends a packet to the peer, after ensuring that a connection exists
  */
-int tcpnal_send(nal_cb_t *n,
-               void *private,
-               lib_msg_t *cookie,
-               ptl_hdr_t *hdr,
-               int type,
-               ptl_nid_t nid,
-               ptl_pid_t pid,
-                unsigned int niov,
-                struct iovec *iov,
-               size_t len)
+ptl_err_t tcpnal_send(nal_cb_t *n,
+                      void *private,
+                      lib_msg_t *cookie,
+                      ptl_hdr_t *hdr,
+                      int type,
+                      ptl_nid_t nid,
+                      ptl_pid_t pid,
+                      unsigned int niov,
+                      struct iovec *iov,
+                      size_t offset,
+                      size_t len)
 {
     connection c;
     bridge b=(bridge)n->nal_data;
     struct iovec tiov[257];
     static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER;
-    int   rc;
+    ptl_err_t rc = PTL_OK;
+    int   sysrc;
     int   total;
+    int   ntiov;
     int i;
 
     if (!(c=force_tcp_connection((manager)b->lower,
                                  PNAL_IP(nid,b),
-                                 PNAL_PORT(nid,pid)))) 
-        return(1);
+                                 PNAL_PORT(nid,pid),
+                                 b->local)))
+        return(PTL_FAIL);
 
-#if 0
     /* TODO: these results should be checked. furthermore, provision
        must be made for the SIGPIPE which is delivered when
        writing on a tcp socket which has closed underneath
        the application. there is a linux flag in the sendmsg
        call which turns off the signalling behaviour, but it's
        nonstandard */
-    syscall(SYS_write, c->fd,hdr,sizeof(ptl_hdr_t));
-    LASSERT (niov <= 1);
-    if (len) syscall(SYS_write, c->fd,iov[0].iov_base,len);
-#else
+
     LASSERT (niov <= 256);
 
     tiov[0].iov_base = hdr;
     tiov[0].iov_len = sizeof(ptl_hdr_t);
+    ntiov = 1 + lib_extract_iov(256, &tiov[1], niov, iov, offset, len);
 
-    if (niov > 0)
-            memcpy(&tiov[1], iov, niov * sizeof(struct iovec));
     pthread_mutex_lock(&send_lock);
 #if 1
-    for (i = total = 0; i <= niov; i++)
+    for (i = total = 0; i < ntiov; i++)
             total += tiov[i].iov_len;
     
-    rc = syscall(SYS_writev, c->fd, tiov, niov+1);
-    if (rc != total) {
+    sysrc = syscall(SYS_writev, c->fd, tiov, ntiov);
+    if (sysrc != total) {
             fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
                      rc, total, errno);
-            abort();
+            rc = PTL_FAIL;
     }
 #else
-    for (i = total = 0; i <= niov; i++) {
+    for (i = total = 0; i <= ntiov; i++) {
             rc = send(c->fd, tiov[i].iov_base, tiov[i].iov_len, 0);
             
             if (rc != tiov[i].iov_len) {
                     fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
                              rc, tiov[i].iov_len, errno);
-                    abort();
+                    rc = PTL_FAIL;
+                    break;
             }
-            total != rc;
+            total += rc;
     }
 #endif
 #if 0
@@ -130,10 +130,14 @@ int tcpnal_send(nal_cb_t *n,
              total, niov + 1);
 #endif
     pthread_mutex_unlock(&send_lock);
-#endif
-    lib_finalize(n, private, cookie);
-        
-    return(0);
+
+    if (rc == PTL_OK) {
+            /* NB the NAL only calls lib_finalize() if it returns PTL_OK
+             * from cb_send() */
+            lib_finalize(n, private, cookie, PTL_OK);
+    }
+
+    return(rc);
 }
 
 
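The TODO in tcpnal_send() above mentions a nonstandard Linux flag that suppresses the SIGPIPE; for reference, that is MSG_NOSIGNAL on sendmsg(). A hedged sketch, illustrative only — this commit keeps using SYS_writev:

    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    static ssize_t
    send_iov_nosigpipe(int fd, struct iovec *iov, int niov)
    {
            struct msghdr msg;

            memset(&msg, 0, sizeof(msg));
            msg.msg_iov = iov;
            msg.msg_iovlen = niov;

            /* a closed peer shows up as -1/EPIPE instead of SIGPIPE */
            return sendmsg(fd, &msg, MSG_NOSIGNAL);
    }
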
@@ -150,15 +154,18 @@ int tcpnal_send(nal_cb_t *n,
  * blocking read of the requested data. must drain out the
  * difference of manipulated and requested lengths from the network
  */
-int tcpnal_recv(nal_cb_t *n,
-               void *private,
-               lib_msg_t *cookie,
-                unsigned int niov,
-                struct iovec *iov,
-               size_t mlen,
-               size_t rlen)
+ptl_err_t tcpnal_recv(nal_cb_t *n,
+                      void *private,
+                      lib_msg_t *cookie,
+                      unsigned int niov,
+                      struct iovec *iov,
+                      size_t offset,
+                      size_t mlen,
+                      size_t rlen)
 
 {
+    struct iovec tiov[256];
+    int ntiov;
     int i;
 
     if (!niov)
@@ -168,16 +175,19 @@ int tcpnal_recv(nal_cb_t *n,
     LASSERT(rlen);
     LASSERT(rlen >= mlen);
 
+    ntiov = lib_extract_iov(256, tiov, niov, iov, offset, mlen);
+    
     /* FIXME
      * 1. Is this efficient enough? change to use readv() directly?
      * 2. need check return from read_connection()
      * - MeiJia
      */
-    for (i = 0; i < niov; i++)
-        read_connection(private, iov[i].iov_base, iov[i].iov_len);
+    for (i = 0; i < ntiov; i++)
+        read_connection(private, tiov[i].iov_base, tiov[i].iov_len);
 
 finalize:
-    lib_finalize(n, private, cookie);
+    /* FIXME; we always assume success here... */
+    lib_finalize(n, private, cookie, PTL_OK);
 
     if (mlen!=rlen){
         char *trash=malloc(rlen-mlen);
@@ -187,7 +197,7 @@ finalize:
         free(trash);
     }
 
-    return(rlen);
+    return(PTL_OK);
 }
 
 
index 1041d1d..0c47f42 100644 (file)
  *
  * sends a packet to the peer, after ensuring that a connection exists
  */
-int tcpnal_send(nal_cb_t *n,
-               void *private,
-               lib_msg_t *cookie,
-               ptl_hdr_t *hdr,
-               int type,
-               ptl_nid_t nid,
-               ptl_pid_t pid,
-                unsigned int niov,
-                struct iovec *iov,
-               size_t len)
+ptl_err_t tcpnal_send(nal_cb_t *n,
+                      void *private,
+                      lib_msg_t *cookie,
+                      ptl_hdr_t *hdr,
+                      int type,
+                      ptl_nid_t nid,
+                      ptl_pid_t pid,
+                      unsigned int niov,
+                      struct iovec *iov,
+                      size_t offset,
+                      size_t len)
 {
     connection c;
     bridge b=(bridge)n->nal_data;
     struct iovec tiov[257];
     static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER;
-    int   rc;
+    ptl_err_t rc = PTL_OK;
+    int   sysrc;
     int   total;
+    int   ntiov;
     int i;
 
     if (!(c=force_tcp_connection((manager)b->lower,
                                  PNAL_IP(nid,b),
-                                 PNAL_PORT(nid,pid)))) 
-        return(1);
+                                 PNAL_PORT(nid,pid),
+                                 b->local)))
+        return(PTL_FAIL);
 
-#if 0
     /* TODO: these results should be checked. furthermore, provision
        must be made for the SIGPIPE which is delivered when
        writing on a tcp socket which has closed underneath
        the application. there is a linux flag in the sendmsg
        call which turns off the signalling behaviour, but it's
        nonstandard */
-    syscall(SYS_write, c->fd,hdr,sizeof(ptl_hdr_t));
-    LASSERT (niov <= 1);
-    if (len) syscall(SYS_write, c->fd,iov[0].iov_base,len);
-#else
+
     LASSERT (niov <= 256);
 
     tiov[0].iov_base = hdr;
     tiov[0].iov_len = sizeof(ptl_hdr_t);
+    ntiov = 1 + lib_extract_iov(256, &tiov[1], niov, iov, offset, len);
 
-    if (niov > 0)
-            memcpy(&tiov[1], iov, niov * sizeof(struct iovec));
     pthread_mutex_lock(&send_lock);
 #if 1
-    for (i = total = 0; i <= niov; i++)
+    for (i = total = 0; i < ntiov; i++)
             total += tiov[i].iov_len;
     
-    rc = syscall(SYS_writev, c->fd, tiov, niov+1);
-    if (rc != total) {
+    sysrc = syscall(SYS_writev, c->fd, tiov, ntiov);
+    if (sysrc != total) {
             fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
                      rc, total, errno);
-            abort();
+            rc = PTL_FAIL;
     }
 #else
-    for (i = total = 0; i <= niov; i++) {
+    for (i = total = 0; i <= ntiov; i++) {
             rc = send(c->fd, tiov[i].iov_base, tiov[i].iov_len, 0);
             
             if (rc != tiov[i].iov_len) {
                     fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
                              rc, tiov[i].iov_len, errno);
-                    abort();
+                    rc = PTL_FAIL;
+                    break;
             }
-            total != rc;
+            total += rc;
     }
 #endif
 #if 0
@@ -130,10 +130,14 @@ int tcpnal_send(nal_cb_t *n,
              total, niov + 1);
 #endif
     pthread_mutex_unlock(&send_lock);
-#endif
-    lib_finalize(n, private, cookie);
-        
-    return(0);
+
+    if (rc == PTL_OK) {
+            /* NB the NAL only calls lib_finalize() if it returns PTL_OK
+             * from cb_send() */
+            lib_finalize(n, private, cookie, PTL_OK);
+    }
+
+    return(rc);
 }
 
 
@@ -150,15 +154,18 @@ int tcpnal_send(nal_cb_t *n,
  * blocking read of the requested data. must drain out the
  * difference of manipulated and requested lengths from the network
  */
-int tcpnal_recv(nal_cb_t *n,
-               void *private,
-               lib_msg_t *cookie,
-                unsigned int niov,
-                struct iovec *iov,
-               size_t mlen,
-               size_t rlen)
+ptl_err_t tcpnal_recv(nal_cb_t *n,
+                      void *private,
+                      lib_msg_t *cookie,
+                      unsigned int niov,
+                      struct iovec *iov,
+                      size_t offset,
+                      size_t mlen,
+                      size_t rlen)
 
 {
+    struct iovec tiov[256];
+    int ntiov;
     int i;
 
     if (!niov)
@@ -168,16 +175,19 @@ int tcpnal_recv(nal_cb_t *n,
     LASSERT(rlen);
     LASSERT(rlen >= mlen);
 
+    ntiov = lib_extract_iov(256, tiov, niov, iov, offset, mlen);
+    
     /* FIXME
      * 1. Is this efficient enough? change to use readv() directly?
      * 2. need check return from read_connection()
      * - MeiJia
      */
-    for (i = 0; i < niov; i++)
-        read_connection(private, iov[i].iov_base, iov[i].iov_len);
+    for (i = 0; i < ntiov; i++)
+        read_connection(private, tiov[i].iov_base, tiov[i].iov_len);
 
 finalize:
-    lib_finalize(n, private, cookie);
+    /* FIXME; we always assume success here... */
+    lib_finalize(n, private, cookie, PTL_OK);
 
     if (mlen!=rlen){
         char *trash=malloc(rlen-mlen);
@@ -187,7 +197,7 @@ finalize:
         free(trash);
     }
 
-    return(rlen);
+    return(PTL_OK);
 }
 
 
index f1878df..6c31b3d 100644 (file)
@@ -3,17 +3,18 @@
 # This code is issued under the GNU General Public License.
 # See the file COPYING in this distribution
 
-
 COMPILE = $(CC) -Wall -g -I$(srcdir)/../include
 LINK = $(CC) -o $@
 
 if LIBLUSTRE
-tmp=
+
+noinst_LIBRARIES = libuptlctl.a
+libuptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h
+libuptlctl_a_CFLAGS = -fPIC
+
 else
-tmp=gmnalnid
-endif
 
-sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck $(tmp)
+sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck gmnalnid
 lib_LIBRARIES = libptlctl.a
 
 acceptor_SOURCES = acceptor.c # -lefence
@@ -33,3 +34,4 @@ debugctl_LDADD = -L. -lptlctl -lncurses # -lefence
 debugctl_DEPENDENCIES = libptlctl.a
 
 routerstat_SOURCES = routerstat.c
+endif
index c6628ff..58a408a 100644 (file)
@@ -23,7 +23,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <syscall.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <portals/api-support.h>
 #include <portals/ptlctl.h>
 
+#ifndef __CYGWIN__
+ #include <syscall.h>
+#else
+ #include <windows.h>
+ #include <windef.h>
+#endif
+
+static ioc_handler_t  do_ioctl;                 /* forward ref */
+static ioc_handler_t *current_ioc_handler = &do_ioctl;
+
 struct ioc_dev {
        const char * dev_name;
        int dev_fd;
@@ -48,7 +57,16 @@ struct dump_hdr {
        int opc;
 };
 
-char * dump_filename;
+char *dump_filename;
+
+void
+set_ioc_handler (ioc_handler_t *handler)
+{
+        if (handler == NULL)
+                current_ioc_handler = do_ioctl;
+        else
+                current_ioc_handler = handler;
+}
 
 static int
 open_ioc_dev(int dev_id) 
@@ -115,7 +133,7 @@ dump(int dev_id, int opc, void *buf)
 {
        FILE *fp;
        struct dump_hdr dump_hdr;
-       struct portal_ioctl_hdr * ioc_hdr = (struct  portal_ioctl_hdr *) buf;
+        struct portal_ioctl_hdr * ioc_hdr = (struct  portal_ioctl_hdr *) buf;
        int rc;
        
        printf("dumping opc %x to %s\n", opc, dump_filename);
@@ -132,17 +150,17 @@ dump(int dev_id, int opc, void *buf)
                return -EINVAL;
        }
        
-       rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp);
-       if (rc == 1)
-               rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp);
-       fclose(fp);
-       if (rc != 1) {
-               fprintf(stderr, "%s: %s\n", dump_filename, 
-                       strerror(errno));
-               return -EINVAL;
-       }
-       
-       return 0;
+        rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp);
+        if (rc == 1)
+                rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp);
+        fclose(fp);
+        if (rc != 1) {
+                fprintf(stderr, "%s: %s\n", dump_filename,
+                        strerror(errno));
+                return -EINVAL;
+        }
+
+        return 0;
 }
 
 /* register a device to send ioctls to.  */
@@ -184,16 +202,17 @@ set_ioctl_dump(char * file)
                free(dump_filename);
        
        dump_filename = strdup(file);
+        if (dump_filename == NULL)
+                abort();
+
+        set_ioc_handler(&dump);
        return 0;
 }
 
 int
 l_ioctl(int dev_id, int opc, void *buf)
 {
-       if (dump_filename) 
-               return dump(dev_id, opc, buf);
-       else 
-               return do_ioctl(dev_id, opc, buf);
+        return current_ioc_handler(dev_id, opc, buf);
 }
 
 /* Read an ioctl dump file, and call the ioc_func for each ioctl buffer
@@ -207,16 +226,28 @@ l_ioctl(int dev_id, int opc, void *buf)
 int 
 parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *))
 {
-       int fd, line =0;
+       int line =0;
        struct stat st;
-       char *buf, *end;
+       char *start, *buf, *end;
+#ifndef __CYGWIN__
+        int fd;
+#else
+        HANDLE fd, hmap;
+        DWORD size;
+#endif
        
+#ifndef __CYGWIN__
        fd = syscall(SYS_open, dump_file, O_RDONLY);
+        if (fd < 0) {
+                fprintf(stderr, "couldn't open %s: %s\n", dump_file, 
+                        strerror(errno));
+                exit(1);
+        }
 
 #ifndef SYS_fstat64
-#define __SYS_fstat__ SYS_fstat
+# define __SYS_fstat__ SYS_fstat
 #else
-#define __SYS_fstat__ SYS_fstat64
+# define __SYS_fstat__ SYS_fstat64
 #endif
        if (syscall(__SYS_fstat__, fd, &st)) { 
                perror("stat fails");
@@ -228,41 +259,72 @@ parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *))
                exit(1);
        }
 
-       buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0);
-       end = buf + st.st_size;
+       start = buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0);
+       end = start + st.st_size;
        close(fd);
-       while (buf < end) {
-               struct dump_hdr *dump_hdr = (struct dump_hdr *) buf;
-               struct portal_ioctl_hdr * data;
-               char tmp[8096];
-               int rc;
-               
-               line++;
+        if (start == MAP_FAILED) {
+               fprintf(stderr, "can't create file mapping\n");
+               exit(1);
+        }
+#else
+        fd = CreateFile(dump_file, GENERIC_READ, FILE_SHARE_READ, NULL,
+                        OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+        size = GetFileSize(fd, NULL);
+        if (size < 1) {
+               fprintf(stderr, "KML is empty\n");
+               exit(1);
+       }
 
-               data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr));
-               if (buf + data->ioc_len > end ) {
-                       fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf,
-                               data->ioc_len, end);
-                       return -1;
-               }
+        hmap = CreateFileMapping(fd, NULL, PAGE_READONLY, 0,0, NULL);
+        start = buf = MapViewOfFile(hmap, FILE_MAP_READ, 0, 0, 0);
+        end = buf + size;
+        CloseHandle(fd);
+        if (start == NULL) {
+               fprintf(stderr, "can't create file mapping\n");
+               exit(1);
+        }
+#endif /* __CYGWIN__ */
+
+       while (buf < end) {
+                struct dump_hdr *dump_hdr = (struct dump_hdr *) buf;
+                struct portal_ioctl_hdr * data;
+                char tmp[8096];
+                int rc;
+
+                line++;
+
+                data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr));
+                if (buf + data->ioc_len > end ) {
+                        fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf,
+                                data->ioc_len, end);
+                        return -1;
+                }
 #if 0
-               printf ("dump_hdr: %lx data: %lx\n",
-                       (unsigned long)dump_hdr - (unsigned long)buf, (unsigned long)data - (unsigned long)buf);
-               
-               printf("%d: opcode %x len: %d  ver: %x ", line, dump_hdr->opc,
-                      data->ioc_len, data->ioc_version);
+                printf ("dump_hdr: %lx data: %lx\n",
+                        (unsigned long)dump_hdr - (unsigned long)buf, (unsigned long)data - (unsigned long)buf);
+
+                printf("%d: opcode %x len: %d  ver: %x ", line, dump_hdr->opc,
+                       data->ioc_len, data->ioc_version);
 #endif
 
-               memcpy(tmp, data, data->ioc_len);
+                memcpy(tmp, data, data->ioc_len);
 
-               rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp);
-               if (rc) {
-                       printf("failed: %d\n", rc);
-                       exit(1);
-               }
+                rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp);
+                if (rc) {
+                        printf("failed: %d\n", rc);
+                        exit(1);
+                }
 
-               buf += data->ioc_len + sizeof(*dump_hdr);
+                buf += data->ioc_len + sizeof(*dump_hdr);
        }
+
+#ifndef __CYGWIN__
+        munmap(start, end - start);
+#else
+        UnmapViewOfFile(start);
+        CloseHandle(hmap);
+#endif
+
        return 0;
 }
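
l_ioctl() no longer special-cases the dump file; every call goes through current_ioc_handler, and set_ioc_handler() lets callers such as liblustre or a test harness swap the handler at run time, with NULL restoring the built-in do_ioctl(). A hedged usage sketch (the header path and the tracing handler are assumptions for illustration):

#include <stdio.h>
#include <portals/ptlctl.h>   /* assumed home of ioc_handler_t, l_ioctl(),
                               * set_ioc_handler() */

/* Hypothetical handler: log the ioctl instead of touching a device node. */
static int trace_ioc_handler(int dev_id, int opc, void *buf)
{
        fprintf(stderr, "ioctl dev %d opc %#x buf %p\n", dev_id, opc, buf);
        return 0;
}

static void divert_ioctls_example(void)
{
        set_ioc_handler(trace_ioc_handler);  /* all l_ioctl() calls now trace */
        /* ... issue portals/lustre configuration calls here ... */
        set_ioc_handler(NULL);               /* back to the built-in do_ioctl() */
}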
 
index 3c7ec20..fb031ae 100644 (file)
 #include <stdarg.h>
 #include <asm/byteorder.h>
 
+#ifdef __CYGWIN__
+
+#include <netinet/in.h>
+
+#warning assuming little endian
+
+#define __cpu_to_le64(x) ((__u64)(x))
+#define __le64_to_cpu(x) ((__u64)(x))
+#define __cpu_to_le32(x) ((__u32)(x))
+#define __le32_to_cpu(x) ((__u32)(x))
+#define __cpu_to_le16(x) ((__u16)(x))
+#define __le16_to_cpu(x) ((__u16)(x))
+
+#endif /* __CYGWIN__ */
 #include <portals/api-support.h>
 #include <portals/ptlctl.h>
 #include <portals/list.h>
@@ -94,6 +109,9 @@ pcfg_ioctl(struct portals_cfg *pcfg)
                 PORTAL_IOC_INIT (data);
                 data.ioc_pbuf1   = (char*)pcfg;
                 data.ioc_plen1   = sizeof(*pcfg);
+                /* XXX liblustre hack XXX */
+                data.ioc_nal_cmd = pcfg->pcfg_command;
+                data.ioc_nid = pcfg->pcfg_nid;
 
                 rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
         }
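
The Cygwin branch above defines the __cpu_to_le*/__le*_to_cpu helpers as identity macros, which is only correct on a little-endian host (hence the #warning). For comparison, a host-order-independent version of one of them might look like this (illustration only, not part of the patch):

#include <stdint.h>

static inline uint32_t sketch_cpu_to_le32(uint32_t x)
{
        union { uint32_t word; unsigned char byte[4]; } u;

        u.byte[0] = (unsigned char)(x);        /* least significant byte first */
        u.byte[1] = (unsigned char)(x >> 8);
        u.byte[2] = (unsigned char)(x >> 16);
        u.byte[3] = (unsigned char)(x >> 24);
        return u.word;                         /* bytes are LE in memory on any host */
}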
index cbdcb10..54abc71 100644 (file)
@@ -8,13 +8,23 @@ tbd         Cluster File Systems, Inc. <info@clusterfs.com>
        - ptlrpcd can be blocked, stopping ALL progress (2477)
        - recovery for initial connections (2355)
        - fixes for mds_cleanup_orphans (1934)
+       - abort_recovery crashes MDS in b_eq (mds_unlink_orphan) (2584)
        - block all file creations until orphan recovery completes (1901)
        - client remove rq_connection from request struct (2423)
       - conf-sanity test_5, proper cleanup in umount log not available (2640)
        - recovery timer race (2670)
       - mdc_close recovery bug (2532)
+       - ptlrpc cleanup bug (2710)
+       - mds timeout on local locks (2588)
+       - namespace lock held during RPCs (2431)
        - don't try to handle a message that hasn't been replied to (2699)
-       - don't fail assertion if in recovery during cleanup (2701)
+       - client assert failure during cleanup after abort recovery (2701)
+       - leak mdc device after failed mount (2712)
+       - ptlrpc_check_set allows timedout requests to complete (2714)
+       - wait for inflight reqs when ptlrpcd finishes (2710)
+       - make sure unregistered services are removed from the srv_list
+       - reset bulk XID's when resending them (caught by 1138 test)
+       - unregister_bulk after timeout
        - fix lconf error (2694)
        * miscellania
        - return LL_SUPER_MAGIC from statfs for the filesystem type (1972)
index 1582666..045bace 100644 (file)
@@ -12,7 +12,7 @@ DIRS24 = ptlbd
 endif
 
 if LIBLUSTRE
-SUBDIRS = portals obdclass lov ptlrpc obdecho  osc utils mdc lvfs #liblustre
+SUBDIRS = portals obdclass lov ptlrpc obdecho osc utils mdc lvfs liblustre
 else
 SUBDIRS = lvfs portals obdclass include $(DIRS24) mds utils obdfilter mdc osc ost 
 SUBDIRS+= llite obdecho lov cobd tests doc scripts conf ptlrpc
index 4107a0c..2f023db 100644 (file)
@@ -73,7 +73,7 @@ AC_OUTPUT([Makefile lvfs/Makefile portals/Makefile portals/Kernelenv \
          portals/knals/scimacnal/Makefile \
          portals/knals/ibnal/Makefile \
           portals/utils/Makefile portals/tests/Makefile portals/doc/Makefile \
-          obdecho/Makefile ptlrpc/Makefile liblustre/Makefile \
+          obdecho/Makefile ptlrpc/Makefile liblustre/Makefile liblustre/tests/Makefile \
          lov/Makefile osc/Makefile mdc/Makefile mds/Makefile ost/Makefile \
          cobd/Makefile ptlbd/Makefile conf/Makefile  tests/Makefile \
          utils/Makefile utils/Lustre/Makefile obdfilter/Makefile \
index 6c6ac1d..0b6da9f 100644 (file)
@@ -111,9 +111,9 @@ static inline void *kmalloc(int size, int prot)
 #define GFP_HIGHUSER 1
 #define GFP_ATOMIC 1
 #define GFP_NOFS 1
-#define IS_ERR(a) (((a) && abs((int)(a)) < 500) ? 1 : 0)
-#define PTR_ERR(a) ((int)(a))
-#define ERR_PTR(a) ((void*)(a))
+#define IS_ERR(a) (((a) && abs((long)(a)) < 500) ? 1 : 0)
+#define PTR_ERR(a) ((long)(a))
+#define ERR_PTR(a) ((void*)((long)(a)))
 
 #define capable(foo) 1
 #define CAP_SYS_ADMIN 1
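
The (long) casts matter because liblustre runs as a 64-bit userspace process, where a pointer truncated to int loses its upper half and PTR_ERR()/ERR_PTR() round trips break. A throwaway illustration (not liblustre code):

#include <stdio.h>

int main(void)
{
        void *p = (void *)0x00007f0000000064UL;  /* plausible 64-bit address */

        /* (int) would keep only the low 32 bits; (long) preserves the value. */
        printf("as int : %#x\n",  (unsigned)(long)p);
        printf("as long: %#lx\n", (unsigned long)p);
        return 0;
}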
@@ -415,6 +415,11 @@ static inline int kmem_cache_destroy(kmem_cache_t *a)
 #define PAGE_CACHE_SHIFT 12
 #define PAGE_CACHE_MASK PAGE_MASK
 
+/* XXX
+ * for the moment, liblustre will not rely on the OST for non-page-aligned writes
+ */
+#define LIBLUSTRE_HANDLE_UNALIGNED_PAGE
+
 struct page {
         void   *addr;
         unsigned long index;
@@ -424,6 +429,9 @@ struct page {
         /* internally used by liblustre file i/o */
         int     _offset;
         int     _count;
+#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE
+        int     _managed;
+#endif
 };
 
 #define kmap(page) (page)->addr
@@ -461,6 +469,7 @@ static inline void __free_pages(struct page *pg, int what)
 }
 
 #define __free_page(page) __free_pages((page), 0)
+#define free_page(page) __free_page(page)
 
 static inline struct page* __grab_cache_page(unsigned long index)
 {
@@ -706,6 +715,12 @@ static inline void del_timer(struct timer_list *l)
         free(l);
 }
 
+#define time_after(a, b)                                        \
+({                                                              \
+        printf("Error: inappropriate call to time_after()\n");  \
+        1;                                                      \
+})
+
 typedef struct { volatile int counter; } atomic_t;
 
 #define atomic_read(a) ((a)->counter)
index 331e8f8..99c1785 100644 (file)
@@ -388,9 +388,6 @@ void ldlm_lock_remove_from_lru(struct ldlm_lock *);
 struct ldlm_lock *ldlm_handle2lock_ns(struct ldlm_namespace *,
                                       struct lustre_handle *);
 
-void *ldlm_put_lock_into_req(struct ptlrpc_request *,
-                                struct lustre_handle *, int);
-
 static inline struct ldlm_lock *ldlm_handle2lock(struct lustre_handle *h)
 {
         return __ldlm_handle2lock(h, 0);
index b949fe1..218807c 100644 (file)
@@ -68,19 +68,21 @@ struct obd_export {
         struct obd_uuid           exp_client_uuid;
         struct list_head          exp_obd_chain;
         struct obd_device        *exp_obd;
-        struct obd_import        *exp_imp_reverse;  /* to make rpc's backwards */
+        struct obd_import        *exp_imp_reverse; /* to make RPCs backwards */
         struct ptlrpc_connection *exp_connection;
         __u32                     exp_conn_cnt;
         struct ldlm_export_data   exp_ldlm_data;
-        struct ptlrpc_request    *exp_outstanding_reply;
+        struct list_head          exp_outstanding_replies;
         time_t                    exp_last_request_time;
         spinlock_t                exp_lock; /* protects flags int below */
-        int                       exp_failed:1;
+        /* ^ protects exp_outstanding_replies too */
         int                       exp_flags;
+        int                       exp_failed:1;
+        int                       exp_libclient:1; /* liblustre client? */
         union {
                 struct mds_export_data    eu_mds_data;
                 struct filter_export_data eu_filter_data;
-                struct ec_export_data     eu_ec_data;         
+                struct ec_export_data     eu_ec_data;
                 struct osc_export_data    eu_osc_data;
         } u;
 };
index 9428296..3fa0a61 100644 (file)
@@ -226,6 +226,7 @@ static inline void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags)
 #define MSG_CONNECT_RECONNECT   0x2
 #define MSG_CONNECT_REPLAYABLE  0x4
 //#define MSG_CONNECT_PEER        0x8
+#define MSG_CONNECT_LIBCLIENT   0x10
 
 /*
  *   OST requests: OBDO & OBD request records
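
MSG_CONNECT_LIBCLIENT lets a liblustre client identify itself in its connect request; target_handle_connect() later in this commit checks the flag and sets exp_libclient on the export. A hedged sketch of the client half, using the lustre_msg_set_op_flags() helper visible in the context above (the wrapper function and include choices are assumptions):

#include <linux/lustre_idl.h>   /* lustre_msg_set_op_flags(), MSG_CONNECT_LIBCLIENT */
#include <linux/lustre_net.h>   /* struct ptlrpc_request */

/* Hypothetical helper: tag an outgoing connect request as coming from a
 * liblustre (userspace) client. */
static void mark_connect_libclient(struct ptlrpc_request *req)
{
        lustre_msg_set_op_flags(req->rq_reqmsg, MSG_CONNECT_LIBCLIENT);
}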
index bb8900e..03a011a 100644 (file)
 #include <linux/lustre_import.h>
 #include <linux/lprocfs_status.h>
 
+/* Size above which service request buffers are OBD_VMALLOC()ed rather
+ * than OBD_ALLOC()ed */
+#define SVC_BUF_VMALLOC_THRESHOLD (2*PAGE_SIZE)
+
 /* The following constants determine how much memory is devoted to
  * buffering in the lustre services.
  *
  * total memory = ?_NBUFS * ?_BUFSIZE
  *
  * ?_MAXREQSIZE         # maximum request service will receive
- * larger messages will get dropped.
+ * messages larger than ?_MAXREQSIZE are dropped.
  * request buffers are auto-unlinked when less than ?_MAXREQSIZE
  * is left in them.
  */
 
 #define LDLM_NUM_THREADS        min(smp_num_cpus * smp_num_cpus * 8, 64)
-#define LDLM_NEVENT_MAX 8192UL
-#define LDLM_NEVENTS    min_t(unsigned long, num_physpages / 64,  \
-                              LDLM_NEVENT_MAX)
 #define LDLM_NBUF_MAX   256UL
-#define LDLM_NBUFS      min(LDLM_NEVENTS / 16, LDLM_NBUF_MAX)
 #define LDLM_BUFSIZE    (8 * 1024)
 #define LDLM_MAXREQSIZE (5 * 1024)
+#define LDLM_MAXMEM      (num_physpages*(PAGE_SIZE/1024))
+#define LDLM_NBUFS       min(LDLM_MAXMEM/LDLM_BUFSIZE, LDLM_NBUF_MAX)
 
 #define MDT_MAX_THREADS 32UL
 #define MDT_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \
                                   MDT_MAX_THREADS), 2UL)
-#define MDS_NEVENT_MAX  8192UL
-#define MDS_NEVENTS     min_t(unsigned long, num_physpages / 64, \
-                              MDS_NEVENT_MAX)
 #define MDS_NBUF_MAX    512UL
-#define MDS_NBUFS       min(MDS_NEVENTS / 16, MDS_NBUF_MAX)
 #define MDS_BUFSIZE     (8 * 1024)
 /* Assume file name length = FNAME_MAX = 256 (true for extN).
  *        path name length = PATH_MAX = 4096
  * except in the open case where there are a large number of OSTs in a LOV.
  */
 #define MDS_MAXREQSIZE  (5 * 1024)
+#define MDS_MAXMEM      (num_physpages*(PAGE_SIZE/512))
+#define MDS_NBUFS       min(MDS_MAXMEM/MDS_BUFSIZE, MDS_NBUF_MAX)
 
 #define OST_MAX_THREADS 36UL
 #define OST_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \
                                   OST_MAX_THREADS), 2UL)
-#define OST_NEVENT_MAX  16384UL
-#define OST_NEVENTS     min_t(unsigned long, num_physpages / 16, \
-                              OST_NEVENT_MAX)
 #define OST_NBUF_MAX    5000UL
-#define OST_NBUFS       min(OST_NEVENTS / 2, OST_NBUF_MAX)
 #define OST_BUFSIZE     (8 * 1024)
 /* OST_MAXREQSIZE ~= 1640 bytes =
  * lustre_msg + obdo + 16 * obd_ioobj + 64 * niobuf_remote
  * - OST_MAXREQSIZE must be at least 1 page of cookies plus some spillover
  */
 #define OST_MAXREQSIZE  (5 * 1024)
+#define OST_MAXMEM      (num_physpages*(PAGE_SIZE/512))
+#define OST_NBUFS       min(OST_MAXMEM/OST_BUFSIZE, OST_NBUF_MAX)
 
 #define PTLBD_NUM_THREADS        4
-#define PTLBD_NEVENTS    1024
 #define PTLBD_NBUFS      20
 #define PTLBD_BUFSIZE    (32 * 1024)
 #define PTLBD_MAXREQSIZE 1024
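
With the *_NEVENTS knobs gone, the number of posted request buffers now falls out of a memory budget. As a rough worked example (assuming 4 kB pages on a node with 1 GB of RAM, i.e. num_physpages = 262144): MDS_MAXMEM = 262144 * (4096/512) = 2 MB, so MDS_NBUFS = min(2 MB / 8 kB, 512) = 256 buffers; LDLM gets half that budget (PAGE_SIZE/1024), so LDLM_NBUFS = min(1 MB / 8 kB, 256) = 128; OST_NBUFS = min(2 MB / 8 kB, 5000) = 256. The NBUF_MAX ceilings only come into play above roughly 2 GB of RAM (about 20 GB for the OST).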
@@ -198,21 +195,66 @@ struct ptlrpc_request_set {
 
 struct ptlrpc_bulk_desc;
 
+/*
+ * ptlrpc callback & work item stuff
+ */
+struct ptlrpc_cb_id {
+        void   (*cbid_fn)(ptl_event_t *ev);     /* specific callback fn */
+        void    *cbid_arg;                      /* additional arg */
+};
+
+#define RS_MAX_LOCKS 4
+#define RS_DEBUG     1
+
+struct ptlrpc_reply_state {
+        struct ptlrpc_cb_id   rs_cb_id;
+        struct list_head      rs_list;
+        struct list_head      rs_exp_list;
+        struct list_head      rs_obd_list;
+#if RS_DEBUG
+        struct list_head      rs_debug_list;
+#endif
+        /* updates to following flag serialised by srv_request_lock */
+        unsigned int          rs_difficult:1;   /* ACK/commit stuff */
+        unsigned int          rs_scheduled:1;   /* being handled? */
+        unsigned int          rs_scheduled_ever:1; /* any schedule attempts? */
+        unsigned int          rs_handled:1;     /* been handled yet? */
+        unsigned int          rs_on_net:1;      /* reply_out_callback pending? */
+
+        int                   rs_size;
+        __u64                 rs_transno;
+        __u64                 rs_xid;
+        struct obd_export    *rs_export;
+        struct ptlrpc_srv_ni *rs_srv_ni;
+        ptl_handle_md_t       rs_md_h;
+
+        /* locks awaiting client reply ACK */
+        int                   rs_nlocks;
+        struct lustre_handle  rs_locks[RS_MAX_LOCKS];
+        ldlm_mode_t           rs_modes[RS_MAX_LOCKS];
+        /* last member: variable sized reply message */
+        struct lustre_msg     rs_msg;
+};
+
 struct ptlrpc_request {
         int rq_type; /* one of PTL_RPC_MSG_* */
         struct list_head rq_list;
         int rq_status;
         spinlock_t rq_lock;
-        unsigned int rq_intr:1, rq_replied:1, rq_want_ack:1, rq_err:1,
+        /* client-side flags */
+        unsigned int rq_intr:1, rq_replied:1, rq_err:1,
             rq_timedout:1, rq_resend:1, rq_restart:1, rq_replay:1,
-            rq_no_resend:1, rq_resent:1, rq_waiting:1, rq_receiving_reply:1;
+            rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1;
         int rq_phase;
-                
+        /* client-side refcount for SENT race */
         atomic_t rq_refcount;
 
         int rq_request_portal; /* XXX FIXME bug 249 */
         int rq_reply_portal; /* XXX FIXME bug 249 */
 
+        /* client-side # reply bytes actually received  */
+        int rq_nob_received;
+
         int rq_reqlen;
         struct lustre_msg *rq_reqmsg;
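
Each network object now carries a struct ptlrpc_cb_id instead of relying on a per-purpose event queue; the per-NI handles collapse into the single pni_eq_h further down, and the new request_out_callback()/reply_in_callback()/... externs are the per-type handlers. A sketch of the kind of dispatch this enables (illustrative only; the exact event field name is an assumption, not copied from events.c):

#include <linux/lustre_net.h>   /* struct ptlrpc_cb_id, as added by this patch */

/* One master callback per event queue fans events out to the handler
 * recorded in the MD's user pointer. */
static void master_callback_sketch(ptl_event_t *ev)
{
        struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr;  /* assumed field */

        cbid->cbid_fn(ev);      /* e.g. reply_in_callback(); cbid_arg carries extra state */
}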
 
@@ -230,20 +272,25 @@ struct ptlrpc_request {
 
         int rq_import_generation;
         enum lustre_imp_state rq_send_state;
-        wait_queue_head_t rq_reply_waitq; /* XXX also _for_ack */
 
-        /* incoming reply */
-        ptl_md_t rq_reply_md;
-        ptl_handle_md_t rq_reply_md_h;
-
-        /* outgoing req/rep */
-        ptl_md_t rq_req_md;
+        /* client+server request */
+        ptl_handle_md_t      rq_req_md_h;
+        struct ptlrpc_cb_id  rq_req_cbid;
 
+        /* server-side... */
+        struct timeval                     rq_arrival_time; /* request arrival time */
+        struct ptlrpc_reply_state         *rq_reply_state; /* separated reply state */
+        struct ptlrpc_request_buffer_desc *rq_rqbd; /* incoming request buffer */
+        
+        /* client-only incoming reply */
+        ptl_handle_md_t      rq_reply_md_h;
+        wait_queue_head_t    rq_reply_waitq;
+        struct ptlrpc_cb_id  rq_reply_cbid;
+        
         struct ptlrpc_peer rq_peer; /* XXX see service.c can this be factored away? */
         struct obd_export *rq_export;
         struct obd_import *rq_import;
-        struct ptlrpc_service *rq_svc;
-
+        
         void (*rq_replay_cb)(struct ptlrpc_request *);
         void (*rq_commit_cb)(struct ptlrpc_request *);
         void  *rq_cb_data;
@@ -256,17 +303,11 @@ struct ptlrpc_request {
         struct ptlrpc_request_set *rq_set;
         void *rq_interpret_reply;               /* Async completion handler */
         union ptlrpc_async_args rq_async_args;  /* Async completion context */
-
-        /* Only used on the server side for tracking acks. */
-        struct ptlrpc_req_ack_lock {
-                struct lustre_handle lock;
-                __u32                mode;
-        } rq_ack_locks[REQ_MAX_ACK_LOCKS];
 };
 
 
 #define RQ_PHASE_NEW           0xebc0de00
-#define RQ_PHASE_RPC          0xebc0de01
+#define RQ_PHASE_RPC           0xebc0de01
 #define RQ_PHASE_BULK          0xebc0de02
 #define RQ_PHASE_INTERPRET     0xebc0de03
 #define RQ_PHASE_COMPLETE      0xebc0de04
@@ -276,20 +317,19 @@ struct ptlrpc_request {
 
 #define PTLRPC_REQUEST_COMPLETE(req) ((req)->rq_phase > RQ_PHASE_RPC)
 
-#define DEBUG_REQ_FLAGS(req)                                                   \
-        ((req->rq_phase == RQ_PHASE_NEW) ? "New" :                             \
-         (req->rq_phase == RQ_PHASE_RPC) ? "RPC" :                             \
-         (req->rq_phase == RQ_PHASE_INTERPRET) ? "Interpret" :                 \
-         (req->rq_phase == RQ_PHASE_COMPLETE) ? "Complete" :                   \
-         (req->rq_phase == RQ_PHASE_BULK) ? "Bulk" : "?phase?"),               \
-        FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"),                   \
-        FLAG(req->rq_want_ack, "A"), FLAG(req->rq_err, "E"),                   \
-        FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"),  \
-        FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"),                 \
-        FLAG(req->rq_no_resend, "N"), FLAG(req->rq_resent, "s"),               \
+#define DEBUG_REQ_FLAGS(req)                                                    \
+        ((req->rq_phase == RQ_PHASE_NEW) ? "New" :                              \
+         (req->rq_phase == RQ_PHASE_RPC) ? "Rpc" :                              \
+         (req->rq_phase == RQ_PHASE_INTERPRET) ? "Interpret" :                  \
+         (req->rq_phase == RQ_PHASE_COMPLETE) ? "Complete" : "?phase?"),        \
+        FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"),                    \
+        FLAG(req->rq_err, "E"),                                                 \
+        FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"),   \
+        FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"),                  \
+        FLAG(req->rq_no_resend, "N"),                                           \
         FLAG(req->rq_waiting, "W")
 
-#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s"
+#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s"
 
 #define DEBUG_REQ(level, req, fmt, args...)                                    \
 do {                                                                           \
@@ -312,20 +352,19 @@ CDEBUG(level, "@@@ " fmt                                                       \
 } while (0)
 
 struct ptlrpc_bulk_page {
-        struct ptlrpc_bulk_desc *bp_desc;
         struct list_head bp_link;
         int bp_buflen;
         int bp_pageoffset;                      /* offset within a page */
         struct page *bp_page;
 };
 
-#define BULK_GET_SOURCE          0
+#define BULK_GET_SOURCE   0
 #define BULK_PUT_SINK     1
 #define BULK_GET_SINK     2
 #define BULK_PUT_SOURCE   3
 
 struct ptlrpc_bulk_desc {
-        unsigned int bd_complete:1;
+        unsigned int bd_success:1;              /* completed successfully */
         unsigned int bd_network_rw:1;           /* accessible to the network */
         unsigned int bd_type:2;                 /* {put,get}{source,sink} */
         unsigned int bd_registered:1;           /* client side */
@@ -335,17 +374,17 @@ struct ptlrpc_bulk_desc {
         struct obd_import *bd_import;
         __u32 bd_portal;
         struct ptlrpc_request *bd_req;          /* associated request */
-        wait_queue_head_t bd_waitq;             /* server side only WQ */
-        struct list_head bd_page_list;
-        __u32 bd_page_count;
-        __u32 bd_last_xid;
-        
-        ptl_md_t bd_md;
-        ptl_handle_md_t bd_md_h;
-        ptl_handle_me_t bd_me_h;
+        wait_queue_head_t      bd_waitq;        /* server side only WQ */
+        int                    bd_page_count;   /* # pages (== entries in bd_iov) */
+        int                    bd_max_pages;    /* allocated size of bd_iov */
+        int                    bd_nob;          /* # bytes covered */
+        int                    bd_nob_transferred; /* # bytes GOT/PUT */
 
-        int bd_callback_count;                  /* server side callbacks */
+        __u64                  bd_last_xid;
 
+        struct ptlrpc_cb_id    bd_cbid;         /* network callback info */
+        ptl_handle_md_t        bd_md_h;         /* associated MD */
+        
 #ifdef __KERNEL__
         ptl_kiov_t bd_iov[PTL_MD_MAX_IOV];
 #else
@@ -363,9 +402,12 @@ struct ptlrpc_thread {
 struct ptlrpc_request_buffer_desc {
         struct list_head       rqbd_list;
         struct ptlrpc_srv_ni  *rqbd_srv_ni;
-        ptl_handle_me_t        rqbd_me_h;
-        atomic_t               rqbd_refcount;
+        ptl_handle_md_t        rqbd_md_h;
+        int                    rqbd_refcount;
+        int                    rqbd_eventcount;
         char                  *rqbd_buffer;
+        struct ptlrpc_cb_id    rqbd_cbid;
+        struct ptlrpc_request  rqbd_req;
 };
 
 /* event queues are per-ni, because one day we may get a hardware
@@ -376,57 +418,64 @@ struct ptlrpc_ni { /* Generic interface state */
         char                   *pni_name;
         int                     pni_number;
         ptl_handle_ni_t         pni_ni_h;
-        ptl_handle_eq_t         pni_request_out_eq_h;
-        ptl_handle_eq_t         pni_reply_in_eq_h;
-        ptl_handle_eq_t         pni_reply_out_eq_h;
-        ptl_handle_eq_t         pni_bulk_put_source_eq_h;
-        ptl_handle_eq_t         pni_bulk_put_sink_eq_h;
-        ptl_handle_eq_t         pni_bulk_get_source_eq_h;
-        ptl_handle_eq_t         pni_bulk_get_sink_eq_h;
+        ptl_handle_eq_t         pni_eq_h;
 };
 
 struct ptlrpc_srv_ni {
         /* Interface-specific service state */
         struct ptlrpc_service  *sni_service;    /* owning service */
         struct ptlrpc_ni       *sni_ni;         /* network interface */
-        ptl_handle_eq_t         sni_eq_h;       /* event queue handle */
-        struct list_head        sni_rqbds;      /* all the request buffer descriptors */
-        __u32                   sni_nrqbds;     /* # request buffers */
-        atomic_t                sni_nrqbds_receiving; /* # request buffers posted */
+        struct list_head        sni_rqbds;      /* all the request buffers */
+        struct list_head        sni_active_replies; /* all the active replies */
+        int                     sni_nrqbd_receiving; /* # posted request buffers */
 };
 
-struct ptlrpc_service {
-        time_t srv_time;
-        time_t srv_timeout;
-
-        struct list_head srv_ni_list;          /* list of interfaces */
-        __u32            srv_max_req_size;     /* biggest request to receive */
-        __u32            srv_buf_size;         /* # bytes in a request buffer */
+typedef int (*svc_handler_t)(struct ptlrpc_request *req);
 
+struct ptlrpc_service {
+        struct list_head srv_list;              /* chain thru all services */
+        int              srv_max_req_size;      /* biggest request to receive */
+        int              srv_buf_size;          /* size of individual buffers */
+        int              srv_nbufs;             /* total # req buffer descs allocated */
+        int              srv_nthreads;          /* # running threads */
+        int              srv_n_difficult_replies; /* # 'difficult' replies */
+        int              srv_n_active_reqs;     /* # reqs being served */
+        
         __u32 srv_req_portal;
         __u32 srv_rep_portal;
 
-        __u32 srv_xid;
+        int               srv_n_queued_reqs;    /* # reqs waiting to be served */
+        struct list_head  srv_request_queue;    /* reqs waiting for service */
+
+        atomic_t          srv_outstanding_replies;
+        struct list_head  srv_reply_queue;      /* replies waiting for service */
 
         wait_queue_head_t srv_waitq; /* all threads sleep on this */
 
-        spinlock_t srv_lock;
-        struct list_head srv_threads;
-        int (*srv_handler)(struct ptlrpc_request *req);
+        struct list_head   srv_threads;
+        struct obd_device *srv_obddev;
+        svc_handler_t      srv_handler;
+        
         char *srv_name;  /* only statically allocated strings here; we don't clean them */
-        struct proc_dir_entry *srv_procroot;
-        struct lprocfs_stats  *srv_stats;
 
-        int                  srv_interface_rover;
+        spinlock_t               srv_lock;
+
+        struct proc_dir_entry   *srv_procroot;
+        struct lprocfs_stats    *srv_stats;
+        
         struct ptlrpc_srv_ni srv_interfaces[0];
 };
 
-typedef int (*svc_handler_t)(struct ptlrpc_request *req);
-
 /* ptlrpc/events.c */
 extern struct ptlrpc_ni ptlrpc_interfaces[];
 extern int              ptlrpc_ninterfaces;
 extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, struct ptlrpc_peer *peer);
+extern void request_out_callback (ptl_event_t *ev);
+extern void reply_in_callback(ptl_event_t *ev);
+extern void client_bulk_callback (ptl_event_t *ev);
+extern void request_in_callback(ptl_event_t *ev);
+extern void reply_out_callback(ptl_event_t *ev);
+extern void server_bulk_callback (ptl_event_t *ev);
 
 /* ptlrpc/connection.c */
 void ptlrpc_dump_connections(void);
@@ -439,28 +488,28 @@ void ptlrpc_init_connection(void);
 void ptlrpc_cleanup_connection(void);
 
 /* ptlrpc/niobuf.c */
-int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *);
-int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *);
-void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *bulk);
+int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc);
+void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc);
 int ptlrpc_register_bulk(struct ptlrpc_request *req);
 void ptlrpc_unregister_bulk (struct ptlrpc_request *req);
 
-static inline int ptlrpc_bulk_complete (struct ptlrpc_bulk_desc *desc) 
+static inline int ptlrpc_bulk_active (struct ptlrpc_bulk_desc *desc) 
 {
         unsigned long flags;
         int           rc;
 
         spin_lock_irqsave (&desc->bd_lock, flags);
-        rc = desc->bd_complete;
+        rc = desc->bd_network_rw;
         spin_unlock_irqrestore (&desc->bd_lock, flags);
         return (rc);
 }
 
+int ptlrpc_send_reply(struct ptlrpc_request *req, int);
 int ptlrpc_reply(struct ptlrpc_request *req);
 int ptlrpc_error(struct ptlrpc_request *req);
 void ptlrpc_resend_req(struct ptlrpc_request *request);
 int ptl_send_rpc(struct ptlrpc_request *request);
-void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd);
+void ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd);
 
 /* ptlrpc/client.c */
 void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
@@ -468,6 +517,39 @@ void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
 void ptlrpc_cleanup_client(struct obd_import *imp);
 struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid);
 
+static inline int
+ptlrpc_client_receiving_reply (struct ptlrpc_request *req)
+{
+        unsigned long flags;
+        int           rc;
+        
+        spin_lock_irqsave(&req->rq_lock, flags);
+        rc = req->rq_receiving_reply;
+        spin_unlock_irqrestore(&req->rq_lock, flags);
+        return (rc);
+}
+
+static inline int
+ptlrpc_client_replied (struct ptlrpc_request *req)
+{
+        unsigned long flags;
+        int           rc;
+        
+        spin_lock_irqsave(&req->rq_lock, flags);
+        rc = req->rq_replied;
+        spin_unlock_irqrestore(&req->rq_lock, flags);
+        return (rc);
+}
+
+static inline void
+ptlrpc_wake_client_req (struct ptlrpc_request *req)
+{
+        if (req->rq_set == NULL)
+                wake_up(&req->rq_reply_waitq);
+        else
+                wake_up(&req->rq_set->set_waitq);
+}
+
 int ptlrpc_queue_wait(struct ptlrpc_request *req);
 int ptlrpc_replay_req(struct ptlrpc_request *req);
 void ptlrpc_unregister_reply(struct ptlrpc_request *req);
@@ -493,28 +575,32 @@ void ptlrpc_req_finished(struct ptlrpc_request *request);
 void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request);
 struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req);
 struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp (struct ptlrpc_request *req,
-                                               int type, int portal);
+                                               int npages, int type, int portal);
 struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req,
-                                              int type, int portal);
+                                              int npages, int type, int portal);
 void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk);
-int ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
-                          struct page *page, int pageoffset, int len);
-void ptlrpc_free_bulk_page(struct ptlrpc_bulk_page *page);
+void ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
+                           struct page *page, int pageoffset, int len);
 void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
                                       struct obd_import *imp);
 __u64 ptlrpc_next_xid(void);
 
 /* ptlrpc/service.c */
-struct ptlrpc_service *
-ptlrpc_init_svc(__u32 nevents, __u32 nbufs, __u32 bufsize, __u32 max_req_size,
-                int req_portal, int rep_portal, svc_handler_t, char *name,
-                struct proc_dir_entry *proc_entry);
+void ptlrpc_save_lock (struct ptlrpc_request *req, 
+                       struct lustre_handle *lock, int mode);
+void ptlrpc_commit_replies (struct obd_device *obd);
+void ptlrpc_schedule_difficult_reply (struct ptlrpc_reply_state *rs);
+struct ptlrpc_service *ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size,
+                                       int req_portal, int rep_portal, 
+                                       svc_handler_t, char *name,
+                                       struct proc_dir_entry *proc_entry);
 void ptlrpc_stop_all_threads(struct ptlrpc_service *svc);
 int ptlrpc_start_n_threads(struct obd_device *dev, struct ptlrpc_service *svc,
                            int cnt, char *base_name);
 int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
                         char *name);
 int ptlrpc_unregister_service(struct ptlrpc_service *service);
+int liblustre_check_services (void *arg);
 
 struct ptlrpc_svc_data {
         char *name;
@@ -535,6 +621,7 @@ int lustre_pack_request(struct ptlrpc_request *, int count, int *lens,
                         char **bufs);
 int lustre_pack_reply(struct ptlrpc_request *, int count, int *lens,
                       char **bufs);
+void lustre_free_reply_state(struct ptlrpc_reply_state *rs);
 int lustre_msg_size(int count, int *lengths);
 int lustre_unpack_msg(struct lustre_msg *m, int len);
 void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen);
@@ -571,7 +658,6 @@ void ptlrpc_lprocfs_unregister_obd(struct obd_device *obddev);
 #endif
 
 /* ptlrpc/llog_server.c */
-struct llog_obd_ctxt;
 int llog_origin_handle_create(struct ptlrpc_request *req);
 int llog_origin_handle_next_block(struct ptlrpc_request *req);
 int llog_origin_handle_read_header(struct ptlrpc_request *req);
index 619010b..ec90c84 100644 (file)
@@ -480,7 +480,8 @@ struct obd_device {
         int                              obd_replayed_requests;
         int                              obd_requests_queued_for_recovery;
         wait_queue_head_t                obd_next_transno_waitq;
-        wait_queue_head_t                obd_commit_waitq;
+        struct list_head                 obd_uncommitted_replies;
+        spinlock_t                       obd_uncommitted_replies_lock;
         struct timer_list                obd_recovery_timer;
         struct list_head                 obd_recovery_queue;
         struct list_head                 obd_delayed_reply_queue;
@@ -666,7 +667,7 @@ static inline void obd_transno_commit_cb(struct obd_device *obd, __u64 transno,
                obd->obd_name, transno);
         if (transno > obd->obd_last_committed) {
                 obd->obd_last_committed = transno;
-                wake_up(&obd->obd_commit_waitq);
+                ptlrpc_commit_replies (obd);
         }
 }
 
index 6293972..ff8d63b 100644 (file)
 +++ 25/arch/parisc/lib/checksum.c      2003-10-05 00:33:23.000000000 -0700
 @@ -16,8 +16,10 @@
   *
-  * $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
+  * $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
   */
 -#include <net/checksum.h>
 +#include <linux/module.h>
 --- linux-2.6.0-test6/drivers/char/ftape/compressor/zftape-compress.c  2003-06-14 12:18:32.000000000 -0700
 +++ 25/drivers/char/ftape/compressor/zftape-compress.c 2003-10-05 00:33:24.000000000 -0700
 @@ -31,6 +31,7 @@
-  char zftc_rev[] = "$Revision: 1.3 $";
-  char zftc_dat[] = "$Date: 2003/12/03 05:12:20 $";
+  char zftc_rev[] = "$Revision: 1.4 $";
+  char zftc_dat[] = "$Date: 2004/02/14 03:14:33 $";
  
 +#include <linux/version.h>
  #include <linux/errno.h>
 --- linux-2.6.0-test6/drivers/isdn/hardware/eicon/divamnt.c    2003-09-27 18:57:44.000000000 -0700
 +++ 25/drivers/isdn/hardware/eicon/divamnt.c   2003-10-05 00:33:24.000000000 -0700
 @@ -1,4 +1,4 @@
--/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
-+/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
+-/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
++/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
   *
   * Driver for Eicon DIVA Server ISDN cards.
   * Maint module
 -#include "di_defs.h"
  #include "debug_if.h"
  
--static char *main_revision = "$Revision: 1.3 $";
-+static char *main_revision = "$Revision: 1.3 $";
+-static char *main_revision = "$Revision: 1.4 $";
++static char *main_revision = "$Revision: 1.4 $";
  
  static int major;
  
 --- linux-2.6.0-test6/drivers/isdn/hardware/eicon/divasmain.c  2003-09-27 18:57:44.000000000 -0700
 +++ 25/drivers/isdn/hardware/eicon/divasmain.c 2003-10-05 00:33:24.000000000 -0700
 @@ -1,4 +1,4 @@
--/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
-+/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
+-/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
++/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
   *
   * Low level driver for Eicon DIVA Server ISDN cards.
   *
  #include "diva_dma.h"
  #include "diva_pci.h"
  
--static char *main_revision = "$Revision: 1.3 $";
-+static char *main_revision = "$Revision: 1.3 $";
+-static char *main_revision = "$Revision: 1.4 $";
++static char *main_revision = "$Revision: 1.4 $";
  
  static int major;
  
 --- linux-2.6.0-test6/drivers/isdn/hardware/eicon/dqueue.c     2003-06-14 12:18:22.000000000 -0700
 +++ 25/drivers/isdn/hardware/eicon/dqueue.c    2003-10-05 00:33:24.000000000 -0700
 @@ -1,10 +1,10 @@
--/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
-+/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
+-/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
++/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
   *
   * Driver for Eicon DIVA Server ISDN cards.
   * User Mode IDI Interface
 --- linux-2.6.0-test6/drivers/isdn/hardware/eicon/mntfunc.c    2003-09-27 18:57:44.000000000 -0700
 +++ 25/drivers/isdn/hardware/eicon/mntfunc.c   2003-10-05 00:33:24.000000000 -0700
 @@ -1,4 +1,4 @@
--/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
-+/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
+-/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
++/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
   *
   * Driver for Eicon DIVA Server ISDN cards.
   * Maint module
 --- linux-2.6.0-test6/drivers/isdn/hardware/eicon/os_capi.h    2003-06-14 12:18:25.000000000 -0700
 +++ 25/drivers/isdn/hardware/eicon/os_capi.h   2003-10-05 00:33:24.000000000 -0700
 @@ -1,10 +1,10 @@
--/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
-+/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
+-/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
++/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
   *
   * ISDN interface module for Eicon active cards DIVA.
   * CAPI Interface OS include files 
 --- linux-2.6.0-test6/drivers/isdn/hardware/eicon/platform.h   2003-09-27 18:57:44.000000000 -0700
 +++ 25/drivers/isdn/hardware/eicon/platform.h  2003-10-05 00:33:24.000000000 -0700
 @@ -1,4 +1,4 @@
--/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
-+/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
+-/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
++/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
   *
   * platform.h
   * 
 +++ 25/drivers/media/video/planb.c     2003-10-05 00:33:24.000000000 -0700
 @@ -27,7 +27,6 @@
  
- /* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ */
+ /* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ */
  
 -#include <linux/version.h>
  #include <linux/init.h>
 --- linux-2.6.0-test6/drivers/mtd/chips/map_rom.c      2003-06-14 12:18:24.000000000 -0700
 +++ 25/drivers/mtd/chips/map_rom.c     2003-10-05 00:33:24.000000000 -0700
 @@ -4,7 +4,6 @@
-  * $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
+  * $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
   */
  
 -#include <linux/version.h>
  #include <linux/hdlc.h>
  
  /* Version */
--static const char version[] = "$Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ for Linux\n";
-+static const char version[] = "$Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ for Linux\n";
+-static const char version[] = "$Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ for Linux\n";
++static const char version[] = "$Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ for Linux\n";
  static int debug;
  static int quartz;
  
index 5411d9c..54d1f68 100644 (file)
@@ -1,4 +1,4 @@
-$Id: bproc-patch-2.4.20,v 1.3 2003/12/03 05:12:25 phil Exp $
+$Id: bproc-patch-2.4.20,v 1.4 2004/02/14 03:14:37 rread Exp $
 
 Index: linux/fs/exec.c
 ===================================================================
@@ -764,7 +764,7 @@ Index: linux/kernel/bproc_hook.c
 + *  along with this program; if not, write to the Free Software
 + *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 + *
-+ * $Id: bproc-patch-2.4.20,v 1.3 2003/12/03 05:12:25 phil Exp $
++ * $Id: bproc-patch-2.4.20,v 1.4 2004/02/14 03:14:37 rread Exp $
 + *-----------------------------------------------------------------------*/
 +#include <linux/kernel.h>
 +#include <linux/sched.h>
@@ -832,7 +832,7 @@ Index: linux/include/linux/bproc.h
 + *  along with this program; if not, write to the Free Software
 + *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 + *
-+ * $Id: bproc-patch-2.4.20,v 1.3 2003/12/03 05:12:25 phil Exp $
++ * $Id: bproc-patch-2.4.20,v 1.4 2004/02/14 03:14:37 rread Exp $
 + *-----------------------------------------------------------------------*/
 +#ifndef _LINUX_BPROC_H
 +#define _LINUX_BPROC_H
index 05fcf61..818596c 100644 (file)
@@ -1,7 +1,7 @@
 Index: linux-2.4.20/fs/ext3/xattr.c
 ===================================================================
---- linux-2.4.20.orig/fs/ext3/xattr.c  2003-11-13 17:14:52.000000000 +0300
-+++ linux-2.4.20/fs/ext3/xattr.c       2003-11-21 16:43:48.000000000 +0300
+--- linux-2.4.20.orig/fs/ext3/xattr.c  2003-11-13 10:59:33.000000000 +0800
++++ linux-2.4.20/fs/ext3/xattr.c       2003-11-25 21:16:51.000000000 +0800
 @@ -1293,9 +1293,10 @@
                                goto cleanup;
                        memcpy(header, HDR(bh), bh->b_size);
index 768f778..0da12fc 100644 (file)
@@ -13,6 +13,7 @@ if LIBLUSTRE
 lib_LIBRARIES = libldlm.a
 libldlm_a_SOURCES = l_lock.c ldlm_lock.c ldlm_resource.c ldlm_lib.c \
   ldlm_plain.c ldlm_extent.c ldlm_request.c ldlm_lockd.c ldlm_internal.h
+libldlm_a_CFLAGS = -fPIC
 endif
 
 include $(top_srcdir)/Rules
index 4b7eb3b..d1e2b49 100644 (file)
@@ -497,6 +497,10 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
         export = req->rq_export = class_conn2export(&conn);
         LASSERT(export != NULL);
 
+        /* request from liblustre? */
+        if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT)
+                export->exp_libclient = 1;
+
         if (export->exp_connection != NULL)
                 ptlrpc_put_connection(export->exp_connection);
         export->exp_connection = ptlrpc_get_connection(&req->rq_peer,
@@ -888,6 +892,8 @@ int target_queue_final_reply(struct ptlrpc_request *req, int rc)
         int recovery_done = 0;
         int rc2;
 
+        LASSERT ((rc == 0) == (req->rq_reply_state != NULL));
+
         if (rc) {
                 /* Just like ptlrpc_error, but without the sending. */
                 rc = lustre_pack_reply(req, 0, NULL, NULL);
@@ -895,6 +901,7 @@ int target_queue_final_reply(struct ptlrpc_request *req, int rc)
                 req->rq_type = PTL_RPC_MSG_ERR;
         }
 
+        LASSERT (!req->rq_reply_state->rs_difficult);
         LASSERT(list_empty(&req->rq_list));
         /* XXX a bit like the request-dup code in queue_recovery_request */
         OBD_ALLOC(saved_req, sizeof *saved_req);
@@ -905,6 +912,8 @@ int target_queue_final_reply(struct ptlrpc_request *req, int rc)
                 LBUG();
         memcpy(saved_req, req, sizeof *saved_req);
         memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen);
+        /* the copied req takes over the reply state */
+        req->rq_reply_state = NULL;
         req = saved_req;
         req->rq_reqmsg = reqmsg;
         class_export_get(req->rq_export);
@@ -954,180 +963,131 @@ int target_queue_final_reply(struct ptlrpc_request *req, int rc)
         return 1;
 }
 
-static void ptlrpc_abort_reply (struct ptlrpc_request *req)
-{
-        /* On return, we must be sure that the ACK callback has either
-         * happened or will not happen.  Note that the SENT callback will
-         * happen come what may since we successfully posted the PUT. */
-        int rc;
-        struct l_wait_info lwi;
-        unsigned long flags;
-
- again:
-        /* serialise with ACK callback */
-        spin_lock_irqsave (&req->rq_lock, flags);
-        if (!req->rq_want_ack) {
-                spin_unlock_irqrestore (&req->rq_lock, flags);
-                /* The ACK callback has happened already.  Although the
-                 * SENT callback might still be outstanding (yes really) we
-                 * don't care; this is just like normal completion. */
-                return;
-        }
-        spin_unlock_irqrestore (&req->rq_lock, flags);
-
-        /* Have a bash at unlinking the MD.  This will fail until the SENT
-         * callback has happened since the MD is busy from the PUT.  If the
-         * ACK still hasn't arrived after then, a successful unlink will
-         * ensure the ACK callback never happens. */
-        rc = PtlMDUnlink (req->rq_reply_md_h);
-        switch (rc) {
-        default:
-                LBUG ();
-        case PTL_OK:
-                /* SENT callback happened; ACK callback preempted */
-                LASSERT (req->rq_want_ack);
-                spin_lock_irqsave (&req->rq_lock, flags);
-                req->rq_want_ack = 0;
-                spin_unlock_irqrestore (&req->rq_lock, flags);
-                return;
-        case PTL_INV_MD:
-                return;
-        case PTL_MD_INUSE:
-                /* Still sending or ACK callback in progress: wait until
-                 * either callback has completed and try again.
-                 * Actually we can't wait for the SENT callback because
-                 * there's no state the SENT callback can touch that will
-                 * allow it to communicate with us!  So we just wait here
-                 * for a short time, effectively polling for the SENT
-                 * callback by calling PtlMDUnlink() again, to see if it
-                 * has finished.  Note that if the ACK does arrive, its
-                 * callback wakes us in short order. --eeb */
-                lwi = LWI_TIMEOUT (HZ/4, NULL, NULL);
-                rc = l_wait_event(req->rq_reply_waitq, !req->rq_want_ack,
-                                  &lwi);
-                CDEBUG (D_HA, "Retrying req %p: %d\n", req, rc);
-                /* NB go back and test rq_want_ack with locking, to ensure
-                 * if ACK callback happened, it has completed stopped
-                 * referencing this req. */
-                goto again;
-        }
-}
-
-void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
+int
+target_send_reply_msg (struct ptlrpc_request *req, int rc, int fail_id)
 {
-        int i;
-        int netrc;
-        unsigned long flags;
-        struct ptlrpc_req_ack_lock *ack_lock;
-        struct l_wait_info lwi = { 0 };
-        wait_queue_t commit_wait;
-        struct obd_device *obd =
-                req->rq_export ? req->rq_export->exp_obd : NULL;
-        struct obd_export *exp = NULL;
-
-        if (req->rq_export) {
-                for (i = 0; i < REQ_MAX_ACK_LOCKS; i++) {
-                        if (req->rq_ack_locks[i].mode) {
-                                exp = req->rq_export;
-                                break;
+        if (OBD_FAIL_CHECK(fail_id | OBD_FAIL_ONCE)) {
+                obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED;
+                DEBUG_REQ(D_ERROR, req, "dropping reply");
+                /* NB this does _not_ send with ACK disabled, to simulate
+                 * sending OK, but timing out for the ACK */
+                if (req->rq_reply_state != NULL) {
+                        if (!req->rq_reply_state->rs_difficult) {
+                                lustre_free_reply_state (req->rq_reply_state);
+                                req->rq_reply_state = NULL;
+                        } else {
+                                struct ptlrpc_service *svc =
+                                        req->rq_rqbd->rqbd_srv_ni->sni_service;
+                                atomic_inc(&svc->srv_outstanding_replies);
                         }
                 }
+                return (-ECOMM);
         }
 
-        if (exp) {
-                exp->exp_outstanding_reply = req;
-                spin_lock_irqsave (&req->rq_lock, flags);
-                req->rq_want_ack = 1;
-                spin_unlock_irqrestore (&req->rq_lock, flags);
-        }
-
-        if (!OBD_FAIL_CHECK(fail_id | OBD_FAIL_ONCE)) {
-                if (rc == 0) {
-                        DEBUG_REQ(D_NET, req, "sending reply");
-                        netrc = ptlrpc_reply(req);
-                } else if (rc == -ENOTCONN) {
-                        DEBUG_REQ(D_HA, req, "processing error (%d)", rc);
-                        netrc = ptlrpc_error(req);
-                } else {
-                        DEBUG_REQ(D_ERROR, req, "processing error (%d)", rc);
-                        netrc = ptlrpc_error(req);
+        if (rc) {
+                DEBUG_REQ(D_ERROR, req, "processing error (%d)", rc);
+                if (req->rq_reply_state == NULL) {
+                        rc = lustre_pack_reply (req, 0, NULL, NULL);
+                        if (rc != 0) {
+                                CERROR ("can't allocate reply\n");
+                                return (rc);
+                        }
                 }
+                req->rq_type = PTL_RPC_MSG_ERR;
         } else {
-                obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED;
-                DEBUG_REQ(D_ERROR, req, "dropping reply");
-                if (req->rq_repmsg) {
-                        OBD_FREE(req->rq_repmsg, req->rq_replen);
-                        req->rq_repmsg = NULL;
-                }
-                init_waitqueue_head(&req->rq_reply_waitq);
-                netrc = 0;
+                DEBUG_REQ(D_NET, req, "sending reply");
         }
+        
+        return (ptlrpc_send_reply(req, 1));
+}
 
-        /* a failed send simulates the callbacks */
-        LASSERT(netrc == 0 || req->rq_want_ack == 0);
-        if (exp == NULL) {
-                LASSERT(req->rq_want_ack == 0);
+void 
+target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
+{
+        int                        netrc;
+        unsigned long              flags;
+        struct ptlrpc_reply_state *rs;
+        struct obd_device         *obd;
+        struct obd_export         *exp;
+        struct ptlrpc_srv_ni      *sni;
+        struct ptlrpc_service     *svc;
+
+        sni = req->rq_rqbd->rqbd_srv_ni;
+        svc = sni->sni_service;
+        
+        rs = req->rq_reply_state;
+        if (rs == NULL || !rs->rs_difficult) {
+                /* The easy case; no notifiers and reply_out_callback()
+                 * cleans up (i.e. we can't look inside rs after a
+                 * successful send) */
+                netrc = target_send_reply_msg (req, rc, fail_id);
+
+                LASSERT (netrc == 0 || req->rq_reply_state == NULL);
                 return;
         }
-        LASSERT(obd != NULL);
-
-        init_waitqueue_entry(&commit_wait, current);
-        add_wait_queue(&obd->obd_commit_waitq, &commit_wait);
-        rc = l_wait_event(req->rq_reply_waitq,
-                          !req->rq_want_ack || req->rq_resent ||
-                          req->rq_transno <= obd->obd_last_committed, &lwi);
-        remove_wait_queue(&obd->obd_commit_waitq, &commit_wait);
-
-        spin_lock_irqsave (&req->rq_lock, flags);
-        /* If we got here because the ACK callback ran, this acts as a
-         * barrier to ensure the callback completed the wakeup. */
-        spin_unlock_irqrestore (&req->rq_lock, flags);
-
-        /* If we committed the transno already, then we might wake up before
-         * the ack arrives.  We need to stop waiting for the ack before we can
-         * reuse this request structure.  We are guaranteed by this point that
-         * this cannot abort the sending of the actual reply.*/
-        ptlrpc_abort_reply(req);
-
-        if (req->rq_resent) {
-                DEBUG_REQ(D_HA, req, "resent: not cancelling locks");
-                return;
+
+        /* must be an export if locks saved */
+        LASSERT (req->rq_export != NULL);
+        /* req/reply consistent */
+        LASSERT (rs->rs_srv_ni == sni);
+
+        /* "fresh" reply */
+        LASSERT (!rs->rs_scheduled);
+        LASSERT (!rs->rs_scheduled_ever);
+        LASSERT (!rs->rs_handled);
+        LASSERT (!rs->rs_on_net);
+        LASSERT (rs->rs_export == NULL);
+        LASSERT (list_empty(&rs->rs_obd_list));
+        LASSERT (list_empty(&rs->rs_exp_list));
+
+        exp = class_export_get (req->rq_export);
+        obd = exp->exp_obd;
+
+        /* disable reply scheduling onto srv_reply_queue while I'm setting up */
+        rs->rs_scheduled = 1;
+        rs->rs_on_net    = 1;
+        rs->rs_xid       = req->rq_xid;
+        rs->rs_transno   = req->rq_transno;
+        rs->rs_export    = exp;
+        
+        spin_lock_irqsave (&obd->obd_uncommitted_replies_lock, flags);
+
+        if (rs->rs_transno > obd->obd_last_committed) {
+                /* not yet committed */
+                list_add_tail (&rs->rs_obd_list, 
+                               &obd->obd_uncommitted_replies);
         }
 
-        LASSERT(rc == 0);
-        DEBUG_REQ(D_HA, req, "cancelling locks for %s",
-                  req->rq_want_ack ? "commit" : "ack");
+        spin_unlock (&obd->obd_uncommitted_replies_lock);
+        spin_lock (&exp->exp_lock);
 
-        exp->exp_outstanding_reply = NULL;
+        list_add_tail (&rs->rs_exp_list, &exp->exp_outstanding_replies);
 
-        for (ack_lock = req->rq_ack_locks, i = 0;
-             i < REQ_MAX_ACK_LOCKS; i++, ack_lock++) {
-                if (!ack_lock->mode)
-                        continue;
-                ldlm_lock_decref(&ack_lock->lock, ack_lock->mode);
+        spin_unlock_irqrestore (&exp->exp_lock, flags);
+
+        netrc = target_send_reply_msg (req, rc, fail_id);
+
+        spin_lock_irqsave (&svc->srv_lock, flags);
+
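+        /* this reply now counts as an outstanding 'difficult' reply for the service */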
+        svc->srv_n_difficult_replies++;
+
+        if (netrc != 0) /* error sending: reply is off the net */
+                rs->rs_on_net = 0;
+
+        if (!rs->rs_on_net ||                   /* some notifier */
+            list_empty(&rs->rs_exp_list) ||     /* completed already */
+            list_empty(&rs->rs_obd_list)) {
+                list_add_tail (&rs->rs_list, &svc->srv_reply_queue);
+                wake_up (&svc->srv_waitq);
+        } else {
+                list_add (&rs->rs_list, &sni->sni_active_replies);
+                rs->rs_scheduled = 0;           /* allow notifier to schedule */
         }
+
+        spin_unlock_irqrestore (&svc->srv_lock, flags);
 }
 
 int target_handle_ping(struct ptlrpc_request *req)
 {
         return lustre_pack_reply(req, 0, NULL, NULL);
 }
-
-void *ldlm_put_lock_into_req(struct ptlrpc_request *req,
-                                struct lustre_handle *lock, int mode)
-{
-        int i;
-
-        for (i = 0; i < REQ_MAX_ACK_LOCKS; i++) {
-                if (req->rq_ack_locks[i].mode)
-                        continue;
-                CDEBUG(D_HA, "saving lock "LPX64" in req %p ack_lock[%d]\n",
-                       lock->cookie, req, i);
-                memcpy(&req->rq_ack_locks[i].lock, lock, sizeof(*lock));
-                req->rq_ack_locks[i].mode = mode;
-                return &req->rq_ack_locks[i];
-        }
-        CERROR("no space for lock in struct ptlrpc_request\n");
-        LBUG();
-        return NULL;
-}
index 9ed2684..5fde33e 100644 (file)
@@ -890,7 +890,28 @@ static int reprocess_one_queue(struct ldlm_resource *res, void *closure)
 
 void ldlm_reprocess_all_ns(struct ldlm_namespace *ns)
 {
-        (void)ldlm_namespace_foreach_res(ns, reprocess_one_queue, NULL);
+        int i, rc;
+
+        l_lock(&ns->ns_lock);
+        for (i = 0; i < RES_HASH_SIZE; i++) {
+                struct list_head *tmp, *next;
+                list_for_each_safe(tmp, next, &(ns->ns_hash[i])) {
+                        struct ldlm_resource *res =
+                                list_entry(tmp, struct ldlm_resource, lr_hash);
+
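+                        /* hold a reference so the resource cannot disappear
+                         * while ns_lock is dropped around the reprocess call */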
+                        ldlm_resource_getref(res);
+                        l_unlock(&ns->ns_lock);
+                        rc = reprocess_one_queue(res, NULL);
+                        l_lock(&ns->ns_lock);
+                        next = tmp->next;
+                        ldlm_resource_putref(res);
+                        if (rc == LDLM_ITER_STOP)
+                                GOTO(out, rc);
+                }
+        }
+ out:
+        l_unlock(&ns->ns_lock);
+        EXIT;
 }
 
 void ldlm_reprocess_all(struct ldlm_resource *res)
index e1fe658..2d7946b 100644 (file)
@@ -389,23 +389,17 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock,
         req->rq_timeout = 2; /* 2 second timeout for initial AST reply */
         rc = ptlrpc_queue_wait(req);
         if (rc == -ETIMEDOUT || rc == -EINTR) {
-#ifdef __KERNEL__
-                ldlm_del_waiting_lock(lock);
-                ldlm_failed_ast(lock, rc, "blocking");
-#else
-                /* XXX
-                 * Here we treat all clients as liblustre. When BLOCKING AST
-                 * timeout we don't evicting the client and only cancel
-                 * the lock.
-                 * restore to orignial implementation later!!!
-                 * XXX
-                 */
-                CERROR("BLOCKING AST to client (nid "LPU64") timeout, "
-                       "simply cancel lock 0x%p\n",
-                       req->rq_peer.peer_nid, lock);
-                ldlm_lock_cancel(lock);
-                rc = -ERESTART;
-#endif
+                LASSERT(lock->l_export);
+                if (lock->l_export->exp_libclient) {
+                        CDEBUG(D_HA, "BLOCKING AST to liblustre client (nid "
+                               LPU64") timeout, simply cancel lock 0x%p\n",
+                               req->rq_peer.peer_nid, lock);
+                        ldlm_lock_cancel(lock);
+                        rc = -ERESTART;
+                } else {
+                        ldlm_del_waiting_lock(lock);
+                        ldlm_failed_ast(lock, rc, "blocking");
+                }
         } else if (rc) {
                 if (rc == -EINVAL)
                         CDEBUG(D_DLMTRACE, "client (nid "LPU64") returned %d "
@@ -1145,9 +1139,8 @@ static int ldlm_setup(void)
 #endif
 
         ldlm->ldlm_cb_service =
-                ptlrpc_init_svc(LDLM_NEVENTS, LDLM_NBUFS, LDLM_BUFSIZE,
-                                LDLM_MAXREQSIZE, LDLM_CB_REQUEST_PORTAL,
-                                LDLM_CB_REPLY_PORTAL,
+                ptlrpc_init_svc(LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE,
+                                LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
                                 ldlm_callback_handler, "ldlm_cbd",
                                 ldlm_svc_proc_dir);
 
@@ -1157,8 +1150,8 @@ static int ldlm_setup(void)
         }
 
         ldlm->ldlm_cancel_service =
-                ptlrpc_init_svc(LDLM_NEVENTS, LDLM_NBUFS, LDLM_BUFSIZE,
-                                LDLM_MAXREQSIZE, LDLM_CANCEL_REQUEST_PORTAL,
+                ptlrpc_init_svc(LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE, 
+                                LDLM_CANCEL_REQUEST_PORTAL,
                                 LDLM_CANCEL_REPLY_PORTAL,
                                 ldlm_cancel_handler, "ldlm_canceld",
                                 ldlm_svc_proc_dir);
@@ -1404,4 +1397,3 @@ EXPORT_SYMBOL(target_queue_recovery_request);
 EXPORT_SYMBOL(target_handle_ping);
 EXPORT_SYMBOL(target_handle_disconnect);
 EXPORT_SYMBOL(target_queue_final_reply);
-EXPORT_SYMBOL(ldlm_put_lock_into_req);
index ef4fa2f..6622485 100644 (file)
@@ -1,63 +1,46 @@
 ## Liblustre executables & libraries Makefile
 DEFS=
 
+SUBDIRS = . tests
+
 CFLAGS := -g -Wall -I$(top_srcdir)/utils -I$(top_srcdir)/portals/include \
-          -I$(top_srcdir)/portals/unals -I$(SYSIO)/include \
-          -I/opt/lam/include -L/opt/lam/lib
+          -I$(top_srcdir)/portals/unals -I$(SYSIO)/include
 
-KFLAGS:=
 CPPFLAGS = $(HAVE_EFENCE) -D_LARGEFILE64_SOURCE=1
 LIBS = $(LIBEFENCE)
 
-## lustre components libs
-LLIBS := ./libllite.a \
-         ../lov/liblov.a \
-         ../obdecho/libobdecho.a \
-         ../osc/libosc.a \
-         ../mdc/libmdc.a \
-         ../ldlm/libldlm.a \
-         ../ptlrpc/libptlrpc.a \
-         ../obdclass/liblustreclass.a \
-        ../lvfs/liblvfs.a
-
-## portals components libs
-PTLLIBS := ../portals/utils/libptlctl.a \
-          ../portals/unals/libtcpnal.a \
-           ../portals/portals/libportals.a
-
-## sysio components libs
-SYSIOLIBS := $(SYSIO)/drivers/native/libsysio_native.a \
+LUSTRE_LIBS = libllite.a \
+              $(top_srcdir)/lov/liblov.a \
+              $(top_srcdir)/obdecho/libobdecho.a \
+              $(top_srcdir)/osc/libosc.a \
+              $(top_srcdir)/mdc/libmdc.a \
+              $(top_srcdir)/ptlrpc/libptlrpc.a \
+              $(top_srcdir)/obdclass/liblustreclass.a \
+              $(top_srcdir)/lvfs/liblvfs.a
+
+PTL_LIBS = $(top_srcdir)/portals/utils/libuptlctl.a \
+           $(top_srcdir)/portals/unals/libtcpnal.a \
+           $(top_srcdir)/portals/portals/libportals.a
+
+SYSIO_LIBS = $(SYSIO)/drivers/native/libsysio_native.a \
              $(SYSIO)/drivers/sockets/libsysio_sockets.a \
              $(SYSIO)/src/libsysio.a \
              $(SYSIO)/dev/stdfd/libsysio_stdfd.a
 
-LLIB_EXEC= $(PTLLIBS) $(SYSIOLIBS) -lpthread
+#SYSIO_LIBS = $(SYSIO)/lib/libsysio.a
 
-lib_LIBRARIES = 
-noinst_LIBRARIES = libllite.a libtestcommon.a
-libllite_a_SOURCES = llite_lib.c super.c namei.c rw.c file.c
-libtestcommon_a_SOURCES = test_common.c
+lib_LIBRARIES = liblustre.a
+noinst_LIBRARIES = libllite.a
 
-bin_PROGRAMS = libtest lltest recovery_small replay_single #test_lock_cancel
+libllite_a_SOURCES = llite_lib.c super.c namei.c rw.c file.c dir.c
+libllite_a_CFLAGS = -fPIC
 
-libtest_SOURCES = libtest.c  ../utils/parser.c ../utils/obd.c ../utils/lustre_cfg.c
-libtest_LDADD := $(LLIBS) $(PTLLIBS) \
-                 $(LIBREADLINE) -lpthread 
+# for make rpms -- need cleanup
+liblustre_a_SOURCES = llite_lib.c super.c namei.c rw.c file.c dir.c
+liblustre_a_CFLAGS = -fPIC
 
-liblustre.a : libllite.a
+liblustre.a : $(LUSTRE_LIBS) $(PTL_LIBS) $(SYSIO_LIBS)
        $(shell ./genlib.sh $(SYSIO) $(AR) $(LINK))
 
-lltest_SOURCES = lltest.c
-lltest_LDADD := ./libtestcommon.a $(LLIBS) $(LLIB_EXEC) $(LIBREADLINE)
-
-recovery_small_SOURCES = recovery_small.c
-recovery_small_LDADD := ./libtestcommon.a $(LLIBS) $(LLIB_EXEC) $(LIBREADLINE)
-
-replay_single_SOURCES = replay_single.c
-replay_single_LDADD := ./libtestcommon.a $(LLIBS) $(LLIB_EXEC) $(LIBREADLINE)
-
-#test_lock_cancel_SOURCES = test_lock_cancel.c
-#test_lock_cancel_LDADD :=  $(LLIBS) $(LLIB_EXEC) -lmpi -llam
-
 include $(top_srcdir)/Rules
 
diff --git a/lustre/liblustre/dir.c b/lustre/liblustre/dir.c
new file mode 100644 (file)
index 0000000..cceb1e0
--- /dev/null
@@ -0,0 +1,220 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Lustre Light Super operations
+ *
+ *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/fcntl.h>
+#include <sys/queue.h>
+
+#include <sysio.h>
+#include <fs.h>
+#include <mount.h>
+#include <inode.h>
+#include <file.h>
+
+#undef LIST_HEAD
+
+#include <linux/types.h>
+#include <linux/dirent.h>
+#include <linux/unistd.h>
+
+#include "llite_lib.h"
+
+static int llu_dir_do_readpage(struct inode *inode, struct page *page)
+{
+        struct llu_inode_info *lli = llu_i2info(inode);
+        struct llu_sb_info *sbi = llu_i2sbi(inode);
+        struct ll_fid mdc_fid;
+        __u64 offset;
+        int rc = 0;
+        struct ptlrpc_request *request;
+        struct lustre_handle lockh;
+        struct mds_body *body;
+        struct lookup_intent it = { .it_op = IT_READDIR };
+        struct mdc_op_data data;
+        struct obd_device *obddev = class_exp2obd(sbi->ll_mdc_exp);
+        struct ldlm_res_id res_id =
+                { .name = {lli->lli_st_ino, (__u64)lli->lli_st_generation} };
+        ENTRY;
+
+        if ((lli->lli_st_size + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT <= page->index) {
+                /* XXX why do we need this exactly, and why do we think that
+                 *     an all-zero directory page is useful?
+                 */
+                CERROR("memsetting dir page %lu to zero (size %lld)\n",
+                       page->index, lli->lli_st_size);
+                memset(page->addr, 0, PAGE_CACHE_SIZE);
+                GOTO(readpage_out, rc);
+        }
+
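+        /* use a cached PR lock on the directory if one is held; otherwise
+         * enqueue a readdir intent lock before fetching the page */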
+        rc = ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED,
+                             &res_id, LDLM_PLAIN, NULL, 0, LCK_PR, &lockh);
+        if (!rc) {
+                llu_prepare_mdc_op_data(&data, inode, NULL, NULL, 0, 0);
+
+                rc = mdc_enqueue(sbi->ll_mdc_exp, LDLM_PLAIN, &it, LCK_PR,
+                                 &data, &lockh, NULL, 0,
+                                 ldlm_completion_ast, llu_mdc_blocking_ast,
+                                 inode);
+                request = (struct ptlrpc_request *)it.d.lustre.it_data;
+                if (request)
+                        ptlrpc_req_finished(request);
+                if (rc < 0) {
+                        CERROR("lock enqueue: err: %d\n", rc);
+                        RETURN(rc);
+                }
+        }
+        ldlm_lock_dump_handle(D_OTHER, &lockh);
+
+        mdc_pack_fid(&mdc_fid, lli->lli_st_ino, lli->lli_st_generation, S_IFDIR);
+
+        offset = page->index << PAGE_SHIFT;
+        rc = mdc_readpage(sbi->ll_mdc_exp, &mdc_fid,
+                          offset, page, &request);
+        if (!rc) {
+                body = lustre_msg_buf(request->rq_repmsg, 0, sizeof (*body));
+                LASSERT (body != NULL);         /* checked by mdc_readpage() */
+                LASSERT_REPSWABBED (request, 0); /* swabbed by mdc_readpage() */
+
+                lli->lli_st_size = body->size;
+        }
+        ptlrpc_req_finished(request);
+        EXIT;
+
+ readpage_out:
+        ldlm_lock_decref(&lockh, LCK_PR);
+        return rc;
+}
+
+static struct page *llu_dir_read_page(struct inode *ino, int pgidx)
+{
+        struct page *page;
+        int rc;
+        ENTRY;
+
+        page = alloc_page(0);
+        if (!page) {
+                CERROR("alloc page failed\n");
+                RETURN(ERR_PTR(-ENOMEM));
+        }
+        page->index = pgidx;
+
+        rc = llu_dir_do_readpage(ino, page);
+        if (rc) {
+                free_page(page);
+                RETURN(ERR_PTR(rc));
+        }
+
+        return page;
+}
+
+#define NAME_OFFSET(de) ((int) ((de)->d_name - (char *) (de)))
+#define ROUND_UP64(x)   (((x)+sizeof(__u64)-1) & ~(sizeof(__u64)-1))
+
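+/* pack a single entry into the caller's buffer as a struct dirent64; each
+ * record length is rounded up to a multiple of sizeof(__u64) so the next
+ * entry stays 8-byte aligned */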
+static int filldir(char *buf, int buflen,
+                   const char *name, int namelen, loff_t offset,
+                   ino_t ino, unsigned int d_type, int *filled)
+{
+        struct dirent64 *dirent = (struct dirent64 *) (buf + *filled);
+        int reclen = ROUND_UP64(NAME_OFFSET(dirent) + namelen + 1);
+
+        /* check overflow */
+        if ((*filled + reclen) > buflen)
+                return 1;
+
+        dirent->d_ino = ino;
+        dirent->d_off = offset;
+        dirent->d_reclen = reclen;
+        dirent->d_type = (unsigned short) d_type;
+        memcpy(dirent->d_name, name, namelen);
+        dirent->d_name[namelen] = 0;
+
+        *filled += reclen;
+
+        return 0;
+}
+
+ssize_t llu_iop_getdirentries(struct inode *ino, char *buf, size_t nbytes,
+                              _SYSIO_OFF_T *basep)
+{
+        struct llu_inode_info *lli = llu_i2info(ino);
+        loff_t pos = *basep, offset;
+        int maxpages, pgidx, filled = 0;
+        ENTRY;
+
+        if (pos == -1)
+                pos = lli->lli_dir_pos;
+
+        maxpages = lli->lli_st_size >> PAGE_CACHE_SHIFT;
+        pgidx = pos >> PAGE_CACHE_SHIFT;
+        offset = pos & ~PAGE_CACHE_MASK;
+
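+        /* walk the directory a page at a time from the saved position,
+         * packing entries until the caller's buffer would overflow */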
+        for ( ; pgidx < maxpages ; pgidx++, offset = 0) {
+                struct page *page;
+                struct ext2_dirent *de;
+                char *addr, *limit;
+
+                page = llu_dir_read_page(ino, pgidx);
+                if (IS_ERR(page))
+                        continue;
+
+                /* size might have been updated by mdc_readpage */
+                maxpages = lli->lli_st_size >> PAGE_CACHE_SHIFT;
+
+                /* fill in buffer */
+                addr = page->addr;
+                limit = addr + PAGE_CACHE_SIZE - EXT2_DIR_REC_LEN(1);
+                de = (struct ext2_dirent *) (addr + offset);
+
+                for ( ; (char*) de <= limit; de = ext2_next_entry(de)) {
+                        if (de->inode) {
+                                int over;
+                                unsigned char d_type = 0;
+
+                                /* XXX handle type, etc here */
+
+                                offset = (char*) de - addr;
+                                over =  filldir(buf, nbytes, de->name, de->name_len,
+                                                (pgidx << PAGE_CACHE_SHIFT) | offset,
+                                                le32_to_cpu(de->inode), d_type, &filled);
+                                if (over) {
+                                        free_page(page);
+                                        GOTO(done, 0);
+                                }
+                        }
+                }
+                
+                free_page(page);
+        }
+done:
+        lli->lli_dir_pos = pgidx << PAGE_CACHE_SHIFT | offset;
+        *basep = lli->lli_dir_pos;
+        RETURN(filled);
+}
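
As a rough illustration only (the directory inode, buffer size, and the printf are placeholders, not part of this change), a caller could drain a directory through the new entry point like this: llu_iop_getdirentries() packs struct dirent64 records into the buffer, returns the number of bytes filled, and advances *basep to the position saved in lli_dir_pos.

        char buf[4096];
        _SYSIO_OFF_T base = -1;         /* -1 means resume from lli_dir_pos */
        ssize_t nread;

        while ((nread = llu_iop_getdirentries(dir_inode, buf, sizeof(buf), &base)) > 0) {
                struct dirent64 *d;
                ssize_t off;

                for (off = 0; off < nread; off += d->d_reclen) {
                        d = (struct dirent64 *)(buf + off);
                        printf("%s\n", d->d_name);
                }
        }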
index 58339c8..de3f35e 100644 (file)
@@ -36,6 +36,8 @@
 #include <inode.h>
 #include <file.h>
 
+#undef LIST_HEAD
+
 #include "llite_lib.h"
 
 void llu_prepare_mdc_op_data(struct mdc_op_data *data,
@@ -89,105 +91,6 @@ void obdo_refresh_inode(struct inode *dst,
                 lli->lli_st_blocks = src->o_blocks;
 }
 
-#if 0
-static int llu_create_obj(struct lustre_handle *conn, struct inode *inode,
-                          struct lov_stripe_md *lsm)
-{
-        struct ptlrpc_request *req = NULL;
-        struct llu_inode_info *lli = llu_i2info(inode);
-        struct lov_mds_md *lmm = NULL;
-        struct obdo *oa;
-        struct iattr iattr;
-        struct mdc_op_data op_data;
-        struct obd_trans_info oti = { 0 };
-        int rc, err, lmm_size = 0;;
-        ENTRY;
-
-        oa = obdo_alloc();
-        if (!oa)
-                RETURN(-ENOMEM);
-
-        LASSERT(S_ISREG(inode->i_mode));
-        oa->o_mode = S_IFREG | 0600;
-        oa->o_id = lli->lli_st_ino;
-        oa->o_generation = lli->lli_st_generation;
-        /* Keep these 0 for now, because chown/chgrp does not change the
-         * ownership on the OST, and we don't want to allow BA OST NFS
-         * users to access these objects by mistake.
-         */
-        oa->o_uid = 0;
-        oa->o_gid = 0;
-        oa->o_valid = OBD_MD_FLID | OBD_MD_FLGENER | OBD_MD_FLTYPE |
-                OBD_MD_FLMODE | OBD_MD_FLUID | OBD_MD_FLGID;
-
-        obdo_from_inode(oa, inode, OBD_MD_FLTYPE|OBD_MD_FLATIME|OBD_MD_FLMTIME|
-                        OBD_MD_FLCTIME |
-                        (llu_i2info(inode)->lli_st_size ? OBD_MD_FLSIZE : 0));
-
-        rc = obd_create(conn, oa, &lsm, &oti);
-        if (rc) {
-                CERROR("error creating objects for inode %lu: rc = %d\n",
-                       lli->lli_st_ino, rc);
-                if (rc > 0) {
-                        CERROR("obd_create returned invalid rc %d\n", rc);
-                        rc = -EIO;
-                }
-                GOTO(out_oa, rc);
-        }
-        obdo_refresh_inode(inode, oa, OBD_MD_FLBLKSZ);
-
-        LASSERT(lsm && lsm->lsm_object_id);
-        rc = obd_packmd(conn, &lmm, lsm);
-        if (rc < 0)
-                GOTO(out_destroy, rc);
-
-        lmm_size = rc;
-
-        /* Save the stripe MD with this file on the MDS */
-        memset(&iattr, 0, sizeof(iattr));
-        iattr.ia_valid = ATTR_FROM_OPEN;
-
-        llu_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0);
-
-        rc = mdc_setattr(&llu_i2sbi(inode)->ll_mdc_conn, &op_data,
-                         &iattr, lmm, lmm_size, oti.oti_logcookies,
-                         oti.oti_numcookies * sizeof(oti.oti_onecookie), &req);
-        ptlrpc_req_finished(req);
-
-        obd_free_diskmd(conn, &lmm);
-
-        /* If we couldn't complete mdc_open() and store the stripe MD on the
-         * MDS, we need to destroy the objects now or they will be leaked.
-         */
-        if (rc) {
-                CERROR("error: storing stripe MD for %lu: rc %d\n",
-                       lli->lli_st_ino, rc);
-                GOTO(out_destroy, rc);
-        }
-        lli->lli_smd = lsm;
-        lli->lli_maxbytes = lsm->lsm_maxbytes;
-
-        EXIT;
-out_oa:
-        oti_free_cookies(&oti);
-        obdo_free(oa);
-        return rc;
-
-out_destroy:
-        oa->o_id = lsm->lsm_object_id;
-        oa->o_valid = OBD_MD_FLID;
-        obdo_from_inode(oa, inode, OBD_MD_FLTYPE);
-
-        err = obd_destroy(conn, oa, lsm, NULL);
-        obd_free_memmd(conn, &lsm);
-        if (err) {
-                CERROR("error uncreating inode %lu objects: rc %d\n",
-                       lli->lli_st_ino, err);
-        }
-        goto out_oa;
-}
-#endif
-
 static int llu_local_open(struct llu_inode_info *lli, struct lookup_intent *it)
 {
         struct ptlrpc_request *req = it->d.lustre.it_data;
@@ -210,8 +113,6 @@ static int llu_local_open(struct llu_inode_info *lli, struct lookup_intent *it)
          * ll_mdc_close, so don't even try right now. */
         LASSERT(fd != NULL);
 
-        memset(fd, 0, sizeof(*fd));
-
         memcpy(&fd->fd_mds_och.och_fh, &body->handle, sizeof(body->handle));
         fd->fd_mds_och.och_magic = OBD_CLIENT_HANDLE_MAGIC;
         lli->lli_file_data = fd;
@@ -221,38 +122,6 @@ static int llu_local_open(struct llu_inode_info *lli, struct lookup_intent *it)
         RETURN(0);
 }
 
-#if 0
-static int llu_osc_open(struct lustre_handle *conn, struct inode *inode,
-                        struct lov_stripe_md *lsm)
-{
-        struct ll_file_data *fd = llu_i2info(inode)->lli_file_data;
-        struct obdo *oa;
-        int rc;
-        ENTRY;
-
-        oa = obdo_alloc();
-        if (!oa)
-                RETURN(-ENOMEM);
-        oa->o_id = lsm->lsm_object_id;
-        oa->o_mode = S_IFREG;
-        oa->o_valid = (OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLBLOCKS |
-                       OBD_MD_FLMTIME | OBD_MD_FLCTIME);
-        rc = obd_open(conn, oa, lsm, NULL, &fd->fd_ost_och);
-        if (rc)
-                GOTO(out, rc);
-
-        /* file->f_flags &= ~O_LOV_DELAY_CREATE; */
-        obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS | OBD_MD_FLMTIME |
-                      OBD_MD_FLCTIME);
-
-        EXIT;
-out:
-        obdo_free(oa);
-        return rc;
-}
-#endif
-
-
 int llu_iop_open(struct pnode *pnode, int flags, mode_t mode)
 {
         struct inode *inode = pnode->p_base->pb_ino;
@@ -264,19 +133,15 @@ int llu_iop_open(struct pnode *pnode, int flags, mode_t mode)
         int rc = 0;
         ENTRY;
 
+        /* don't do anything for '/' */
+        if (llu_is_root_inode(inode))
+                RETURN(0);
+
         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", lli->lli_st_ino);
         LL_GET_INTENT(inode, it);
 
         if (!it->d.lustre.it_disposition) {
-#if 0
-                struct lookup_intent oit = { .it_op = IT_OPEN,
-                                             .it_flags = file->f_flags };
-                it = &oit;
-                rc = ll_intent_file_open(file, NULL, 0, it);
-                if (rc)
-                        GOTO(out_release, rc);
-#endif
-                CERROR("fixme!!\n");
+                LBUG();
         }
 
         rc = it_open_error(DISP_OPEN_OPEN, it);
@@ -298,17 +163,6 @@ int llu_iop_open(struct pnode *pnode, int flags, mode_t mode)
                         CDEBUG(D_INODE, "object creation was delayed\n");
                         GOTO(out_release, rc);
                 }
-#if 0
-                if (!lli->lli_smd) {
-                        rc = llu_create_obj(conn, inode, NULL);
-                        if (rc)
-                                GOTO(out_close, rc);
-                } else {
-                        CERROR("warning: stripe already set on ino %lu\n",
-                               lli->lli_st_ino);
-                }
-                lsm = lli->lli_smd;
-#endif
         }
         fd->fd_flags &= ~O_LOV_DELAY_CREATE;
 
@@ -451,12 +305,8 @@ int llu_file_release(struct inode *inode)
         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%lu\n", lli->lli_st_ino,
                lli->lli_st_generation);
 
-        /* FIXME need add this check later. how to find the root pnode? */
-#if 0
-        /* don't do anything for / */
-        if (inode->i_sb->s_root == file->f_dentry)
-                RETURN(0);
-#endif
+        /* XXX don't do anything for '/'. but how to find the root pnode? */
+
         /* still opened by others? */
         if (--lli->lli_open_count)
                 RETURN(0);
index 52b4b88..f371650 100755 (executable)
@@ -1,4 +1,5 @@
 #!/bin/bash
+#set -xv
 
 #
 # This script is to generate lib lustre library as a whole. It will leave
@@ -8,48 +9,80 @@
 
 AR=/usr/bin/ar
 LD=/usr/bin/ld
+RANLIB=/usr/bin/ranlib
 
 CWD=`pwd`
 
 SYSIO=$1
 
+#if [ ! -f $SYSIO/lib/libsysio.a ]; then
+#  echo "ERROR: $SYSIO/lib/libsysio.a dosen't exist"
+#  exit 1
+#fi
+#
+# do cleanup at first
+#rm -f liblustre.so
+
 ALL_OBJS=
 
 build_obj_list() {
   _objs=`$AR -t $1/$2`
   for _lib in $_objs; do
-  ALL_OBJS=$ALL_OBJS"$1/$_lib ";
+    ALL_OBJS=$ALL_OBJS"$1/$_lib ";
   done;
 }
 
+#
+# special treatment for libsysio
+#
+#sysio_tmp=$CWD/sysio_tmp_`date +%s`
+#build_sysio_obj_list() {
+#  _objs=`$AR -t $1`
+#  mkdir -p $sysio_tmp
+#  $AR -x $1
+#  mv $_objs $sysio_tmp
+#  for _lib in $_objs; do
+#    ALL_OBJS=$ALL_OBJS"$sysio_tmp/$_lib ";
+#  done
+#}
+
 # lustre components libs
 build_obj_list . libllite.a
 build_obj_list ../lov liblov.a
 build_obj_list ../obdecho libobdecho.a
 build_obj_list ../osc libosc.a
 build_obj_list ../mdc libmdc.a
-build_obj_list ../ldlm libldlm.a
 build_obj_list ../ptlrpc libptlrpc.a
 build_obj_list ../obdclass liblustreclass.a
 build_obj_list ../lvfs liblvfs.a
 
 # portals components libs
-build_obj_list ../portals/utils libptlctl.a
+build_obj_list ../portals/utils libuptlctl.a
 build_obj_list ../portals/unals libtcpnal.a
 build_obj_list ../portals/portals libportals.a
 
+# create static lib lsupport
+rm -f $CWD/liblsupport.a
+$AR -cru $CWD/liblsupport.a $ALL_OBJS
+$RANLIB $CWD/liblsupport.a
+
 # libsysio components libs
 build_obj_list $SYSIO/drivers/native libsysio_native.a
 build_obj_list $SYSIO/drivers/sockets libsysio_sockets.a
 build_obj_list $SYSIO/src libsysio.a
 build_obj_list $SYSIO/dev/stdfd libsysio_stdfd.a
+#
+#build_sysio_obj_list $SYSIO/lib/libsysio.a
+#
 
-
-# create static lib
+# create static lib lustre
 rm -f $CWD/liblustre.a
-$AR -r $CWD/liblustre.a $ALL_OBJS
+$AR -cru $CWD/liblustre.a $ALL_OBJS
+$RANLIB $CWD/liblustre.a
 
-# create shared lib
+# create shared lib lustre
 rm -f $CWD/liblustre.so
 $LD -shared -o $CWD/liblustre.so -init __liblustre_setup_ -fini __liblustre_cleanup_ \
        $ALL_OBJS -lpthread
+
+#rm -rf $sysio_tmp
diff --git a/lustre/liblustre/libtest.c b/lustre/liblustre/libtest.c
deleted file mode 100644 (file)
index b956347..0000000
+++ /dev/null
@@ -1,246 +0,0 @@
-#include <stdio.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <arpa/inet.h>
-
-#include <portals/api-support.h> /* needed for ptpctl.h */
-#include <portals/ptlctl.h>    /* needed for parse_dump */
-
-
-#include <liblustre.h>
-#include <linux/obd.h>
-#include <linux/obd_class.h>
-#include <procbridge.h>
-
-#define LIBLUSTRE_TEST 1
-#include "../utils/lctl.c"
-
-struct ldlm_namespace;
-struct ldlm_res_id;
-struct obd_import;
-
-void *inter_module_get(char *arg)
-{
-        if (!strcmp(arg, "tcpnal_ni"))
-                return &tcpnal_ni;
-        else if (!strcmp(arg, "ldlm_cli_cancel_unused"))
-                return ldlm_cli_cancel_unused;
-        else if (!strcmp(arg, "ldlm_namespace_cleanup"))
-                return ldlm_namespace_cleanup;
-        else if (!strcmp(arg, "ldlm_replay_locks"))
-                return ldlm_replay_locks;
-        else
-                return NULL;
-}
-
-/* XXX move to proper place */
-char *portals_nid2str(int nal, ptl_nid_t nid, char *str)
-{
-        switch(nal){
-        case TCPNAL:
-                /* userspace NAL */
-        case SOCKNAL:
-                sprintf(str, "%u:%d.%d.%d.%d", (__u32)(nid >> 32),
-                        HIPQUAD(nid));
-                break;
-        case QSWNAL:
-        case GMNAL:
-        case IBNAL:
-        case SCIMACNAL:
-                sprintf(str, "%u:%u", (__u32)(nid >> 32), (__u32)nid);
-                break;
-        default:
-                return NULL;
-        }
-        return str;
-}
-
-ptl_handle_ni_t         tcpnal_ni;
-
-struct pingcli_args {
-        ptl_nid_t mynid;
-        ptl_nid_t nid;
-       ptl_pid_t port;
-        int count;
-        int size;
-};
-
-struct task_struct *current;
-
-struct obd_class_user_state ocus;
-
-/* portals interfaces */
-ptl_handle_ni_t *
-kportal_get_ni (int nal)
-{
-        switch (nal)
-        {
-        case SOCKNAL:
-                return &tcpnal_ni;
-        default:
-                return NULL;
-        }
-}
-
-inline void
-kportal_put_ni (int nal)
-{
-        return;
-}
-
-int
-kportal_nal_cmd(struct portals_cfg *pcfg)
-{
-#if 0
-        __u32 nal = pcfg->pcfg_nal;
-        int rc = -EINVAL;
-
-        ENTRY;
-
-        down(&nal_cmd_sem);
-        if (nal > 0 && nal <= NAL_MAX_NR && nal_cmd[nal].nch_handler) {
-                CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal, 
-                       pcfg->pcfg_command);
-                rc = nal_cmd[nal].nch_handler(pcfg, nal_cmd[nal].nch_private);
-        }
-        up(&nal_cmd_sem);
-        RETURN(rc);
-#else
-        CERROR("empty function!!!\n");
-        return 0;
-#endif
-}
-
-int init_current(int argc, char **argv)
-{ 
-        current = malloc(sizeof(*current));
-        strncpy(current->comm, argv[0], sizeof(current->comm));
-        current->pid = getpid();
-       return 0;
-}
-
-ptl_nid_t tcpnal_mynid;
-
-int init_lib_portals()
-{
-        int rc;
-
-        PtlInit();
-        rc = PtlNIInit(procbridge_interface, 0, 0, 0, &tcpnal_ni);
-        if (rc != 0) {
-                CERROR("ksocknal: PtlNIInit failed: error %d\n", rc);
-                PtlFini();
-                RETURN (rc);
-        }
-        PtlNIDebug(tcpnal_ni, ~0);
-        return rc;
-}
-
-extern int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, unsigned long arg);
-
-
-int lib_ioctl_nalcmd(int dev_id, int opc, void * ptr)
-{
-        struct portal_ioctl_data *ptldata;
-
-        if (opc == IOC_PORTAL_NAL_CMD) {
-                ptldata = (struct portal_ioctl_data *) ptr;
-
-                if (ptldata->ioc_nal_cmd == NAL_CMD_REGISTER_MYNID) {
-                        tcpnal_mynid = ptldata->ioc_nid;
-                        printf("mynid: %u.%u.%u.%u\n",
-                                (unsigned)(tcpnal_mynid>>24) & 0xFF,
-                                (unsigned)(tcpnal_mynid>>16) & 0xFF,
-                                (unsigned)(tcpnal_mynid>>8) & 0xFF,
-                                (unsigned)(tcpnal_mynid) & 0xFF);
-                }
-        }
-
-       return (0);
-}
-
-int lib_ioctl(int dev_id, int opc, void * ptr)
-{
-
-       if (dev_id == OBD_DEV_ID) {
-               class_handle_ioctl(&ocus, opc, (unsigned long)ptr);
-
-               /* you _may_ need to call obd_ioctl_unpack or some
-                  other verification function if you want to use ioc
-                  directly here */
-#if 0
-               printf ("processing ioctl cmd: %x buf len: %d\n", 
-                       opc,  ioc->ioc_len);
-#endif
-       }
-       return (0);
-}
-
-int liblustre_ioctl(int dev_id, int opc, void *ptr)
-{
-       int   rc = -EINVAL;
-       
-       switch (dev_id) {
-       default:
-               fprintf(stderr, "Unexpected device id %d\n", dev_id);
-               abort();
-               break;
-               
-       case OBD_DEV_ID:
-               rc = class_handle_ioctl(&ocus, opc, (unsigned long)ptr);
-               break;
-       }
-
-       return rc;
-}
-
-extern int time_ptlwait1;
-extern int time_ptlwait2;
-extern int time_ptlselect;
-int main(int argc, char **argv) 
-{
-        char *config_file;
-
-        if (argc > 2) {
-                printf("Usage: %s [config_file]\n", argv[0]);
-                return 1;
-        }
-
-        if (argc == 2) {
-                config_file = argv[1];
-               argc--;
-               argv++;
-       } else
-                config_file = "/tmp/DUMP_FILE";
-
-        srand(time(NULL));
-
-        INIT_LIST_HEAD(&ocus.ocus_conns);
-#if 1
-       portal_debug = 0;
-       portal_subsystem_debug = 0;
-#endif
-       parse_dump(config_file, lib_ioctl_nalcmd);
-
-        if (init_current(argc, argv) ||
-           init_obdclass() || init_lib_portals() ||
-           ptlrpc_init() ||
-           ldlm_init() ||
-           mdc_init() ||
-           lov_init() ||
-           osc_init() ||
-           echo_client_init()) {
-               printf("error\n");
-               return 1;
-       }
-
-       parse_dump(config_file, lib_ioctl);
-
-       set_ioc_handler(liblustre_ioctl);
-#if 0  
-       portal_debug = -1;
-       portal_subsystem_debug = -1;
-#endif
-       return lctl_main(argc, argv);
-}
-
index baf564a..1cb6a37 100644 (file)
 #include <inode.h>
 #include <file.h>
 
+/* both sys/queue.h (which libsysio requires) and portals/lists.h define
+ * 'LIST_HEAD'; undefine it here to suppress the redefinition warnings
+ */
+#undef LIST_HEAD
+
 #include <portals/api-support.h> /* needed for ptpctl.h */
 #include <portals/ptlctl.h>    /* needed for parse_dump */
 #include <procbridge.h>
@@ -45,8 +50,7 @@
 
 
 ptl_handle_ni_t         tcpnal_ni;
-struct task_struct *current;
-struct obd_class_user_state ocus;
+struct task_struct     *current;
 
 /* portals interfaces */
 ptl_handle_ni_t *
@@ -141,7 +145,7 @@ int init_lib_portals()
         PtlInit();
         rc = PtlNIInit(procbridge_interface, 0, 0, 0, &tcpnal_ni);
         if (rc != 0) {
-                CERROR("ksocknal: PtlNIInit failed: error %d\n", rc);
+                CERROR("TCPNAL: PtlNIInit failed: error %d\n", rc);
                 PtlFini();
                 RETURN (rc);
         }
@@ -156,7 +160,7 @@ kportal_nal_cmd(struct portals_cfg *pcfg)
         return 0;
 }
 
-extern int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, unsigned long arg);
+extern int class_handle_ioctl(unsigned int cmd, unsigned long arg);
 
 int lib_ioctl_nalcmd(int dev_id, int opc, void * ptr)
 {
@@ -190,7 +194,7 @@ int lib_ioctl(int dev_id, int opc, void * ptr)
                 ioc->ioc_pbuf1 = ioc->ioc_bulk;
                 //XXX
 
-                rc = class_handle_ioctl(&ocus, opc, (unsigned long)ptr);
+                rc = class_handle_ioctl(opc, (unsigned long)ptr);
 
                 printf ("proccssing ioctl cmd: %x, rc %d\n", opc,  rc);
 
@@ -202,8 +206,6 @@ int lib_ioctl(int dev_id, int opc, void * ptr)
 
 int lllib_init(char *dumpfile)
 {
-        INIT_LIST_HEAD(&ocus.ocus_conns);
-
         if (!g_zconf) {
                 /* this parse only get my nid from config file
                  * before initialize portals
@@ -213,7 +215,7 @@ int lllib_init(char *dumpfile)
         } else {
                 /* XXX need setup mynid before tcpnal initialize */
                 tcpnal_mynid = ((uint64_t)getpid() << 32) | time(0);
-                printf("set tcpnal mynid: %016llx\n", tcpnal_mynid);
+                printf("LibLustre: TCPNAL NID: %016llx\n", tcpnal_mynid);
         }
 
         init_current("dummy");
@@ -239,7 +241,7 @@ static void llu_check_request()
 }
 #endif
 
-int liblustre_process_log(struct config_llog_instance *cfg)
+int liblustre_process_log(struct config_llog_instance *cfg, int allow_recov)
 {
         struct lustre_cfg lcfg;
         char  *peer = "MDS_PEER_UUID";
@@ -297,6 +299,11 @@ int liblustre_process_log(struct config_llog_instance *cfg)
         if (obd == NULL)
                 GOTO(out_cleanup, err = -EINVAL);
 
+        /* Disable initial recovery on this import */
+        err = obd_set_info(obd->obd_self_export,
+                           strlen("initial_recov"), "initial_recov",
+                           sizeof(allow_recov), &allow_recov);
+
         err = obd_connect(&mdc_conn, obd, &mdc_uuid);
         if (err) {
                 CERROR("cannot connect to %s: rc = %d\n",
@@ -374,10 +381,13 @@ int ll_parse_mount_target(const char *target, char **mdsnid,
 /* env variables */
 #define ENV_LUSTRE_MNTPNT               "LIBLUSTRE_MOUNT_POINT"
 #define ENV_LUSTRE_MNTTGT               "LIBLUSTRE_MOUNT_TARGET"
+#define ENV_LUSTRE_TIMEOUT              "LIBLUSTRE_TIMEOUT"
 #define ENV_LUSTRE_DUMPFILE             "LIBLUSTRE_DUMPFILE"
 
 extern int _sysio_native_init();
 
+extern unsigned int obd_timeout;
+
 /* global variables */
 int     g_zconf = 0;            /* zeroconf or dumpfile */
 char   *g_zconf_mdsname = NULL; /* mdsname, for zeroconf */
@@ -389,6 +399,7 @@ void __liblustre_setup_(void)
 {
         char *lustre_path = NULL;
         char *target = NULL;
+        char *timeout = NULL;
         char *dumpfile = NULL;
         char *root_driver = "native";
         char *lustre_driver = "llite";
@@ -397,7 +408,10 @@ void __liblustre_setup_(void)
 
        int err;
 
-        srand(time(NULL));
+        /* consider the case of starting multiple liblustre instances
+         * at the same time on a single node.
+         */
+        srand(time(NULL) + getpid());
 
         signal(SIGUSR1, sighandler_USR1);
 
@@ -429,6 +443,13 @@ void __liblustre_setup_(void)
                         lustre_path, target);
         }
 
+        timeout = getenv(ENV_LUSTRE_TIMEOUT);
+        if (timeout) {
+                obd_timeout = (unsigned int) atoi(timeout);
+                printf("LibLustre: set obd timeout as %u seconds\n",
+                        obd_timeout);
+        }
+
        if (_sysio_init() != 0) {
                perror("init sysio");
                exit(1);
index 9e4340d..043be49 100644 (file)
@@ -1,3 +1,26 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Lustre Light Super operations
+ *
+ *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
 #ifndef __LLU_H_
 #define __LLU_H_
 
@@ -20,7 +43,7 @@ struct ll_file_data {
 struct llu_sb_info
 {
         struct obd_uuid         ll_sb_uuid;
-       struct obd_export      *ll_mdc_exp;
+        struct obd_export      *ll_mdc_exp;
         struct obd_export      *ll_osc_exp;
         obd_id                  ll_rootino;
         int                     ll_flags;
@@ -31,6 +54,9 @@ struct llu_sb_info
         char                   *ll_instance; 
 };
 
+#define LL_SBI_NOLCK            0x1
+#define LL_SBI_READAHEAD        0x2
+
 #define LLI_F_HAVE_OST_SIZE_LOCK        0
 #define LLI_F_HAVE_MDS_SIZE_LOCK        1
 #define LLI_F_PREFER_EXTENDED_SIZE      2
@@ -42,7 +68,7 @@ struct llu_inode_info {
         struct lov_stripe_md   *lli_smd;
         char                   *lli_symlink_name;
         struct semaphore        lli_open_sem;
-       __u64                   lli_maxbytes;
+        __u64                   lli_maxbytes;
         unsigned long          lli_flags;
 
         /* for libsysio */
@@ -50,8 +76,10 @@ struct llu_inode_info {
 
         struct lookup_intent   *lli_it;
 
-       /* XXX workaround for libsysio */
+        /* XXX workaround for libsysio unlink */
         int                     lli_stale_flag;
+        /* XXX workaround for libsysio readdir */
+        loff_t                  lli_dir_pos;
 
         /* in libsysio we have no chance to store data in file,
          * so place it here. since it's possible that an file
@@ -60,24 +88,24 @@ struct llu_inode_info {
         struct ll_file_data    *lli_file_data;
         int                     lli_open_count;
 
-       /* stat FIXME not 64 bit clean */
-       dev_t                   lli_st_dev;
-       ino_t                   lli_st_ino;
-       mode_t                  lli_st_mode;
-       nlink_t                 lli_st_nlink;
-       uid_t                   lli_st_uid;
-       gid_t                   lli_st_gid;
-       dev_t                   lli_st_rdev;
-       loff_t                  lli_st_size;
-       unsigned int            lli_st_blksize;
-       unsigned int            lli_st_blocks;
-       time_t                  lli_st_atime;
-       time_t                  lli_st_mtime;
-       time_t                  lli_st_ctime;
-
-       /* not for stat, change it later */
-       int                     lli_st_flags;
-       unsigned long           lli_st_generation;
+        /* stat FIXME not 64 bit clean */
+        dev_t                   lli_st_dev;
+        ino_t                   lli_st_ino;
+        mode_t                  lli_st_mode;
+        nlink_t                 lli_st_nlink;
+        uid_t                   lli_st_uid;
+        gid_t                   lli_st_gid;
+        dev_t                   lli_st_rdev;
+        loff_t                  lli_st_size;
+        unsigned int            lli_st_blksize;
+        unsigned int            lli_st_blocks;
+        time_t                  lli_st_atime;
+        time_t                  lli_st_mtime;
+        time_t                  lli_st_ctime;
+
+        /* not for stat, change it later */
+        int                    lli_st_flags;
+        unsigned long          lli_st_generation;
 };
 
 #define LLU_SYSIO_COOKIE_SIZE(x) \
@@ -87,8 +115,9 @@ struct llu_inode_info {
 
 struct llu_sysio_cookie {
         struct obd_sync_io_container *lsc_osic;
-       struct inode           *lsc_inode;
-       int                     lsc_npages;
+        struct inode           *lsc_inode;
+        int                     lsc_maxpages;
+        int                     lsc_npages;
         struct ll_async_page   *lsc_llap;
         struct page            *lsc_pages;
         __u64                   lsc_rwcount;
@@ -99,18 +128,18 @@ struct llu_sysio_cookie {
 
 struct llu_sysio_callback_args
 {
-       int ncookies;
-       struct llu_sysio_cookie *cookies[MAX_IOVEC];
+        int ncookies;
+        struct llu_sysio_cookie *cookies[MAX_IOVEC];
 };
 
 static inline struct llu_sb_info *llu_fs2sbi(struct filesys *fs)
 {
-       return (struct llu_sb_info*)(fs->fs_private);
+        return (struct llu_sb_info*)(fs->fs_private);
 }
 
 static inline struct llu_inode_info *llu_i2info(struct inode *inode)
 {
-       return (struct llu_inode_info*)(inode->i_private);
+        return (struct llu_inode_info*)(inode->i_private);
 }
 
 static inline struct llu_sb_info *llu_i2sbi(struct inode *inode)
@@ -118,16 +147,6 @@ static inline struct llu_sb_info *llu_i2sbi(struct inode *inode)
         return llu_i2info(inode)->lli_sbi;
 }
 
-#if 0
-static inline struct client_obd *sbi2mdc(struct llu_sb_info *sbi)
-{
-       struct obd_device *obd = class_conn2obd(&sbi->ll_mdc_conn);
-       if (obd == NULL)
-               LBUG();
-       return &obd->u.cli;
-}
-#endif
-
 static inline struct obd_export *llu_i2obdexp(struct inode *inode)
 {
         return llu_i2info(inode)->lli_sbi->ll_osc_exp;
@@ -138,16 +157,21 @@ static inline struct obd_export *llu_i2mdcexp(struct inode *inode)
         return llu_i2info(inode)->lli_sbi->ll_mdc_exp;
 }
 
+static inline int llu_is_root_inode(struct inode *inode)
+{
+        return (llu_i2info(inode)->lli_fid.id ==
+                llu_i2info(inode)->lli_sbi->ll_rootino);
+}
 
 #define LL_SAVE_INTENT(inode, it)                                              \
 do {                                                                           \
-       struct lookup_intent *temp;                                            \
+        struct lookup_intent *temp;                                            \
         LASSERT(llu_i2info(inode)->lli_it == NULL);                            \
                OBD_ALLOC(temp, sizeof(*temp));                                        \
         memcpy(temp, it, sizeof(*temp));                                       \
         llu_i2info(inode)->lli_it = temp;                                      \
         CDEBUG(D_DENTRY, "alloc intent %p to inode %p(ino %lu)\n",             \
-                       temp, inode, llu_i2info(inode)->lli_st_ino);           \
+                        temp, inode, llu_i2info(inode)->lli_st_ino);           \
 } while(0)
 
 
@@ -158,7 +182,7 @@ do {                                                                           \
         LASSERT(it);                                                           \
         llu_i2info(inode)->lli_it = NULL;                                      \
         CDEBUG(D_DENTRY, "dettach intent %p from inode %p(ino %lu)\n",         \
-                       it, inode, llu_i2info(inode)->lli_st_ino);             \
+                        it, inode, llu_i2info(inode)->lli_st_ino);             \
 } while(0)
 
 /* interpet return codes from intent lookup */
@@ -167,20 +191,20 @@ do {                                                                           \
 
 static inline void ll_inode2fid(struct ll_fid *fid, struct inode *inode)
 {
-       *fid = llu_i2info(inode)->lli_fid;
+        *fid = llu_i2info(inode)->lli_fid;
 }
 
 struct it_cb_data {
-       struct inode *icbd_parent;
-       struct pnode *icbd_child;
-       obd_id hash;
+        struct inode *icbd_parent;
+        struct pnode *icbd_child;
+        obd_id hash;
 };
 
 static inline void ll_i2uctxt(struct ll_uctxt *ctxt, struct inode *i1,
                               struct inode *i2)
 {
-       struct llu_inode_info *lli1 = llu_i2info(i1);
-       struct llu_inode_info *lli2;
+        struct llu_inode_info *lli1 = llu_i2info(i1);
+        struct llu_inode_info *lli2;
 
         LASSERT(i1);
         LASSERT(ctxt);
@@ -191,7 +215,7 @@ static inline void ll_i2uctxt(struct ll_uctxt *ctxt, struct inode *i1,
                 ctxt->gid1 = -1;
 
         if (i2) {
-               lli2 = llu_i2info(i2);
+               lli2 = llu_i2info(i2);
                 if (in_group_p(lli2->lli_st_gid))
                         ctxt->gid2 = lli2->lli_st_gid;
                 else
@@ -210,21 +234,9 @@ int llu_intent_lock(struct inode *parent, struct pnode *pnode,
 /* FIXME */
 static inline int ll_permission(struct inode *inode, int flag, void * unused)
 {
-       return 0;
-}
-
-#if 0
-static inline int it_disposition(struct lookup_intent *it, int flag)
-{
-        return it->d.lustre.it_disposition & flag;
+        return 0;
 }
 
-static inline void it_set_disposition(struct lookup_intent *it, int flag)
-{
-        it->d.lustre.it_disposition |= flag;
-}
-#endif
-
 static inline __u64 ll_file_maxbytes(struct inode *inode)
 {
         return llu_i2info(inode)->lli_maxbytes;
@@ -232,17 +244,17 @@ static inline __u64 ll_file_maxbytes(struct inode *inode)
 
 struct mount_option_s
 {
-       char *mdc_uuid;
-       char *osc_uuid;
+        char *mdc_uuid;
+        char *osc_uuid;
 };
 
 /* llite_lib.c */
 void generate_random_uuid(unsigned char uuid_out[16]);
-int liblustre_process_log(struct config_llog_instance *cfg);
+int liblustre_process_log(struct config_llog_instance *cfg, int allow_recov);
 int ll_parse_mount_target(const char *target, char **mdsnid,
                           char **mdsname, char **profile);
 
-extern int g_zconf;
+extern int     g_zconf;
 extern char   *g_zconf_mdsnid;
 extern char   *g_zconf_mdsname;
 extern char   *g_zconf_profile;
@@ -253,8 +265,6 @@ void llu_update_inode(struct inode *inode, struct mds_body *body,
                       struct lov_stripe_md *lmm);
 void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid);
 void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid);
-//struct inode* llu_new_inode(struct filesys *fs, ino_t ino, mode_t mode);
-//int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm, void *ostdata);
 int ll_it_open_error(int phase, struct lookup_intent *it);
 struct inode *llu_iget(struct filesys *fs, struct lustre_md *md);
 int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm);
@@ -282,7 +292,7 @@ int llu_objects_destroy(struct ptlrpc_request *request, struct inode *dir);
 int llu_iop_iodone(struct ioctx *ioctxp __IS_UNUSED);
 struct llu_sysio_callback_args*
 llu_file_write(struct inode *inode, const struct iovec *iovec,
-                      size_t iovlen, loff_t pos);
+                      size_t iovlen, loff_t pos);
 struct llu_sysio_callback_args*
 llu_file_read(struct inode *inode, const struct iovec *iovec,
                        size_t iovlen, loff_t pos);
@@ -308,5 +318,33 @@ int llu_iop_lookup(struct pnode *pnode,
                    const char *path);
 void unhook_stale_inode(struct pnode *pno);
 struct inode *llu_inode_from_lock(struct ldlm_lock *lock);
+int llu_mdc_blocking_ast(struct ldlm_lock *lock,
+                         struct ldlm_lock_desc *desc,
+                         void *data, int flag);
+
+/* dir.c */
+ssize_t llu_iop_getdirentries(struct inode *ino, char *buf, size_t nbytes,
+                              _SYSIO_OFF_T *basep);
+
+/* ext2 related */
+#define EXT2_NAME_LEN (255)
+
+struct ext2_dirent {
+        __u32   inode;
+        __u16   rec_len;
+        __u8    name_len;
+        __u8    file_type;
+        char    name[EXT2_NAME_LEN];
+};
+
+#define EXT2_DIR_PAD                    4
+#define EXT2_DIR_ROUND                  (EXT2_DIR_PAD - 1)
+#define EXT2_DIR_REC_LEN(name_len)      (((name_len) + 8 + EXT2_DIR_ROUND) & \
+                                         ~EXT2_DIR_ROUND)
+
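+/* directory records are padded to 4-byte boundaries: a 3-character name
+ * needs 8 header bytes + 3 name bytes = 11, which EXT2_DIR_REC_LEN rounds
+ * up to 12; rec_len is the offset to the next entry */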
+static inline struct ext2_dirent *ext2_next_entry(struct ext2_dirent *p)
+{
+        return (struct ext2_dirent*)((char*) p + le16_to_cpu(p->rec_len));
+}
 
 #endif
index 1dceb8b..1c00634 100644 (file)
 #include <inode.h>
 #include <file.h>
 
+#undef LIST_HEAD
+
 #include "llite_lib.h"
 
-static void ll_intent_release(struct lookup_intent *it)
+static void ll_intent_drop_lock(struct lookup_intent *it)
 {
         struct lustre_handle *handle;
-        ENTRY;
 
-        /* LASSERT(ll_d2d(de) != NULL); */
-
-        if (it->d.lustre.it_lock_mode) {
+        if (it->it_op && it->d.lustre.it_lock_mode) {
                 handle = (struct lustre_handle *)&it->d.lustre.it_lock_handle;
                 CDEBUG(D_DLMTRACE, "releasing lock with cookie "LPX64
-                       " from it %p\n",
-                       handle->cookie, it);
+                       " from it %p\n", handle->cookie, it);
                 ldlm_lock_decref(handle, it->d.lustre.it_lock_mode);
 
-                /* intent_release may be called multiple times, from
-                   this thread and we don't want to double-decref this
-                   lock (see bug 494) */
+                /* bug 494: intent_release may be called multiple times, from
+                 * this thread and we don't want to double-decref this lock */
                 it->d.lustre.it_lock_mode = 0;
         }
-        it->it_magic = 0;
-        it->it_op_release = 0;
-        EXIT;
 }
 
-#if 0
-static void llu_mdc_lock_set_inode(struct lustre_handle *lockh,
-                                   struct inode *inode)
+static void ll_intent_release(struct lookup_intent *it)
 {
-        struct ldlm_lock *lock = ldlm_handle2lock(lockh);
         ENTRY;
 
-        LASSERT(lock != NULL);
-        lock->l_data = inode;
-        LDLM_LOCK_PUT(lock);
+        ll_intent_drop_lock(it);
+        it->it_magic = 0;
+        it->it_op_release = 0;
+        it->d.lustre.it_disposition = 0;
+        it->d.lustre.it_data = NULL;
         EXIT;
 }
 
-static int pnode_revalidate_finish(struct ptlrpc_request *request,
-                                   struct inode *parent, struct pnode *pnode,
-                                   struct lookup_intent *it, int offset,
-                                   obd_id ino)
-{
-        struct llu_sb_info    *sbi = llu_i2sbi(parent);
-        struct pnode_base     *pb = pnode->p_base;
-        struct mds_body       *body;
-        struct lov_stripe_md  *lsm = NULL;
-        struct lov_mds_md     *lmm;
-        int                    lmmsize;
-        int                    rc = 0;
-        ENTRY;
-
-        /* NB 1 request reference will be taken away by ll_intent_lock()
-         * when I return */
-
-        if (it_disposition(it, DISP_LOOKUP_NEG))
-                RETURN(-ENOENT);
-
-        /* We only get called if the mdc_enqueue() called from
-         * ll_intent_lock() was successful.  Therefore the mds_body is
-         * present and correct, and the eadata is present (but still
-         * opaque, so only obd_unpackmd() can check the size) */
-        body = lustre_msg_buf(request->rq_repmsg, offset, sizeof (*body));
-        LASSERT (body != NULL);
-        LASSERT_REPSWABBED (request, offset);
-
-        if (body->valid & OBD_MD_FLEASIZE) {
-                /* Only bother with this if inodes's LSM not set? */
-
-                if (body->eadatasize == 0) {
-                        CERROR ("OBD_MD_FLEASIZE set, but eadatasize 0\n");
-                        GOTO (out, rc = -EPROTO);
-                }
-                lmmsize = body->eadatasize;
-                lmm = lustre_msg_buf (request->rq_repmsg, offset + 1, lmmsize);
-                LASSERT (lmm != NULL);
-                LASSERT_REPSWABBED (request, offset + 1);
-
-                rc = obd_unpackmd (&sbi->ll_osc_conn,
-                                   &lsm, lmm, lmmsize);
-                if (rc < 0) {
-                        CERROR ("Error %d unpacking eadata\n", rc);
-                        LBUG();
-                        /* XXX don't know if I should do this... */
-                        GOTO (out, rc);
-                        /* or skip the ll_update_inode but still do
-                         * mdc_lock_set_inode() */
-                }
-                LASSERT (rc >= sizeof (*lsm));
-                rc = 0;
-        }
-
-        llu_update_inode(pb->pb_ino, body, lsm);
-
-        if (lsm != NULL &&
-            llu_i2info(pb->pb_ino)->lli_smd != lsm)
-                obd_free_memmd (&sbi->ll_osc_conn, &lsm);
-
-        llu_mdc_lock_set_inode((struct lustre_handle *)&it->d.lustre.it_lock_handle,
-                               pb->pb_ino);
- out:
-        RETURN(rc);
-}
-#endif
-
 /*
  * remove the stale inode from pnode
  */
@@ -156,11 +82,11 @@ void unhook_stale_inode(struct pnode *pno)
         LASSERT(llu_i2info(inode)->lli_stale_flag);
 
         pno->p_base->pb_ino = NULL;
+        I_RELE(inode);
 
         if (!llu_i2info(inode)->lli_open_count) {
                 CDEBUG(D_INODE, "unhook inode %p (ino %lu) from pno %p\n",
                                 inode, llu_i2info(inode)->lli_st_ino, pno);
-                I_RELE(inode);
                 if (!inode->i_ref)
                         _sysio_i_gone(inode);
         }
@@ -188,14 +114,14 @@ void llu_lookup_finish_locks(struct lookup_intent *it, struct pnode *pnode)
 
 }
 
-static inline void ll_invalidate_inode_pages(struct inode * inode)
+static inline void llu_invalidate_inode_pages(struct inode * inode)
 {
         /* do nothing */
 }
 
-static int llu_mdc_blocking_ast(struct ldlm_lock *lock,
-                                struct ldlm_lock_desc *desc,
-                                void *data, int flag)
+int llu_mdc_blocking_ast(struct ldlm_lock *lock,
+                         struct ldlm_lock_desc *desc,
+                         void *data, int flag)
 {
         int rc;
         struct lustre_handle lockh;
@@ -232,7 +158,7 @@ static int llu_mdc_blocking_ast(struct ldlm_lock *lock,
                         CDEBUG(D_INODE, "invalidating inode %lu\n",
                                lli->lli_st_ino);
 
-                        ll_invalidate_inode_pages(inode);
+                        llu_invalidate_inode_pages(inode);
                 }
 
 /*
@@ -250,6 +176,33 @@ static int llu_mdc_blocking_ast(struct ldlm_lock *lock,
         RETURN(0);
 }
 
+static int pnode_revalidate_finish(struct ptlrpc_request *req,
+                                   int offset,
+                                   struct lookup_intent *it,
+                                   struct pnode *pnode)
+{
+        struct inode *inode = pnode->p_base->pb_ino;
+        struct lustre_md md;
+        int rc = 0;
+        ENTRY;
+
+        LASSERT(inode);
+
+        if (!req)
+                RETURN(0);
+
+        if (it_disposition(it, DISP_LOOKUP_NEG))
+                RETURN(-ENOENT);
+
+        rc = mdc_req2lustre_md(req, offset, llu_i2sbi(inode)->ll_osc_exp, &md);
+        if (rc)
+                RETURN(rc);
+
+        llu_update_inode(inode, md.body, md.lsm);
+
+        RETURN(rc);
+}
+
 int llu_pb_revalidate(struct pnode *pnode, int flags, struct lookup_intent *it)
 {
         struct pnode_base *pb = pnode->p_base;
@@ -285,7 +238,7 @@ int llu_pb_revalidate(struct pnode *pnode, int flags, struct lookup_intent *it)
         }
 
         /* This is due to bad interaction with libsysio. remove this when we
-         * switched to libbsdio
+         * switch to libbsdio XXX
          */
         {
                 struct llu_inode_info *lli = llu_i2info(pb->pb_ino);
@@ -322,14 +275,13 @@ int llu_pb_revalidate(struct pnode *pnode, int flags, struct lookup_intent *it)
         if (req == NULL && rc >= 0)
                 GOTO(out, rc);
 
-        /* unfortunately ll_intent_lock may cause a callback and revoke our
-           dentry */
-        /*
-        spin_lock(&dcache_lock);
-        list_del_init(&de->d_hash);
-        spin_unlock(&dcache_lock);
-        d_rehash(de);
-        */
+        if (rc < 0)
+                GOTO(out, rc = 0);
+
+        rc = pnode_revalidate_finish(req, 1, it, pnode);
+
+        /* Note: ll_intent_lock may cause a callback, check this! */
+
         if (it->it_op & (IT_OPEN | IT_GETATTR))
                 LL_SAVE_INTENT(pb->pb_ino, it);
         RETURN(1);
@@ -339,7 +291,7 @@ int llu_pb_revalidate(struct pnode *pnode, int flags, struct lookup_intent *it)
         if (rc == 0) {
                 LASSERT(pb->pb_ino);
                 if (S_ISDIR(llu_i2info(pb->pb_ino)->lli_st_mode))
-                        ll_invalidate_inode_pages(pb->pb_ino);
+                        llu_invalidate_inode_pages(pb->pb_ino);
                 llu_i2info(pb->pb_ino)->lli_stale_flag = 1;
                 unhook_stale_inode(pnode);
         } else {
@@ -362,8 +314,9 @@ static int lookup_it_finish(struct ptlrpc_request *request, int offset,
         int rc;
 
         /* NB 1 request reference will be taken away by ll_intent_lock()
-         * when I return */
-        /* XXX libsysio require the inode must be generated here XXX */
+         * when I return 
+         * Note: libsysio requires that the inode be generated here
+         */
         if ((it->it_op & IT_CREAT) || !it_disposition(it, DISP_LOOKUP_NEG)) {
                 struct lustre_md md;
                 struct llu_inode_info *lli;
@@ -396,6 +349,9 @@ static int lookup_it_finish(struct ptlrpc_request *request, int offset,
 
                         LASSERT(lsm->lsm_object_id != 0);
 
+                        /* bug 2334: drop MDS lock before acquiring OST lock */
+                        ll_intent_drop_lock(it);
+
                         rc = llu_extent_lock(NULL, inode, lsm, LCK_PR, &extent,
                                             &lockh);
                         if (rc != ELDLM_OK) {
@@ -408,15 +364,10 @@ static int lookup_it_finish(struct ptlrpc_request *request, int offset,
                 ENTRY;
         }
 
+        /* the intent will be used again later by open()/getattr() */
         if (inode && (it->it_op & (IT_OPEN | IT_GETATTR)))
                 LL_SAVE_INTENT(inode, it);
-/*
-        dentry->d_op = &ll_d_ops;
-        ll_set_dd(dentry);
 
-        if (dentry == saved)
-                d_add(dentry, inode);
-*/
         child->p_base->pb_ino = inode;
 
         RETURN(0);
@@ -437,9 +388,6 @@ struct inode *llu_inode_from_lock(struct ldlm_lock *lock)
         return inode;
 }
 
-/* XXX */
-#define EXT2_NAME_LEN (255)
-
 static int llu_lookup_it(struct inode *parent, struct pnode *pnode,
                          struct lookup_intent *it, int flags)
 {
@@ -454,18 +402,6 @@ static int llu_lookup_it(struct inode *parent, struct pnode *pnode,
         if (pnode->p_base->pb_name.len > EXT2_NAME_LEN)
                 RETURN(-ENAMETOOLONG);
 
-
-/*
-        CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),intent=%s\n",
-               dentry->d_name.name, parent->i_ino, parent->i_generation,
-               parent, LL_IT2STR(it));
-
-        if (d_mountpoint(dentry))
-                CERROR("Tell Peter, lookup on mtpt, it %s\n", LL_IT2STR(it));
-
-        ll_frob_intent(&it, &lookup_it);
-*/
-
         if (!it) {
                 it = &lookup_it;
                 it->it_op_release = ll_intent_release;
@@ -493,12 +429,6 @@ static int llu_lookup_it(struct inode *parent, struct pnode *pnode,
 
         llu_lookup_finish_locks(it, pnode);
 
-/*
-        if (dentry == save)
-                GOTO(out, retval = NULL);
-        else
-                GOTO(out, retval = dentry);
-*/
  out:
         if (req)
                 ptlrpc_req_finished(req);
@@ -553,12 +483,7 @@ translate_lookup_intent(struct intent *intent, const char *path)
                 it->it_flags |= fmode;
         }
 
-        /*
-        else if (intent->int_opmask & INT_CREAT)
-                it->it_op |= IT_LOOKUP;
-         */
-
-        /* FIXME libsysio has strange code on intent handling,
+        /* XXX libsysio has strange code on intent handling,
          * more check later */
         if (it->it_flags & O_CREAT) {
                 it->it_op |= IT_CREAT;
@@ -567,9 +492,8 @@ translate_lookup_intent(struct intent *intent, const char *path)
 
         if (intent->int_opmask & INT_GETATTR)
                 it->it_op |= IT_GETATTR;
-        /* XXX */
-        if (intent->int_opmask & INT_SETATTR)
-                LBUG();
+
+        LASSERT(!(intent->int_opmask & INT_SETATTR));
 
         /* libsysio is different to linux vfs when doing unlink/rmdir,
          * INT_UPDPARENT was passed down during name resolution. Here
index c05a7c9..c07409e 100644 (file)
 #include <inode.h>
 #include <file.h>
 
-#include "llite_lib.h"
-
-#if 0
-void llu_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
-                               struct ldlm_lock *lock)
-{
-        clear_bit(LLI_F_HAVE_SIZE_LOCK, &(llu_i2info(inode)->lli_flags));
-#if 0
-        struct ldlm_extent *extent = &lock->l_extent;
-        unsigned long start, end, count, skip, i, j;
-        struct page *page;
-        int ret;
-        ENTRY;
-
-        CDEBUG(D_INODE, "obdo %lu inode %p ["LPU64"->"LPU64"] size: %llu\n",
-               inode->i_ino, inode, extent->start, extent->end, inode->i_size);
-
-        start = extent->start >> PAGE_CACHE_SHIFT;
-        count = ~0;
-        skip = 0;
-        end = (extent->end >> PAGE_CACHE_SHIFT) + 1;
-        if ((end << PAGE_CACHE_SHIFT) < extent->end)
-                end = ~0;
-        if (lsm->lsm_stripe_count > 1) {
-                struct {
-                        char name[16];
-                        struct ldlm_lock *lock;
-                        struct lov_stripe_md *lsm;
-                } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
-                __u32 stripe;
-                __u32 vallen = sizeof(stripe);
-                int rc;
-
-                /* get our offset in the lov */
-                rc = obd_get_info(ll_i2obdconn(inode), sizeof(key),
-                                  &key, &vallen, &stripe);
-                if (rc != 0) {
-                        CERROR("obd_get_info: rc = %d\n", rc);
-                        LBUG();
-                }
-                LASSERT(stripe < lsm->lsm_stripe_count);
-
-                count = lsm->lsm_stripe_size >> PAGE_CACHE_SHIFT;
-                skip = (lsm->lsm_stripe_count - 1) * count;
-                start += (start/count * skip) + (stripe * count);
-                if (end != ~0)
-                        end += (end/count * skip) + (stripe * count);
-        }
-
-        i = (inode->i_size + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
-        if (end >= i)
-                clear_bit(LLI_F_HAVE_SIZE_LOCK, &(ll_i2info(inode)->lli_flags));
-        if (i < end)
-                end = i;
-
-        CDEBUG(D_INODE, "start: %lu j: %lu count: %lu skip: %lu end: %lu\n",
-               start, start % count, count, skip, end);
-
-        /* start writeback on dirty pages in the extent when its PW */
-        for (i = start, j = start % count;
-                        lock->l_granted_mode == LCK_PW && i < end; j++, i++) {
-                if (j == count) {
-                        i += skip;
-                        j = 0;
-                }
-                /* its unlikely, but give us a chance to bail when we're out */
-                PGCACHE_WRLOCK(inode->i_mapping);
-                if (list_empty(&inode->i_mapping->dirty_pages)) {
-                        CDEBUG(D_INODE, "dirty list empty\n");
-                        PGCACHE_WRUNLOCK(inode->i_mapping);
-                        break;
-                }
-                PGCACHE_WRUNLOCK(inode->i_mapping);
-
-                if (need_resched())
-                        schedule();
-
-        /* always do a getattr for the first person to pop out of lock
-         * acquisition.. the DID_GETATTR flag and semaphore serialize
-         * this initial race.  we used to make a decision based on whether
-         * the lock was matched or acquired, but the matcher could win the
-         * waking race with the first issuer so that was no good..
-         */
-        if (test_bit(LLI_F_DID_GETATTR, &lli->lli_flags))
-                RETURN(ELDLM_OK);
-
-        down(&lli->lli_getattr_sem);
-
-        if (!test_bit(LLI_F_DID_GETATTR, &lli->lli_flags)) {
-                rc = ll_inode_getattr(inode, lsm);
-                if (rc == 0) {
-                        set_bit(LLI_F_DID_GETATTR, &lli->lli_flags);
-                } else {
-                        unlock_page(page);
-                }
-                page_cache_release(page);
-
-        }
-
-        /* our locks are page granular thanks to osc_enqueue, we invalidate the
-         * whole page. */
-        LASSERT((extent->start & ~PAGE_CACHE_MASK) == 0);
-        LASSERT(((extent->end+1) & ~PAGE_CACHE_MASK) == 0);
-        for (i = start, j = start % count ; i < end ; j++, i++) {
-                if ( j == count ) {
-                        i += skip;
-                        j = 0;
-                }
-                PGCACHE_WRLOCK(inode->i_mapping);
-                if (list_empty(&inode->i_mapping->dirty_pages) &&
-                     list_empty(&inode->i_mapping->clean_pages) &&
-                     list_empty(&inode->i_mapping->locked_pages)) {
-                        CDEBUG(D_INODE, "nothing left\n");
-                        PGCACHE_WRUNLOCK(inode->i_mapping);
-                        break;
-                }
-                PGCACHE_WRUNLOCK(inode->i_mapping);
-                if (need_resched())
-                        schedule();
-                page = find_get_page(inode->i_mapping, i);
-                if (page == NULL)
-                        continue;
-                CDEBUG(D_INODE, "dropping page %p at %lu\n", page, page->index);
-                lock_page(page);
-                if (page->mapping) /* might have raced */
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-                        truncate_complete_page(page);
-#else
-                        truncate_complete_page(page->mapping, page);
-#endif                
-                unlock_page(page);
-                page_cache_release(page);
-        }
-        EXIT;
-#endif
-}
-
-int llu_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
-                      void *data, int flag)
-{
-        struct inode *inode = data;
-        struct llu_inode_info *lli = llu_i2info(inode);
-        struct lustre_handle lockh = {0};
-        int rc;
-        ENTRY;
-
-        if (inode == NULL)
-                LBUG();
-
-        switch (flag) {
-        case LDLM_CB_BLOCKING:
-                ldlm_lock2handle(lock, &lockh);
-                rc = ldlm_cli_cancel(&lockh);
-                if (rc != ELDLM_OK)
-                        CERROR("ldlm_cli_cancel failed: %d\n", rc);
-                break;
-        case LDLM_CB_CANCELING: {
-                /* FIXME: we could be given 'canceling intents' so that we
-                 * could know to write-back or simply throw away the pages
-                 * based on if the cancel comes from a desire to, say,
-                 * read or truncate.. */
-                llu_pgcache_remove_extent(inode, lli->lli_smd, lock);
-                break;
-        }
-        default:
-                LBUG();
-        }
+#undef LIST_HEAD
 
-        RETURN(0);
-}
-#endif
+#include "llite_lib.h"
 
 static int llu_extent_lock_callback(struct ldlm_lock *lock,
                                     struct ldlm_lock_desc *new, void *data,
@@ -274,12 +106,10 @@ int llu_extent_lock_no_validate(struct ll_file_data *fd,
 
         LASSERT(lockh->cookie == 0);
 
-#if 0
         /* XXX phil: can we do this?  won't it screw the file size up? */
         if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
             (sbi->ll_flags & LL_SBI_NOLCK))
                 RETURN(0);
-#endif
 
         CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
                lli->lli_st_ino, extent->start, extent->end);
@@ -432,17 +262,17 @@ static struct obd_async_page_ops llu_async_page_ops = {
 };
 
 static
-struct llu_sysio_cookie* get_sysio_cookie(struct inode *inode, int npages)
+struct llu_sysio_cookie* get_sysio_cookie(struct inode *inode, int maxpages)
 {
         struct llu_sysio_cookie *cookie;
 
-        OBD_ALLOC(cookie, LLU_SYSIO_COOKIE_SIZE(npages));
+        OBD_ALLOC(cookie, LLU_SYSIO_COOKIE_SIZE(maxpages));
         if (cookie) {
                 I_REF(inode);
                 cookie->lsc_inode = inode;
-                cookie->lsc_npages = npages;
+                cookie->lsc_maxpages = maxpages;
                 cookie->lsc_llap = (struct ll_async_page *)(cookie + 1);
-                cookie->lsc_pages = (struct page *) (cookie->lsc_llap + npages);
+                cookie->lsc_pages = (struct page *) (cookie->lsc_llap + maxpages);
 
                 osic_init(&cookie->lsc_osic);
         }
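
get_sysio_cookie() above carves a single allocation into three regions: the cookie header, then maxpages ll_async_page slots, then maxpages page descriptors, which is why lsc_llap points just past the cookie and lsc_pages just past the llap array. A minimal standalone sketch of that layout arithmetic follows; the struct contents and the FAKE_COOKIE_SIZE macro are simplified stand-ins (LLU_SYSIO_COOKIE_SIZE itself is not shown in this diff), so the sizes are illustrative only.

#include <stdio.h>
#include <stdlib.h>

struct fake_llap { void *llap_cookie; };                 /* stand-in for struct ll_async_page */
struct fake_page { void *addr; unsigned long index; };   /* stand-in for struct page */

struct fake_cookie {
        int               lsc_maxpages;
        struct fake_llap *lsc_llap;
        struct fake_page *lsc_pages;
};

/* assumed equivalent of LLU_SYSIO_COOKIE_SIZE(n): header + n llaps + n pages */
#define FAKE_COOKIE_SIZE(n) (sizeof(struct fake_cookie) + \
                             (n) * (sizeof(struct fake_llap) + sizeof(struct fake_page)))

static struct fake_cookie *get_cookie(int maxpages)
{
        struct fake_cookie *c = calloc(1, FAKE_COOKIE_SIZE(maxpages));

        if (c) {
                c->lsc_maxpages = maxpages;
                c->lsc_llap  = (struct fake_llap *)(c + 1);
                c->lsc_pages = (struct fake_page *)(c->lsc_llap + maxpages);
        }
        return c;
}

int main(void)
{
        struct fake_cookie *c = get_cookie(8);

        if (!c)
                return 1;
        printf("alloc %zu bytes, llap array at +%td, page array at +%td\n",
               FAKE_COOKIE_SIZE(8),
               (char *)c->lsc_llap - (char *)c,
               (char *)c->lsc_pages - (char *)c);
        free(c);
        return 0;
}
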
@@ -456,25 +286,114 @@ void put_sysio_cookie(struct llu_sysio_cookie *cookie)
         struct lov_stripe_md *lsm = llu_i2info(cookie->lsc_inode)->lli_smd;
         struct obd_export *exp = llu_i2obdexp(cookie->lsc_inode);
         struct ll_async_page *llap = cookie->lsc_llap;
+#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE
+        struct page *pages = cookie->lsc_pages;
+#endif
         int i;
 
-        for (i = 0; i< cookie->lsc_npages; i++) {
+        for (i = 0; i< cookie->lsc_maxpages; i++) {
                 if (llap[i].llap_cookie)
                         obd_teardown_async_page(exp, lsm, NULL,
                                                 llap[i].llap_cookie);
+#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE
+                if (pages[i]._managed) {
+                        free(pages[i].addr);
+                        pages[i]._managed = 0;
+                }
+#endif
         }
 
         I_RELE(cookie->lsc_inode);
 
         osic_release(cookie->lsc_osic);
-        OBD_FREE(cookie, LLU_SYSIO_COOKIE_SIZE(cookie->lsc_npages));
+        OBD_FREE(cookie, LLU_SYSIO_COOKIE_SIZE(cookie->lsc_maxpages));
+}
+
+#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE
+/* Note: this code should eventually be removed; it does not
+ * need any more cleanup
+ */
+static
+int prepare_unaligned_write(struct llu_sysio_cookie *cookie)
+{
+        struct inode *inode = cookie->lsc_inode;
+        struct llu_inode_info *lli = llu_i2info(inode);
+        struct lov_stripe_md *lsm = lli->lli_smd;
+        struct obdo oa;
+        struct page *pages = cookie->lsc_pages;
+        int i, pgidx[2] = {0, cookie->lsc_npages-1};
+        int rc;
+        ENTRY;
+
+        for (i = 0; i < 2; i++) {
+                struct page *oldpage = &pages[pgidx[i]];
+                struct page newpage;
+                struct brw_page pg;
+                char *newbuf;
+
+                if (i == 0 && pgidx[0] == pgidx[1])
+                        continue;
+
+                LASSERT(oldpage->_offset + oldpage->_count <= PAGE_CACHE_SIZE);
+
+                if (oldpage->_count == PAGE_CACHE_SIZE)
+                        continue;
+
+                if (oldpage->index << PAGE_CACHE_SHIFT >=
+                    lli->lli_st_size)
+                        continue;
+
+                newbuf = malloc(PAGE_CACHE_SIZE);
+                if (!newbuf)
+                        return -ENOMEM;
+
+                newpage.index = oldpage->index;
+                newpage.addr = newbuf;
+
+                pg.pg = &newpage;
+                pg.off = ((obd_off)newpage.index << PAGE_CACHE_SHIFT);
+                if (pg.off + PAGE_CACHE_SIZE > lli->lli_st_size)
+                        pg.count = lli->lli_st_size % PAGE_CACHE_SIZE;
+                else
+                        pg.count = PAGE_CACHE_SIZE;
+                pg.flag = 0;
+
+                oa.o_id = lsm->lsm_object_id;
+                oa.o_mode = lli->lli_st_mode;
+                oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE | OBD_MD_FLTYPE;
+
+                /* issue read */
+                rc = obd_brw(OBD_BRW_READ, llu_i2obdexp(inode), &oa, lsm, 1, &pg, NULL);
+                if (rc) {
+                        free(newbuf);
+                        RETURN(rc);
+                }
+
+                /* copy page content, and reset page params */
+                memcpy(newbuf + oldpage->_offset,
+                       (char*)oldpage->addr + oldpage->_offset,
+                       oldpage->_count);
+
+                oldpage->addr = newbuf;
+                if ((((obd_off)oldpage->index << PAGE_CACHE_SHIFT) +
+                    oldpage->_offset + oldpage->_count) > lli->lli_st_size)
+                        oldpage->_count += oldpage->_offset;
+                else
+                        oldpage->_count = PAGE_CACHE_SIZE;
+                oldpage->_offset = 0;
+                oldpage->_managed = 1;
+        }
+
+        RETURN(0);
 }
+#endif
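
prepare_unaligned_write() above does a read-modify-write for a partial first or last page: it reads the existing page from the OST into a freshly malloc()ed full-page buffer, overlays the caller's partial data at its in-page offset, and marks the page _managed so put_sysio_cookie() later frees the temporary buffer. The standalone sketch below shows only the merge arithmetic on plain buffers; the OST read is faked with memset() and the field names mirror the diff while the types are simplified, so it is an illustration rather than the real code path.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PG_SIZE 4096

/* simplified stand-in for the liblustre page descriptor used above */
struct upage {
        char   *addr;      /* caller's buffer, adjusted back to the page start */
        size_t  _offset;   /* first valid byte inside the page */
        size_t  _count;    /* number of valid bytes */
        int     _managed;  /* set when addr points at our own malloc'ed copy */
};

/* fake "read the existing page from the OST": here just fill with 'O' */
static void fake_ost_read(char *page_buf, size_t valid)
{
        memset(page_buf, 'O', valid);
}

static int merge_partial_page(struct upage *pg, size_t bytes_valid_on_ost)
{
        char *newbuf;

        if (pg->_count == PG_SIZE)      /* already a full page, nothing to do */
                return 0;

        newbuf = malloc(PG_SIZE);
        if (!newbuf)
                return -1;

        fake_ost_read(newbuf, bytes_valid_on_ost);
        /* overlay the caller's partial data at its in-page offset */
        memcpy(newbuf + pg->_offset, pg->addr + pg->_offset, pg->_count);

        pg->addr     = newbuf;
        pg->_count   = PG_SIZE;         /* the real code trims this at EOF */
        pg->_offset  = 0;
        pg->_managed = 1;
        return 0;
}

int main(void)
{
        char user[PG_SIZE];
        struct upage pg = { .addr = user, ._offset = 100, ._count = 50 };

        memset(user, 'U', sizeof(user));
        if (merge_partial_page(&pg, PG_SIZE))
                return 1;
        printf("byte 99=%c 100=%c 149=%c 150=%c\n",
               pg.addr[99], pg.addr[100], pg.addr[149], pg.addr[150]);
        if (pg._managed)
                free(pg.addr);
        return 0;
}
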
 
 static
 int llu_prep_async_io(struct llu_sysio_cookie *cookie, int cmd,
                       char *buf, loff_t pos, size_t count)
 {
-        struct lov_stripe_md *lsm = llu_i2info(cookie->lsc_inode)->lli_smd;
+        struct llu_inode_info *lli = llu_i2info(cookie->lsc_inode);
+        struct lov_stripe_md *lsm = lli->lli_smd;
         struct obd_export *exp = llu_i2obdexp(cookie->lsc_inode);
         struct page *pages = cookie->lsc_pages;
         struct ll_async_page *llap = cookie->lsc_llap;
@@ -484,8 +403,6 @@ int llu_prep_async_io(struct llu_sysio_cookie *cookie, int cmd,
         if (!exp)
                 RETURN(-EINVAL);
 
-        cookie->lsc_rwcount = count;
-
         /* prepare the pages array */
        do {
                 unsigned long index, offset, bytes;
@@ -496,6 +413,14 @@ int llu_prep_async_io(struct llu_sysio_cookie *cookie, int cmd,
                 if (bytes > count)
                         bytes = count;
 
+                /* prevent read beyond file range */
+                if ((cmd == OBD_BRW_READ) &&
+                    (pos + bytes) >= lli->lli_st_size) {
+                        if (pos >= lli->lli_st_size)
+                                break;
+                        bytes = lli->lli_st_size - pos;
+                }
+
                 /* prepare page for this index */
                 pages[npages].index = index;
                 pages[npages].addr = buf - offset;
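
The new check above keeps llu_prep_async_io() from building read pages past the current file size: a segment starting at or beyond lli_st_size ends the loop, and a segment that straddles EOF is trimmed to lli_st_size - pos. A small standalone sketch of the per-page split with that clamp (PAGE_CACHE_* replaced by plain constants, the file size passed in as a parameter) could look like this:

#include <stdio.h>

#define PG_SHIFT 12
#define PG_SIZE  (1UL << PG_SHIFT)

/* walk [pos, pos+count) page by page, clamping reads at end of file */
static void split_into_pages(unsigned long long pos, unsigned long count,
                             unsigned long long file_size, int is_read)
{
        while (count) {
                unsigned long long index  = pos >> PG_SHIFT;
                unsigned long      offset = pos & (PG_SIZE - 1);
                unsigned long      bytes  = PG_SIZE - offset;

                if (bytes > count)
                        bytes = count;

                if (is_read && pos + bytes >= file_size) {
                        if (pos >= file_size)
                                break;                   /* nothing left to read */
                        bytes = file_size - pos;         /* trim the last page */
                }

                printf("page %llu: offset %lu, %lu bytes\n", index, offset, bytes);

                count -= bytes;
                pos   += bytes;
        }
}

int main(void)
{
        /* read 3 pages starting 100 bytes into a 5000-byte file */
        split_into_pages(100, 3 * PG_SIZE, 5000, 1);
        return 0;
}
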
@@ -507,8 +432,20 @@ int llu_prep_async_io(struct llu_sysio_cookie *cookie, int cmd,
                 count -= bytes;
                 pos += bytes;
                 buf += bytes;
+
+                cookie->lsc_rwcount += bytes;
         } while (count);
 
+        cookie->lsc_npages = npages;
+
+#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE
+        if (cmd == OBD_BRW_WRITE) {
+                rc = prepare_unaligned_write(cookie);
+                if (rc)
+                        RETURN(rc);
+        }
+#endif
+
         for (i = 0; i < npages; i++) {
                 llap[i].llap_magic = LLAP_MAGIC;
                 rc = obd_prep_async_page(exp, lsm, NULL, &pages[i],
@@ -612,28 +549,30 @@ llu_file_write(struct inode *inode, const struct iovec *iovec,
 
         /* FIXME optimize the following extent locking */
         for (iovidx = 0; iovidx < iovlen; iovidx++) {
-                char *buf = iovec[iovidx].iov_base;
+                char *buf = (char*)iovec[iovidx].iov_base;
                 size_t count = iovec[iovidx].iov_len;
 
                 if (count == 0)
                         continue;
 
-                /* FIXME libsysio haven't consider the open flags
-                 * such as O_APPEND */
-#if 0
-                if (!S_ISBLK(lli->lli_st_mode) && file->f_flags & O_APPEND) {
-                        extent.start = 0;
-                        extent.end = OBD_OBJECT_EOF;
-                } else  {
-                        extent.start = *ppos;
-                        extent.end = *ppos + count - 1;
-                }
-#else
+                /* FIXME libsysio doesn't handle O_APPEND yet */
                 extent.start = pos;
                 extent.end = pos + count - 1;
-#endif
 
-                err = llu_extent_lock(fd, inode, lsm, LCK_PW, &extent, &lockh);
+#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE
+                if ((pos & ~PAGE_CACHE_MASK) == 0 &&
+                    (count & ~PAGE_CACHE_MASK) == 0)
+                        err = llu_extent_lock_no_validate(fd, inode, lsm,
+                                                LCK_PW, &extent, &lockh, 0);
+                else
+                        err = llu_extent_lock(fd, inode, lsm, LCK_PW,
+                                                &extent, &lockh);
+#else
+                /* the server will handle partial-page writes, so we don't
+                 * care about the file size here */
+                err = llu_extent_lock_no_validate(fd, inode, lsm, LCK_PW,
+                                                &extent, &lockh, 0);
+#endif
                 if (err != ELDLM_OK)
                         GOTO(err_out, err = -ENOLCK);
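
Under LIBLUSTRE_HANDLE_UNALIGNED_PAGE, the write path above only takes the size-validating llu_extent_lock() when the write is not page aligned; when both the start offset and the length are page multiples it can use llu_extent_lock_no_validate(), since whole-page writes do not need the current file size. The alignment test is a simple mask check, sketched here with a plain constant standing in for PAGE_CACHE_MASK:

#include <stdio.h>

#define PG_SIZE 4096UL
#define PG_MASK (~(PG_SIZE - 1))

/* both the start offset and the length must be page multiples */
static int write_is_page_aligned(unsigned long long pos, unsigned long count)
{
        return (pos & ~PG_MASK) == 0 && (count & ~PG_MASK) == 0;
}

int main(void)
{
        printf("%d %d %d\n",
               write_is_page_aligned(0, 4096),        /* 1: aligned         */
               write_is_page_aligned(4096, 8192),     /* 1: aligned         */
               write_is_page_aligned(100, 4096));     /* 0: unaligned start */
        return 0;
}
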
 
@@ -737,6 +676,11 @@ llu_file_read(struct inode *inode, const struct iovec *iovec,
                 CDEBUG(D_INFO, "Reading inode %lu, "LPSZ" bytes, offset %Ld\n",
                        lli->lli_st_ino, count, pos);
 
+                if (pos >= lli->lli_st_size) {
+                        llu_extent_unlock(fd, inode, lsm, LCK_PR, &lockh);
+                        break;
+                }
+
                 cookie = llu_rw(OBD_BRW_READ, inode, buf, count, pos);
                 if (!IS_ERR(cookie)) {
                         /* save cookie */
@@ -776,8 +720,10 @@ int llu_iop_iodone(struct ioctx *ioctxp)
         ENTRY;
 
         /* write/read(fd, buf, 0) */
-        if (!lsca)
-                return 1;
+        if (!lsca) {
+                ioctxp->ioctx_cc = 0;
+                RETURN(1);
+        }
 
         LASSERT(!IS_ERR(lsca));
 
@@ -793,8 +739,11 @@ int llu_iop_iodone(struct ioctx *ioctxp)
                 }
         }
 
-        if (rc)
-                ioctxp->ioctx_cc = rc;
+        if (rc) {
+                LASSERT(rc < 0);
+                ioctxp->ioctx_cc = -1;
+                ioctxp->ioctx_errno = -rc;
+        }
 
         OBD_FREE(lsca, sizeof(*lsca));
         ioctxp->ioctx_private = NULL;
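
llu_iop_iodone() now appears to follow the usual read()/write() convention for libsysio completions: ioctx_cc carries the completed byte count or -1 on failure, and ioctx_errno carries a positive errno, so an internal negative return code rc becomes cc = -1, errno = -rc. A minimal sketch of that translation, with a hypothetical result struct standing in for struct ioctx:

#include <stdio.h>
#include <errno.h>

/* hypothetical stand-in for the libsysio ioctx fields used above */
struct io_result {
        long cc;      /* bytes transferred, or -1 on error */
        int  err;     /* positive errno when cc == -1      */
};

static void report(struct io_result *res, long bytes_done, int rc)
{
        if (rc < 0) {                 /* internal code uses negative errnos */
                res->cc  = -1;
                res->err = -rc;
        } else {
                res->cc  = bytes_done;
                res->err = 0;
        }
}

int main(void)
{
        struct io_result ok, bad;

        report(&ok, 8192, 0);
        report(&bad, 0, -EIO);
        printf("ok: cc=%ld err=%d  bad: cc=%ld err=%d\n",
               ok.cc, ok.err, bad.cc, bad.err);
        return 0;
}
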
index 2bd8248..1e6a230 100644 (file)
@@ -43,6 +43,8 @@
 #include <inode.h>
 #include <file.h>
 
+#undef LIST_HEAD
+
 #include "llite_lib.h"
 
 static void llu_fsop_gone(struct filesys *fs)
@@ -293,14 +295,6 @@ int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm)
 
         obdo_refresh_inode(inode, &oa, refresh_valid);
 
-/*
-        if (inode->i_blksize < PAGE_CACHE_SIZE)
-                inode->i_blksize = PAGE_CACHE_SIZE;
-
-        CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %lu, blksize %lu\n",
-               lsm->lsm_object_id, inode->i_size, inode->i_blocks,
-               inode->i_blksize);
-*/
         RETURN(0);
 }
 
@@ -343,49 +337,6 @@ static struct inode* llu_new_inode(struct filesys *fs,
         return inode;
 }
 
-#if 0
-static int ll_intent_to_lock_mode(struct lookup_intent *it)
-{
-        /* CREAT needs to be tested before open (both could be set) */
-        if (it->it_op & IT_CREAT)
-                return LCK_PW;
-        else if (it->it_op & (IT_READDIR | IT_GETATTR | IT_OPEN | IT_LOOKUP))
-                return LCK_PR;
-
-        LBUG();
-        RETURN(-EINVAL);
-}
-#endif
-
-#if 0
-int ll_it_open_error(int phase, struct lookup_intent *it)
-{
-        if (it_disposition(it, DISP_OPEN_OPEN)) {
-                if (phase == DISP_OPEN_OPEN)
-                        return it->d.lustre.it_status;
-                else
-                        return 0;
-        }
-
-        if (it_disposition(it, DISP_OPEN_CREATE)) {
-                if (phase == DISP_OPEN_CREATE)
-                        return it->d.lustre.it_status;
-                else
-                        return 0;
-        }
-
-        if (it_disposition(it, DISP_LOOKUP_EXECD)) {
-                if (phase == DISP_LOOKUP_EXECD)
-                        return it->d.lustre.it_status;
-                else
-                        return 0;
-        }
-        CERROR("it disp: %X, status: %d\n", it->d.lustre.it_disposition, it->d.lustre.it_status);
-        LBUG();
-        return 0;
-}
-#endif
-
 static int llu_have_md_lock(struct inode *inode)
 {
         struct llu_sb_info *sbi = llu_i2sbi(inode);
@@ -653,8 +604,6 @@ out:
  * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE
  * at the same time.
  */
-#define OST_ATTR (ATTR_MTIME | ATTR_MTIME_SET | ATTR_CTIME | \
-                  ATTR_ATIME | ATTR_ATIME_SET | ATTR_SIZE)
 int llu_setattr_raw(struct inode *inode, struct iattr *attr)
 {
         struct lov_stripe_md *lsm = llu_i2info(inode)->lli_smd;
@@ -702,7 +651,7 @@ int llu_setattr_raw(struct inode *inode, struct iattr *attr)
         /* If only OST attributes being set on objects, don't do MDS RPC.
          * In that case, we need to check permissions and update the local
          * inode ourselves so we can call obdo_from_inode() always. */
-        if (ia_valid & (lsm ? ~(OST_ATTR | ATTR_FROM_OPEN | ATTR_RAW) : ~0)) {
+        if (ia_valid & (lsm ? ~(ATTR_SIZE | ATTR_FROM_OPEN | ATTR_RAW) : ~0)) {
                 struct lustre_md md;
                 llu_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0);
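
The changed test above narrows which setattr requests may skip the MDS RPC: with a striping descriptor (lsm) present, only ATTR_SIZE (together with the ATTR_FROM_OPEN and ATTR_RAW flags) is handled purely on the OST side, whereas the old OST_ATTR mask also let timestamp updates through. The mask expression asks whether any bit outside the allowed set is present; a tiny sketch with made-up flag values:

#include <stdio.h>

/* hypothetical flag values, for illustration only */
#define ATTR_MODE       0x001
#define ATTR_SIZE       0x008
#define ATTR_MTIME      0x020
#define ATTR_FROM_OPEN  0x100
#define ATTR_RAW        0x200

/* returns nonzero when an MDS RPC is needed (some attribute outside the
 * OST-only set is being changed, or there is no striping at all) */
static int needs_mds_rpc(unsigned int ia_valid, int have_lsm)
{
        unsigned int ost_only = ATTR_SIZE | ATTR_FROM_OPEN | ATTR_RAW;

        return (ia_valid & (have_lsm ? ~ost_only : ~0u)) != 0;
}

int main(void)
{
        printf("%d %d %d\n",
               needs_mds_rpc(ATTR_SIZE, 1),               /* 0: OST only     */
               needs_mds_rpc(ATTR_SIZE | ATTR_MTIME, 1),  /* 1: mtime -> MDS */
               needs_mds_rpc(ATTR_SIZE, 0));              /* 1: no lsm -> MDS */
        return 0;
}
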
 
@@ -810,7 +759,7 @@ int llu_setattr_raw(struct inode *inode, struct iattr *attr)
         RETURN(rc);
 }
 
-/* FIXME here we simply act as a thin layer to glue it with
+/* here we simply act as a thin layer to glue it with
  * llu_setattr_raw(), which is copy from kernel
  */
 static int llu_iop_setattr(struct pnode *pno,
@@ -844,7 +793,7 @@ static int llu_iop_setattr(struct pnode *pno,
                 iattr.ia_valid |= ATTR_GID;
         }
         if (mask & SETATTR_LEN) {
-                iattr.ia_size = stbuf->st_size; /* FIXME signed expansion problem */
+                iattr.ia_size = stbuf->st_size; /* XXX signed expansion problem */
                 iattr.ia_valid |= ATTR_SIZE;
         }
 
@@ -950,10 +899,6 @@ static int llu_iop_readlink(struct pnode *pno, char *data, size_t bufsize)
         int rc;
         ENTRY;
 
-        /* on symlinks lli_open_sem protects lli_symlink_name allocation/data */
-/*
-        down(&lli->lli_open_sem);
-*/
         rc = llu_readlink_internal(inode, &request, &symname);
         if (rc)
                 GOTO(out, rc);
@@ -963,9 +908,6 @@ static int llu_iop_readlink(struct pnode *pno, char *data, size_t bufsize)
 
         ptlrpc_req_finished(request);
  out:
-/*
-        up(&lli->lli_open_sem);
-*/
         RETURN(rc);
 }
 
@@ -1013,80 +955,6 @@ static int llu_iop_mknod_raw(struct pnode *pno,
         RETURN(err);
 }
 
-#if 0
-static int llu_mdc_unlink(struct inode *dir, struct inode *child, __u32 mode,
-                         const char *name, int len)
-{
-        struct ptlrpc_request *request = NULL;
-        struct mds_body *body;
-        struct lov_mds_md *eadata;
-        struct lov_stripe_md *lsm = NULL;
-        struct obd_trans_info oti = { 0 };
-        struct mdc_op_data op_data;
-        struct obdo *oa;
-        int rc;
-        ENTRY;
-
-        llu_prepare_mdc_op_data(&op_data, dir, child, name, len, mode);
-        rc = mdc_unlink(&llu_i2sbi(dir)->ll_mdc_conn, &op_data, &request);
-        if (rc)
-                GOTO(out, rc);
-        /* req is swabbed so this is safe */
-        body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*body));
-
-        if (!(body->valid & OBD_MD_FLEASIZE))
-                GOTO(out, rc = 0);
-
-        if (body->eadatasize == 0) {
-                CERROR("OBD_MD_FLEASIZE set but eadatasize zero\n");
-                GOTO(out, rc = -EPROTO);
-        }
-
-        /* The MDS sent back the EA because we unlinked the last reference
-         * to this file. Use this EA to unlink the objects on the OST.
-         * It's opaque so we don't swab here; we leave it to obd_unpackmd() to
-         * check it is complete and sensible. */
-        eadata = lustre_swab_repbuf(request, 1, body->eadatasize, NULL);
-        LASSERT(eadata != NULL);
-        if (eadata == NULL) {
-                CERROR("Can't unpack MDS EA data\n");
-                GOTO(out, rc = -EPROTO);
-        }
-
-        rc = obd_unpackmd(llu_i2obdconn(dir), &lsm, eadata, body->eadatasize);
-        if (rc < 0) {
-                CERROR("obd_unpackmd: %d\n", rc);
-                GOTO(out, rc);
-        }
-        LASSERT(rc >= sizeof(*lsm));
-
-        oa = obdo_alloc();
-        if (oa == NULL)
-                GOTO(out_free_memmd, rc = -ENOMEM);
-
-        oa->o_id = lsm->lsm_object_id;
-        oa->o_mode = body->mode & S_IFMT;
-        oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
-
-        if (body->valid & OBD_MD_FLCOOKIE) {
-                oa->o_valid |= OBD_MD_FLCOOKIE;
-                oti.oti_logcookies = lustre_msg_buf(request->rq_repmsg, 3,
-                                                    body->eadatasize);
-        }
-
-        rc = obd_destroy(llu_i2obdconn(dir), oa, lsm, &oti);
-        obdo_free(oa);
-        if (rc)
-                CERROR("obd destroy objid 0x"LPX64" error %d\n",
-                       lsm->lsm_object_id, rc);
- out_free_memmd:
-        obd_free_memmd(llu_i2obdconn(dir), &lsm);
- out:
-        ptlrpc_req_finished(request);
-        return rc;
-}
-#endif
-
 static int llu_iop_link_raw(struct pnode *old, struct pnode *new)
 {
         struct inode *src = old->p_base->pb_ino;
@@ -1176,7 +1044,7 @@ static int llu_iop_rename_raw(struct pnode *old, struct pnode *new)
         RETURN(rc);
 }
 
-#if 0
+#ifdef _HAVE_STATVFS
 static int llu_statfs_internal(struct llu_sb_info *sbi,
                                struct obd_statfs *osfs,
                                unsigned long max_age)
@@ -1221,7 +1089,7 @@ static int llu_statfs_internal(struct llu_sb_info *sbi,
         RETURN(rc);
 }
 
-static int llu_statfs(struct llu_sb_info *sbi, struct kstatfs *sfs)
+static int llu_statfs(struct llu_sb_info *sbi, struct statfs *sfs)
 {
         struct obd_statfs osfs;
         int rc;
@@ -1284,7 +1152,7 @@ static int llu_iop_statvfs(struct pnode *pno,
 
         RETURN(0);
 }
-#endif
+#endif /* _HAVE_STATVFS */
 
 static int llu_iop_mkdir_raw(struct pnode *pno, mode_t mode)
 {
@@ -1449,7 +1317,6 @@ llu_fsswop_mount(const char *source,
                         GOTO(out_free, err = -EINVAL);
                 }
 
-                /* XXX */
                 /* generate a string unique to this super, let's try
                  the address of the super itself.*/
                 len = (sizeof(sbi) * 2) + 1; 
@@ -1460,7 +1327,7 @@ llu_fsswop_mount(const char *source,
 
                 cfg.cfg_instance = sbi->ll_instance;
                 cfg.cfg_uuid = sbi->ll_sb_uuid;
-                err = liblustre_process_log(&cfg);
+                err = liblustre_process_log(&cfg, 1);
                 if (err < 0) {
                         CERROR("Unable to process log: %s\n", g_zconf_profile);
 
@@ -1622,7 +1489,7 @@ static struct inode_ops llu_inode_ops = {
         inop_lookup:    llu_iop_lookup,
         inop_getattr:   llu_iop_getattr,
         inop_setattr:   llu_iop_setattr,
-        inop_getdirentries:     NULL,
+        inop_getdirentries:     llu_iop_getdirentries,
         inop_mkdir:     llu_iop_mkdir_raw,
         inop_rmdir:     llu_iop_rmdir_raw,
         inop_symlink:   llu_iop_symlink_raw,
@@ -1640,7 +1507,7 @@ static struct inode_ops llu_inode_ops = {
         inop_datasync:  llu_iop_datasync,
         inop_ioctl:     llu_iop_ioctl,
         inop_mknod:     llu_iop_mknod_raw,
-#if 0
+#ifdef _HAVE_STATVFS
         inop_statvfs:   llu_iop_statvfs,
 #endif
         inop_gone:      llu_iop_gone,
diff --git a/lustre/liblustre/tests/.cvsignore b/lustre/liblustre/tests/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lustre/liblustre/tests/Makefile.am b/lustre/liblustre/tests/Makefile.am
new file mode 100644 (file)
index 0000000..a99a4bb
--- /dev/null
@@ -0,0 +1,45 @@
+## Liblustre executables & libraries Makefile
+DEFS=
+
+CFLAGS := -g -Wall -I$(top_srcdir)/utils -I$(top_srcdir)/portals/include \
+          -I$(top_srcdir)/portals/unals -I$(SYSIO)/include \
+          -I/opt/lam/include -L/opt/lam/lib
+
+KFLAGS:=
+CPPFLAGS = $(HAVE_EFENCE) -D_LARGEFILE64_SOURCE=1
+LIBS = $(LIBEFENCE)
+
+
+LLIB_EXEC= ../liblustre.a -lpthread
+
+noinst_LIBRARIES = libtestcommon.a
+libtestcommon_a_SOURCES = test_common.c
+
+bin_PROGRAMS = echo_test sanity recovery_small replay_single test_lock_cancel \
+    replay_ost_single
+
+echo_test_SOURCES = echo_test.c  ../../utils/parser.c ../../utils/obd.c ../../utils/lustre_cfg.c
+echo_test_LDADD = ../liblsupport.a $(LIBREADLINE) -lpthread 
+echo_test_DEPENDENCIES=$(top_srcdir)/liblustre/liblsupport.a
+
+sanity_SOURCES = sanity.c
+sanity_LDADD := ./libtestcommon.a $(LLIB_EXEC)
+sanity_DEPENDENCIES = $(top_srcdir)/liblustre/liblustre.a ./libtestcommon.a
+
+recovery_small_SOURCES = recovery_small.c
+recovery_small_LDADD := ./libtestcommon.a $(LLIB_EXEC) 
+recovery_small_DEPENDENCIES = $(top_srcdir)/liblustre/liblustre.a
+
+replay_single_SOURCES = replay_single.c
+replay_single_LDADD := ./libtestcommon.a $(LLIB_EXEC)
+replay_single_DEPENDENCIES = $(top_srcdir)/liblustre/liblustre.a
+
+test_lock_cancel_SOURCES = test_lock_cancel.c
+test_lock_cancel_LDADD :=  $(LLIB_EXEC) -lmpi -llam
+
+replay_ost_single_SOURCES = replay_ost_single.c
+replay_ost_single_LDADD := ./libtestcommon.a $(LLIB_EXEC)
+replay_ost_single_DEPENDENCIES = $(top_srcdir)/liblustre/liblustre.a
+
+include $(top_srcdir)/Rules
+
diff --git a/lustre/liblustre/tests/echo_test.c b/lustre/liblustre/tests/echo_test.c
new file mode 100644 (file)
index 0000000..51bf60f
--- /dev/null
@@ -0,0 +1,368 @@
+#include <stdio.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+
+#include <portals/api-support.h> /* needed for ptlctl.h */
+#include <portals/ptlctl.h>    /* needed for parse_dump */
+
+
+#include <liblustre.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <procbridge.h>
+
+#define LIBLUSTRE_TEST 1
+#include "../utils/lctl.c"
+
+struct ldlm_namespace;
+struct ldlm_res_id;
+struct obd_import;
+
+void *inter_module_get(char *arg)
+{
+        if (!strcmp(arg, "tcpnal_ni"))
+                return &tcpnal_ni;
+        else if (!strcmp(arg, "ldlm_cli_cancel_unused"))
+                return ldlm_cli_cancel_unused;
+        else if (!strcmp(arg, "ldlm_namespace_cleanup"))
+                return ldlm_namespace_cleanup;
+        else if (!strcmp(arg, "ldlm_replay_locks"))
+                return ldlm_replay_locks;
+        else
+                return NULL;
+}
+
+/* XXX move to proper place */
+char *portals_nid2str(int nal, ptl_nid_t nid, char *str)
+{
+        switch(nal){
+        case TCPNAL:
+                /* userspace NAL */
+        case SOCKNAL:
+                sprintf(str, "%u:%d.%d.%d.%d", (__u32)(nid >> 32),
+                        HIPQUAD(nid));
+                break;
+        case QSWNAL:
+        case GMNAL:
+        case IBNAL:
+        case SCIMACNAL:
+                sprintf(str, "%u:%u", (__u32)(nid >> 32), (__u32)nid);
+                break;
+        default:
+                return NULL;
+        }
+        return str;
+}
+
+ptl_handle_ni_t         tcpnal_ni;
+
+struct pingcli_args {
+        ptl_nid_t mynid;
+        ptl_nid_t nid;
+       ptl_pid_t port;
+        int count;
+        int size;
+};
+
+struct task_struct *current;
+
+/* portals interfaces */
+ptl_handle_ni_t *
+kportal_get_ni (int nal)
+{
+        switch (nal)
+        {
+        case SOCKNAL:
+                return &tcpnal_ni;
+        default:
+                return NULL;
+        }
+}
+
+inline void
+kportal_put_ni (int nal)
+{
+        return;
+}
+
+int
+kportal_nal_cmd(struct portals_cfg *pcfg)
+{
+#if 0
+        __u32 nal = pcfg->pcfg_nal;
+        int rc = -EINVAL;
+
+        ENTRY;
+
+        down(&nal_cmd_sem);
+        if (nal > 0 && nal <= NAL_MAX_NR && nal_cmd[nal].nch_handler) {
+                CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal, 
+                       pcfg->pcfg_command);
+                rc = nal_cmd[nal].nch_handler(pcfg, nal_cmd[nal].nch_private);
+        }
+        up(&nal_cmd_sem);
+        RETURN(rc);
+#else
+        CERROR("empty function!!!\n");
+        return 0;
+#endif
+}
+
+int init_current(int argc, char **argv)
+{ 
+        current = malloc(sizeof(*current));
+        strncpy(current->comm, argv[0], sizeof(current->comm));
+        current->pid = getpid();
+       return 0;
+}
+
+ptl_nid_t tcpnal_mynid;
+
+int init_lib_portals()
+{
+        int rc;
+
+        PtlInit();
+        rc = PtlNIInit(procbridge_interface, 0, 0, 0, &tcpnal_ni);
+        if (rc != 0) {
+                CERROR("ksocknal: PtlNIInit failed: error %d\n", rc);
+                PtlFini();
+                RETURN (rc);
+        }
+        PtlNIDebug(tcpnal_ni, ~0);
+        return rc;
+}
+
+extern int class_handle_ioctl(unsigned int cmd, unsigned long arg);
+
+int liblustre_ioctl(int dev_id, int opc, void *ptr)
+{
+       int   rc = -EINVAL;
+       
+       switch (dev_id) {
+       default:
+               fprintf(stderr, "Unexpected device id %d\n", dev_id);
+               abort();
+               break;
+               
+       case OBD_DEV_ID:
+               rc = class_handle_ioctl(opc, (unsigned long)ptr);
+               break;
+       }
+
+       return rc;
+}
+
+static void generate_random_uuid(unsigned char uuid_out[16])
+{
+        int *arr = (int*)uuid_out;
+        int i;
+
+        /* note: uuid_out decays to a pointer here, so use the explicit length */
+        for (i = 0; i < 16 / sizeof(int); i++)
+                arr[i] = rand();
+}
+
+static char *echo_server_nid = NULL;
+static char *echo_server_ostname = "obd1";
+static char *osc_dev_name = "OSC_DEV_NAME";
+static char *echo_dev_name = "ECHO_CLIENT_DEV_NAME";
+
+static int connect_echo_client(void)
+{
+       struct lustre_cfg lcfg;
+       ptl_nid_t nid;
+       char *peer = "ECHO_PEER_NID";
+       class_uuid_t osc_uuid, echo_uuid;
+       struct obd_uuid osc_uuid_str, echo_uuid_str;
+       int nal, err;
+       ENTRY;
+
+        generate_random_uuid(osc_uuid);
+        class_uuid_unparse(osc_uuid, &osc_uuid_str);
+        generate_random_uuid(echo_uuid);
+        class_uuid_unparse(echo_uuid, &echo_uuid_str);
+
+        if (ptl_parse_nid(&nid, echo_server_nid)) {
+                CERROR("Can't parse NID %s\n", echo_server_nid);
+                RETURN(-EINVAL);
+        }
+        nal = ptl_name2nal("tcp");
+        if (nal <= 0) {
+                CERROR("Can't parse NAL tcp\n");
+                RETURN(-EINVAL);
+        }
+
+       /* add uuid */
+        LCFG_INIT(lcfg, LCFG_ADD_UUID, NULL);
+        lcfg.lcfg_nid = nid;
+        lcfg.lcfg_inllen1 = strlen(peer) + 1;
+        lcfg.lcfg_inlbuf1 = peer;
+        lcfg.lcfg_nal = nal;
+        err = class_process_config(&lcfg);
+        if (err < 0) {
+               CERROR("failed add_uuid\n");
+                RETURN(-EINVAL);
+       }
+
+       /* attach osc */
+        LCFG_INIT(lcfg, LCFG_ATTACH, osc_dev_name);
+        lcfg.lcfg_inlbuf1 = "osc";
+        lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1;
+        lcfg.lcfg_inlbuf2 = osc_uuid_str.uuid;
+        lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1;
+        err = class_process_config(&lcfg);
+        if (err < 0) {
+               CERROR("failed attach osc\n");
+                RETURN(-EINVAL);
+       }
+
+       /* setup osc */
+        LCFG_INIT(lcfg, LCFG_SETUP, osc_dev_name);
+        lcfg.lcfg_inlbuf1 = echo_server_ostname;
+        lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1;
+        lcfg.lcfg_inlbuf2 = peer;
+        lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1;
+        err = class_process_config(&lcfg);
+        if (err < 0) {
+               CERROR("failed setup osc\n");
+                RETURN(-EINVAL);
+       }
+
+       /* attach echo_client */
+        LCFG_INIT(lcfg, LCFG_ATTACH, echo_dev_name);
+        lcfg.lcfg_inlbuf1 = "echo_client";
+        lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1;
+        lcfg.lcfg_inlbuf2 = echo_uuid_str.uuid;
+        lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1;
+        err = class_process_config(&lcfg);
+        if (err < 0) {
+               CERROR("failed attach echo_client\n");
+                RETURN(-EINVAL);
+       }
+
+       /* setup echo_client */
+        LCFG_INIT(lcfg, LCFG_SETUP, echo_dev_name);
+        lcfg.lcfg_inlbuf1 = osc_dev_name;
+        lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1;
+        lcfg.lcfg_inlbuf2 = NULL;
+        lcfg.lcfg_inllen2 = 0;
+        err = class_process_config(&lcfg);
+        if (err < 0) {
+               CERROR("failed setup echo_client\n");
+                RETURN(-EINVAL);
+       }
+
+       RETURN(0);
+}
+
+static int disconnect_echo_client(void)
+{
+       struct lustre_cfg lcfg;
+       int err;
+       ENTRY;
+
+       /* cleanup echo_client */
+        LCFG_INIT(lcfg, LCFG_CLEANUP, echo_dev_name);
+        err = class_process_config(&lcfg);
+        if (err < 0) {
+               CERROR("failed cleanup echo_client\n");
+                RETURN(-EINVAL);
+       }
+
+       /* detach echo_client */
+        LCFG_INIT(lcfg, LCFG_DETACH, echo_dev_name);
+        err = class_process_config(&lcfg);
+        if (err < 0) {
+               CERROR("failed detach echo_client\n");
+                RETURN(-EINVAL);
+       }
+
+       /* cleanup osc */
+        LCFG_INIT(lcfg, LCFG_CLEANUP, osc_dev_name);
+        err = class_process_config(&lcfg);
+        if (err < 0) {
+               CERROR("failed cleanup osc device\n");
+                RETURN(-EINVAL);
+       }
+
+       /* detach osc */
+        LCFG_INIT(lcfg, LCFG_DETACH, osc_dev_name);
+        err = class_process_config(&lcfg);
+        if (err < 0) {
+               CERROR("failed detach osc device\n");
+                RETURN(-EINVAL);
+       }
+
+       RETURN(0);
+}
+
+static void usage(const char *s)
+{
+       printf("Usage: %s -s ost_host_name [-n ost_name]\n", s);
+       printf("    ost_host_name: the host name of the echo server\n");
+       printf("    ost_name: the OST name, default is \"obd1\"\n");
+}
+
+extern int time_ptlwait1;
+extern int time_ptlwait2;
+extern int time_ptlselect;
+
+int main(int argc, char **argv) 
+{
+       int c, rc;
+
+       while ((c = getopt(argc, argv, "s:n:")) != -1) {
+               switch (c) {
+               case 's':
+                       echo_server_nid = optarg;
+                       break;
+               case 'n':
+                       echo_server_ostname = optarg;
+                       break;
+               default:
+                       usage(argv[0]);
+                       return 1;
+               }
+       }
+
+        if (optind != argc)
+                usage(argv[0]);
+
+       if (!echo_server_nid) {
+               usage(argv[0]);
+               return 1;
+       }
+
+        srand(time(NULL));
+
+       tcpnal_mynid = rand();
+#if 1
+       portal_debug = 0;
+       portal_subsystem_debug = 0;
+#endif
+
+        if (init_current(argc, argv) ||
+           init_obdclass() || init_lib_portals() ||
+           ptlrpc_init() ||
+           ldlm_init() ||
+           mdc_init() ||
+           lov_init() ||
+           osc_init() ||
+           echo_client_init()) {
+               printf("error\n");
+               return 1;
+       }
+
+       rc = connect_echo_client();
+       if (rc)
+               return rc;
+
+       set_ioc_handler(liblustre_ioctl);
+
+       rc = lctl_main(1, &argv[0]);
+
+       rc |= disconnect_echo_client();
+
+       return rc;
+}
similarity index 99%
rename from lustre/liblustre/recovery_small.c
rename to lustre/liblustre/tests/recovery_small.c
index b1292c2..5aed06c 100644 (file)
@@ -353,6 +353,8 @@ int main(int argc, char * argv[])
                 exit(-1);
         }
 
+        setenv(ENV_LUSTRE_TIMEOUT, "10", 1);
+
         __liblustre_setup_();
 
         while (drop_arr[drop_index].name) {
diff --git a/lustre/liblustre/tests/replay_ost_single.c b/lustre/liblustre/tests/replay_ost_single.c
new file mode 100644 (file)
index 0000000..2897807
--- /dev/null
@@ -0,0 +1,338 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Lustre Light user test program
+ *
+ *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define _BSD_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/queue.h>
+#include <signal.h>
+
+#include <sysio.h>
+#include <mount.h>
+
+#include "test_common.h"
+
+
+
+static char mds_server[1024] = {0,};
+static char barrier_script[1024] = {0,};
+static char failover_script[1024] = {0,};
+static char barrier_cmd[1024] = {0,};
+static char failover_cmd[1024] = {0,};
+
+static void replay_barrier()
+{
+        int rc;
+
+        if ((rc = system(barrier_cmd))) {
+                printf("execute barrier error: %d\n", rc);
+                exit(rc);
+        }
+}
+
+static void mds_failover()
+{
+        int rc;
+
+        if ((rc = system(failover_cmd))) {
+                printf("execute failover error: %d\n", rc);
+                exit(rc);
+        }
+}
+
+
+#define ENTRY(str)                                                      \
+        do {                                                            \
+                char buf[100];                                          \
+                int len;                                                \
+                sprintf(buf, "===== START: %s ", (str));                \
+                len = strlen(buf);                                      \
+                if (len < 79) {                                         \
+                        memset(buf+len, '=', 100-len);                  \
+                        buf[79] = '\n';                                 \
+                        buf[80] = 0;                                    \
+                }                                                       \
+                printf("%s", buf);                                      \
+        } while (0)
+
+#define LEAVE()                                                         \
+        do {                                                            \
+                printf("----- END TEST successfully ---");              \
+                printf("-----------------------------");                \
+                printf("-------------------\n");                        \
+        } while (0)
+
+void t0()
+{
+        const int bufsize = 4096;
+        char *path = "/mnt/lustre/rp_ost_t0_file";
+        char buf[bufsize];
+        int fd, i, j, rc;
+        ENTRY("open-failover-write-verification (no ping involved)");
+
+        printf("create/open file...\n");
+        t_touch(path);
+        fd = t_open(path);
+        printf("OST failover...\n");
+        replay_barrier();
+        mds_failover();
+
+        printf("write file...\n");
+        for (i = 0; i < 20; i++) {
+                memset(buf, i, bufsize);
+                if ((rc = write(fd, buf, bufsize)) != bufsize) {
+                        perror("write error after failover");
+                        printf("i = %d, rc = %d\n", i, rc);
+                        exit(-1);
+                }
+        }
+
+        /* verify */
+        printf("read & verify...\n");
+        lseek(fd, 0, SEEK_SET);
+        for (i = 0; i < 20; i++) {
+                memset(buf, -1, bufsize);
+                if ((rc = read(fd, buf, bufsize)) != bufsize) {
+                        perror("read error rc");
+                        printf("i = %d, rc = %d\n", i, rc);
+                        exit(-1);
+                }
+                for (j = 0; j < bufsize; j++) {
+                        if (buf[j] != i) {
+                                printf("verify error!\n");
+                                exit(-1);
+                        }
+                }
+        }
+        t_close(fd);
+        t_unlink(path);
+        LEAVE();
+}
+
+void t1()
+{
+        const int bufsize = 4096;
+        char *path = "/mnt/lustre/rp_ost_t1_file";
+        char buf[bufsize];
+        int fd, i, j;
+        ENTRY("open-write-close-open-failover-read (no ping involved)");
+
+        printf("create/open file...\n");
+        t_touch(path);
+        fd = t_open(path);
+        printf("write file...\n");
+        for (i = 0; i < 20; i++) {
+                memset(buf, i, bufsize);
+                if (write(fd, buf, bufsize) != bufsize) {
+                        perror("write error");
+                        exit(-1);
+                }
+        }
+        printf("close/reopen...\n");
+        t_close(fd);
+        fd = t_open(path);
+        lseek(fd, 0, SEEK_SET);
+
+        printf("OST failover...\n");
+        replay_barrier();
+        mds_failover();
+
+        printf("read & verify...\n");
+        for (i = 0; i < 20; i++) {
+                memset(buf, -1, bufsize);
+                if (read(fd, buf, bufsize) != bufsize) {
+                        perror("read error after failover");
+                        exit(-1);
+                }
+                for (j = 0; j < bufsize; j++) {
+                        if (buf[j] != i) {
+                                printf("verify error after failover\n");
+                                exit(-1);
+                        }
+                }
+        }
+
+        t_close(fd);
+        t_unlink(path);
+        LEAVE();
+}
+
+void t2()
+{
+        char *path = "/mnt/lustre/rp_ost_t2_file";
+        char *str = "xxxxjoiwlsdf98lsjdfsjfoajflsjfajfoaidfojaj08eorje;";
+        ENTRY("empty replay");
+
+        replay_barrier();
+        mds_failover();
+
+        t_echo_create(path, str);
+        t_grep(path, str);
+        t_unlink(path);
+}
+
+void t3()
+{
+        char *path = "/mnt/lustre/rp_ost_t3_file";
+        char *str = "xxxxjoiwlsdf98lsjdfsjfoajflsjfajfoaidfojaj08eorje;";
+        ENTRY("touch");
+
+        printf("touch to create a file\n");
+        t_echo_create(path, str);
+        replay_barrier();
+        mds_failover();
+
+        printf("read & verify\n");
+        t_grep(path, str);
+        t_unlink(path);
+        /* XXX there is a problem without this sleep; it seems to be a server-side problem XXX */
+        sleep(5);
+}
+
+void t4()
+{
+        char *path = "/mnt/lustre/rp_ost_t4_file";
+        char namebuf[1024];
+        char str[1024];
+        int count = 10, i;
+        ENTRY("|X| 10 open(CREAT)s (ping involved)");
+
+        printf("create %d files\n", count);
+        for (i = 0; i < count; i++) {
+                sprintf(namebuf, "%s%02d", path, i);
+                sprintf(str, "%s-%08d-%08x-AAAAA", "content", i, i);
+                t_echo_create(namebuf, str);
+        }
+        replay_barrier();
+        mds_failover();
+
+        printf("read & verify\n");
+        for (i = 0; i < count; i++) {
+                sprintf(namebuf, "%s%02d", path, i);
+                sprintf(str, "%s-%08d-%08x-AAAAA", "content", i, i);
+                t_grep(namebuf, str);
+                t_unlink(namebuf);
+        }
+}
+
+extern int portal_debug;
+extern int portal_subsystem_debug;
+
+extern void __liblustre_setup_(void);
+extern void __liblustre_cleanup_(void);
+
+void usage(const char *cmd)
+{
+        printf("Usage: \t%s --target mdsnid:/mdsname/profile -s ost_hostname "
+                "-b \"barrier cmd\" -f \"failover cmd\"\n", cmd);
+        printf("       \t%s --dumpfile dumpfile -s ost_hostname -b \"barrier cmd\" "
+                "-f \"failover cmd\"\n", cmd);
+        exit(-1);
+}
+
+void test_ssh()
+{
+        char cmd[1024];
+
+        sprintf(cmd, "ssh %s cat /dev/null", mds_server);
+        if (system(cmd)) {
+                printf("ssh can't access server node: %s\n", mds_server);
+                exit(-1);
+        }
+}
+
+int main(int argc, char * const argv[])
+{
+        int opt_index, c;
+        static struct option long_opts[] = {
+                {"target", 1, 0, 0},
+                {"dumpfile", 1, 0, 0},
+                {0, 0, 0, 0}
+        };
+
+        if (argc < 4)
+                usage(argv[0]);
+
+        while ((c = getopt_long(argc, argv, "s:b:f:", long_opts, &opt_index)) != -1) {
+                switch (c) {
+                case 0: {
+                        if (!optarg[0])
+                                usage(argv[0]);
+
+                        if (!strcmp(long_opts[opt_index].name, "target")) {
+                                setenv(ENV_LUSTRE_MNTTGT, optarg, 1);
+                        } else if (!strcmp(long_opts[opt_index].name, "dumpfile")) {
+                                setenv(ENV_LUSTRE_DUMPFILE, optarg, 1);
+                        } else
+                                usage(argv[0]);
+                        break;
+                }
+                case 's':
+                        strcpy(mds_server, optarg);
+                        break;
+                case 'b':
+                        strcpy(barrier_script, optarg);
+                        break;
+                case 'f':
+                        strcpy(failover_script, optarg);
+                        break;
+                default:
+                        usage(argv[0]);
+                }
+        }
+
+        if (optind != argc)
+                usage(argv[0]);
+        if (!strlen(mds_server) || !strlen(barrier_script) ||
+            !strlen(failover_script))
+                usage(argv[0]);
+
+        test_ssh();
+
+        /* prepare remote command */
+        sprintf(barrier_cmd, "ssh %s \"%s\"", mds_server, barrier_script);
+        sprintf(failover_cmd, "ssh %s \"%s\"", mds_server, failover_script);
+
+        setenv(ENV_LUSTRE_TIMEOUT, "5", 1);
+
+        __liblustre_setup_();
+
+        t0();
+        t1();
+        t2();
+        t3();
+        t4();
+
+       printf("liblustre is about shutdown\n");
+        __liblustre_cleanup_();
+
+       printf("complete successfully\n");
+       return 0;
+}
old mode 100755 (executable)
new mode 100644 (file)
similarity index 99%
rename from lustre/liblustre/replay_single.c
rename to lustre/liblustre/tests/replay_single.c
index 1602a7c..6645056
@@ -384,6 +384,8 @@ int main(int argc, char * const argv[])
         sprintf(barrier_cmd, "ssh %s \"%s\"", mds_server, barrier_script);
         sprintf(failover_cmd, "ssh %s \"%s\"", mds_server, failover_script);
 
+        setenv(ENV_LUSTRE_TIMEOUT, "10", 1);
+
         __liblustre_setup_();
 
         t0();
similarity index 81%
rename from lustre/liblustre/lltest.c
rename to lustre/liblustre/tests/sanity.c
index ac6e8ad..391dd3d 100644 (file)
@@ -180,7 +180,7 @@ static void pages_io(int xfer, loff_t pos)
 void t5()
 {
         char text[256];
-        loff_t off_array[] = {1, 17, 255, 257, 4095, 4097, 8191, 1024*1024*1024};
+        loff_t off_array[] = {1, 4, 17, 255, 258, 4095, 4097, 8191, 1024*1024*1024};
         int np = 1, i;
         loff_t offset = 0;
 
@@ -280,7 +280,7 @@ void t10()
         LEAVE();
 }
 
-void t100()
+void t11()
 {
         char *base="/mnt/lustre";
         char path[4096], path2[4096];
@@ -316,6 +316,91 @@ void t100()
         LEAVE();
 }
 
+void t12()
+{
+        char *dir="/mnt/lustre/test_t12_dir";
+        char buf[1024*128];
+        int fd;
+        ENTRY("empty directory readdir");
+
+        t_mkdir(dir);
+        fd = t_open(dir);
+        t_ls(fd, buf, sizeof(buf));
+        t_close(fd);
+        t_rmdir(dir);
+        LEAVE();
+}
+
+void t13()
+{
+        char *dir="/mnt/lustre/test_t13_dir/";
+        char name[1024];
+        char buf[1024];
+        const int nfiles = 20;
+        char *prefix = "test13_filename_prefix_";
+        int fd, i;
+        ENTRY("multiple entries directory readdir");
+
+        t_mkdir(dir);
+        printf("Creating %d files...\n", nfiles);
+        for (i = 0; i < nfiles; i++) {
+                sprintf(name, "%s%s%05d", dir, prefix, i);
+                t_touch(name);
+        }
+        fd = t_open(dir);
+        t_ls(fd, buf, sizeof(buf));
+        t_close(fd);
+        printf("Cleanup...\n");
+        for (i = 0; i < nfiles; i++) {
+                sprintf(name, "%s%s%05d", dir, prefix, i);
+                t_unlink(name);
+        }
+        t_rmdir(dir);
+        LEAVE();
+}
+
+void t14()
+{
+        char *dir="/mnt/lustre/test_t14_dir/";
+        char name[1024];
+        char buf[1024];
+        const int nfiles = 256;
+        char *prefix = "test14_filename_long_prefix_AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA___";
+        int fd, i;
+        ENTRY(">1 block(4k) directory readdir");
+
+        t_mkdir(dir);
+        printf("Creating %d files...\n", nfiles);
+        for (i = 0; i < nfiles; i++) {
+                sprintf(name, "%s%s%05d", dir, prefix, i);
+                t_touch(name);
+        }
+        fd = t_open(dir);
+        t_ls(fd, buf, sizeof(buf));
+        t_close(fd);
+        printf("Cleanup...\n");
+        for (i = 0; i < nfiles; i++) {
+                sprintf(name, "%s%s%05d", dir, prefix, i);
+                t_unlink(name);
+        }
+        t_rmdir(dir);
+        LEAVE();
+}
+
+void t15()
+{
+        char *file = "/mnt/lustre/test_t15_file";
+        int fd;
+        ENTRY("open-stat-close");
+
+        t_touch(file);
+        fd = t_open(file);
+        t_check_stat(file, NULL);
+        t_close(fd);
+        t_unlink(file);
+        LEAVE();
+}
+
 extern void __liblustre_setup_(void);
 extern void __liblustre_cleanup_(void);
 
@@ -341,7 +426,6 @@ int main(int argc, char * const argv[])
         while ((c = getopt_long(argc, argv, "", long_opts, &opt_index)) != -1) {
                 switch (c) {
                 case 0: {
-                        printf("optindex %d\n", opt_index);
                         if (!optarg[0])
                                 usage(argv[0]);
 
@@ -374,8 +458,11 @@ int main(int argc, char * const argv[])
         t8();
         t9();
         t10();
-
-        t100();
+        t11();
+        t12();
+        t13();
+        t14();
+        t15();
 #endif
 
        printf("liblustre is about shutdown\n");
similarity index 91%
rename from lustre/liblustre/test_common.c
rename to lustre/liblustre/tests/test_common.c
index 210d57e..6f6676e 100644 (file)
@@ -6,6 +6,7 @@
 #include <fcntl.h>
 #include <string.h>
 #include <errno.h>
+#include <dirent.h>
 
 #include "test_common.h"
 
@@ -243,7 +244,7 @@ void t_echo_create(const char *path, const char *str)
         }
 }
 
-void _t_grep(const char *path, char *str, int should_contain)
+static void _t_grep(const char *path, char *str, int should_contain)
 {
        char buf[1024];
        int fd;
@@ -278,3 +279,25 @@ void t_grep_v(const char *path, char *str)
 {
        _t_grep(path, str, 0);
 }
+
+void t_ls(int fd, char *buf, int size)
+{
+       struct dirent64 *ent;
+       int rc, pos;
+       loff_t base = 0;
+
+       printf("dir entries listing...\n");
+       while ((rc = getdirentries64(fd, buf, size, &base)) > 0) {
+               pos = 0;
+               while (pos < rc) {
+                       ent = (struct dirent64 *) ((char*) buf + pos);
+                       printf("%s\n", ent->d_name);
+                       pos += ent->d_reclen;
+               }
+       }
+
+       if (rc < 0) {
+               printf("getdents error %d\n", rc);
+               EXIT(-1);
+       }
+}
similarity index 91%
rename from lustre/liblustre/test_common.h
rename to lustre/liblustre/tests/test_common.h
index af638f2..9d537cc 100644 (file)
@@ -3,6 +3,7 @@
 
 #define ENV_LUSTRE_MNTPNT               "LIBLUSTRE_MOUNT_POINT"
 #define ENV_LUSTRE_MNTTGT               "LIBLUSTRE_MOUNT_TARGET"
+#define ENV_LUSTRE_TIMEOUT              "LIBLUSTRE_TIMEOUT"
 #define ENV_LUSTRE_DUMPFILE             "LIBLUSTRE_DUMPFILE"
 
 extern int exit_on_err;
@@ -24,8 +25,8 @@ void t_close(int fd);
 int t_check_stat(const char *name, struct stat *buf);
 int t_check_stat_fail(const char *name);
 void t_echo_create(const char *path, const char *str);
-//int t_pread_once(const char *path, char *buf, size_t size, off_t offset);
 void t_grep(const char *path, char *str);
 void t_grep_v(const char *path, char *str);
+void t_ls(int fd, char *buf, int size);
 
 #endif
index 21555c2..e5801a0 100644 (file)
@@ -542,7 +542,7 @@ int lustre_fill_super(struct super_block *sb, void *data, int silent)
                 cfg.cfg_instance = sbi->ll_instance;
                 cfg.cfg_uuid = sbi->ll_sb_uuid;
                 cfg.cfg_local_nid = lmd->lmd_local_nid;
-                err = lustre_process_log(lmd, lmd->lmd_profile, &cfg, 1);
+                err = lustre_process_log(lmd, lmd->lmd_profile, &cfg, 0);
                 if (err < 0) {
                         CERROR("Unable to process log: %s\n", lmd->lmd_profile);
 
index d6d35d8..468d064 100644 (file)
@@ -6,8 +6,9 @@
 DEFS=
 
 if LIBLUSTRE
-lib_LIBRARIES = liblov.a
+noinst_LIBRARIES = liblov.a
 liblov_a_SOURCES = lov_log.c lov_obd.c lov_pack.c lov_internal.h
+liblov_a_CFLAGS = -fPIC
 else
 MODULE = lov
 modulefs_DATA = lov.o
index 3006306..1fd7dd1 100644 (file)
@@ -14,8 +14,9 @@ endif
 
 
 if LIBLUSTRE
-lib_LIBRARIES = liblvfs.a
+noinst_LIBRARIES = liblvfs.a
 liblvfs_a_SOURCES = lvfs_userfs.c
+liblvfs_a_CFLAGS = -fPIC
 
 #if MYSQL
 #liblvfs_a_SOURCES += lvfs_user_mysql.c
index c254e76..0200532 100644 (file)
@@ -6,8 +6,9 @@
 DEFS=
 
 if LIBLUSTRE
-lib_LIBRARIES = libmdc.a
+noinst_LIBRARIES = libmdc.a
 libmdc_a_SOURCES = mdc_request.c mdc_reint.c mdc_lib.c mdc_internal.h mdc_locks.c
+libmdc_a_CFLAGS = -fPIC
 else
 MODULE = mdc
 modulefs_DATA = mdc.o
index bcac2e3..8ceb655 100644 (file)
@@ -550,14 +550,12 @@ int mdc_readpage(struct obd_export *exp, struct ll_fid *mdc_fid, __u64 offset,
         /* XXX FIXME bug 249 */
         req->rq_request_portal = MDS_READPAGE_PORTAL;
 
-        desc = ptlrpc_prep_bulk_imp(req, BULK_PUT_SINK, MDS_BULK_PORTAL);
+        desc = ptlrpc_prep_bulk_imp(req, 1, BULK_PUT_SINK, MDS_BULK_PORTAL);
         if (desc == NULL)
                 GOTO(out, rc = -ENOMEM);
         /* NB req now owns desc and will free it when it gets freed */
 
-        rc = ptlrpc_prep_bulk_page(desc, page, 0, PAGE_CACHE_SIZE);
-        if (rc != 0)
-                GOTO(out, rc);
+        ptlrpc_prep_bulk_page(desc, page, 0, PAGE_CACHE_SIZE);
 
         mdc_readdir_pack(req, offset, PAGE_CACHE_SIZE, mdc_fid);
 
@@ -565,13 +563,20 @@ int mdc_readpage(struct obd_export *exp, struct ll_fid *mdc_fid, __u64 offset,
         rc = ptlrpc_queue_wait(req);
 
         if (rc == 0) {
-                LASSERT(desc->bd_page_count == 1);
-                body = lustre_swab_repbuf(req, 0, sizeof(*body),
+                body = lustre_swab_repbuf(req, 0, sizeof (*body),
                                           lustre_swab_mds_body);
                 if (body == NULL) {
                         CERROR("Can't unpack mds_body\n");
                         GOTO(out, rc = -EPROTO);
                 }
+
+                if (req->rq_bulk->bd_nob_transferred != PAGE_CACHE_SIZE) {
+                        CERROR ("Unexpected # bytes transferred: %d"
+                                " (%ld expected)\n",
+                                req->rq_bulk->bd_nob_transferred,
+                                PAGE_CACHE_SIZE);
+                        GOTO (out, rc = -EPROTO);
+                }
         }
 
         EXIT;
index c512293..fbb3a3c 100644 (file)
 static int mds_postsetup(struct obd_device *obd);
 static int mds_cleanup(struct obd_device *obd, int flags);
 
-static int mds_bulk_timeout(void *data)
-{
-        struct ptlrpc_bulk_desc *desc = data;
-        struct obd_export *exp = desc->bd_export;
-
-        DEBUG_REQ(D_ERROR, desc->bd_req,"bulk send timed out: evicting %s@%s\n",
-                  exp->exp_client_uuid.uuid,
-                  exp->exp_connection->c_remote_uuid.uuid);
-        ptlrpc_fail_export(exp);
-        ptlrpc_abort_bulk (desc);
-        RETURN(1);
-}
-
 /* Assumes caller has already pushed into the kernel filesystem context */
 static int mds_sendpage(struct ptlrpc_request *req, struct file *file,
                         loff_t offset, int count)
@@ -89,7 +76,7 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file,
         if (!pages)
                 GOTO(out, rc = -ENOMEM);
 
-        desc = ptlrpc_prep_bulk_exp (req, BULK_PUT_SOURCE, MDS_BULK_PORTAL);
+        desc = ptlrpc_prep_bulk_exp (req, 1, BULK_PUT_SOURCE, MDS_BULK_PORTAL);
         if (desc == NULL)
                 GOTO(out_free, rc = -ENOMEM);
 
@@ -100,9 +87,7 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file,
                 if (pages[i] == NULL)
                         GOTO(cleanup_buf, rc = -ENOMEM);
 
-                rc = ptlrpc_prep_bulk_page(desc, pages[i], 0, tmpsize);
-                if (rc != 0)
-                        GOTO(cleanup_buf, rc);
+                ptlrpc_prep_bulk_page(desc, pages[i], 0, tmpsize);
         }
 
         for (i = 0, tmpcount = count; i < npages; i++, tmpcount -= tmpsize) {
@@ -118,25 +103,41 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file,
                         GOTO(cleanup_buf, rc = -EIO);
         }
 
-        rc = ptlrpc_bulk_put(desc);
+        LASSERT(desc->bd_nob == count);
+
+        rc = ptlrpc_start_bulk_transfer(desc);
         if (rc)
                 GOTO(cleanup_buf, rc);
 
         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SENDPAGE)) {
                 CERROR("obd_fail_loc=%x, fail operation rc=%d\n",
                        OBD_FAIL_MDS_SENDPAGE, rc);
-                ptlrpc_abort_bulk(desc);
-                GOTO(cleanup_buf, rc);
+                GOTO(abort_bulk, rc);
         }
 
-        lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, mds_bulk_timeout, desc);
-        rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete (desc), &lwi);
-        if (rc) {
-                LASSERT (rc == -ETIMEDOUT);
-                GOTO(cleanup_buf, rc);
+        lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL);
+        rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc), &lwi);
+        LASSERT (rc == 0 || rc == -ETIMEDOUT);
+
+        if (rc == 0) {
+                if (desc->bd_success &&
+                    desc->bd_nob_transferred == count)
+                        GOTO(cleanup_buf, rc);
+
+                rc = -ETIMEDOUT; /* XXX should this be a different errno? */
         }
+
+        DEBUG_REQ(D_ERROR, req, "bulk failed: %s %d(%d), evicting %s@%s\n",
+                  (rc == -ETIMEDOUT) ? "timeout" : "network error",
+                  desc->bd_nob_transferred, count,
+                  req->rq_export->exp_client_uuid.uuid,
+                  req->rq_export->exp_connection->c_remote_uuid.uuid);
+
+        ptlrpc_fail_export(req->rq_export);
 
         EXIT;
+ abort_bulk:
+        ptlrpc_abort_bulk (desc);
  cleanup_buf:
         for (i = 0; i < npages; i++)
                 if (pages[i])
@@ -358,6 +359,21 @@ static int mds_disconnect(struct obd_export *export, int flags)
 
         ldlm_cancel_locks_for_export(export);
 
+        /* complete all outstanding replies */
+        spin_lock_irqsave (&export->exp_lock, irqflags);
+        while (!list_empty (&export->exp_outstanding_replies)) {
+                struct ptlrpc_reply_state *rs =
+                        list_entry (export->exp_outstanding_replies.next, 
+                                    struct ptlrpc_reply_state, rs_exp_list);
+                struct ptlrpc_service *svc = rs->rs_srv_ni->sni_service;
+
+                spin_lock (&svc->srv_lock);
+                list_del_init (&rs->rs_exp_list);
+                ptlrpc_schedule_difficult_reply (rs);
+                spin_unlock (&svc->srv_lock);
+        }
+        spin_unlock_irqrestore (&export->exp_lock, irqflags);
+
         spin_lock_irqsave(&export->exp_lock, irqflags);
         export->exp_flags = flags;
         spin_unlock_irqrestore(&export->exp_lock, irqflags);
@@ -1100,7 +1116,13 @@ int mds_handle(struct ptlrpc_request *req)
                 OBD_FAIL_RETURN(OBD_FAIL_MDS_READPAGE_NET, 0);
                 rc = mds_readpage(req);
 
-                OBD_FAIL_RETURN(OBD_FAIL_MDS_SENDPAGE, 0);
+                if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_MDS_SENDPAGE)) {
+                        if (req->rq_reply_state) {
+                                lustre_free_reply_state (req->rq_reply_state);
+                                req->rq_reply_state = NULL;
+                        }
+                        RETURN(0);
+                }
 
                 break;
 
@@ -1789,11 +1811,11 @@ static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf)
         int rc = 0;
         ENTRY;
 
-        mds->mds_service = ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS,
-                                           MDS_BUFSIZE, MDS_MAXREQSIZE,
-                                           MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL,
-                                           mds_handle, "mds", 
-                                           obddev->obd_proc_entry);
+        mds->mds_service = 
+                ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE,
+                                MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL,
+                                mds_handle, "mds",
+                                obddev->obd_proc_entry);
 
         if (!mds->mds_service) {
                 CERROR("failed to start service\n");
@@ -1806,8 +1828,7 @@ static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf)
                 GOTO(err_thread, rc);
 
         mds->mds_setattr_service =
-                ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS,
-                                MDS_BUFSIZE, MDS_MAXREQSIZE,
+                ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE,
                                 MDS_SETATTR_PORTAL, MDC_REPLY_PORTAL,
                                 mds_handle, "mds_setattr", 
                                 obddev->obd_proc_entry);
@@ -1822,8 +1843,7 @@ static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf)
                 GOTO(err_thread2, rc);
                         
         mds->mds_readpage_service =
-                ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS,
-                                MDS_BUFSIZE, MDS_MAXREQSIZE,
+                ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE,
                                 MDS_READPAGE_PORTAL, MDC_REPLY_PORTAL,
                                 mds_handle, "mds_readpage", 
                                 obddev->obd_proc_entry);
index 48d739c..a6bba27 100644 (file)
@@ -52,7 +52,8 @@ int mds_cleanup_orphans(struct obd_device *obd);
 
 /* mds/mds_log.c */
 int mds_log_op_unlink(struct obd_device *obd, struct inode *inode, 
-                      struct lustre_msg *repmsg, int offset);
+                      struct lov_mds_md *lmm, int lmm_size,
+                      struct llog_cookie *logcookies, int cookies_size);
 int mds_llog_init(struct obd_device *obd, struct obd_device *tgt, int count, 
                   struct llog_logid *logid);
 int mds_llog_finish(struct obd_device *obd, int count);
index 549c760..c4d5690 100644 (file)
@@ -82,7 +82,8 @@ static int mds_llog_repl_cancel(struct llog_ctxt *ctxt, struct lov_stripe_md *ls
 }
 
 int mds_log_op_unlink(struct obd_device *obd, struct inode *inode,
-                      struct lustre_msg *repmsg, int offset)
+                      struct lov_mds_md *lmm, int lmm_size,
+                      struct llog_cookie *logcookies, int cookies_size)
 {
         struct mds_obd *mds = &obd->u.mds;
         struct lov_stripe_md *lsm = NULL;
@@ -94,14 +95,13 @@ int mds_log_op_unlink(struct obd_device *obd, struct inode *inode,
                 RETURN(PTR_ERR(mds->mds_osc_obd));
 
         rc = obd_unpackmd(mds->mds_osc_exp, &lsm,
-                          lustre_msg_buf(repmsg, offset, 0),
-                          repmsg->buflens[offset]);
+                          lmm, lmm_size);
         if (rc < 0)
                 RETURN(rc);
 
         ctxt = llog_get_context(obd, LLOG_UNLINK_ORIG_CTXT);
-        rc = llog_add(ctxt, NULL, lsm, lustre_msg_buf(repmsg, offset + 1, 0),
-                      repmsg->buflens[offset + 1] / sizeof(struct llog_cookie));
+        rc = llog_add(ctxt, NULL, lsm, logcookies,
+                      cookies_size / sizeof(struct llog_cookie));
 
         obd_free_memmd(mds->mds_osc_exp, &lsm);
 
@@ -123,7 +123,7 @@ int mds_llog_init(struct obd_device *obd, struct obd_device *tgt,
         struct obd_device *lov_obd = obd->u.mds.mds_osc_obd;
         int rc;
         ENTRY;
-        
+
         rc = llog_setup(obd, LLOG_UNLINK_ORIG_CTXT, tgt, 0, NULL,
                         &mds_unlink_orig_logops);
         if (rc)
@@ -134,9 +134,9 @@ int mds_llog_init(struct obd_device *obd, struct obd_device *tgt,
         if (rc)
                 RETURN(rc);
 
-        rc = obd_llog_init(lov_obd, tgt, count, logid);        
-        if (rc) 
-                CERROR("error lov_llog_init\n"); 
+        rc = obd_llog_init(lov_obd, tgt, count, logid);
+        if (rc)
+                CERROR("error lov_llog_init\n");
 
         RETURN(rc);
 }
@@ -146,7 +146,7 @@ int mds_llog_finish(struct obd_device *obd, int count)
         struct obd_device *lov_obd = obd->u.mds.mds_osc_obd;
         int rc;
         ENTRY;
-        
+
         rc = llog_cleanup(llog_get_context(obd, LLOG_UNLINK_ORIG_CTXT));
         if (rc)
                 RETURN(rc);
@@ -155,9 +155,9 @@ int mds_llog_finish(struct obd_device *obd, int count)
         if (rc)
                 RETURN(rc);
 
-        rc = obd_llog_finish(lov_obd, count);        
-        if (rc) 
-                CERROR("error lov_llog_finish\n"); 
+        rc = obd_llog_finish(lov_obd, count);
+        if (rc)
+                CERROR("error lov_llog_finish\n");
 
         RETURN(rc);
 }
index 80728da..88724c0 100644 (file)
@@ -455,7 +455,6 @@ static void reconstruct_open(struct mds_update_record *rec, int offset,
                              struct ptlrpc_request *req,
                              struct lustre_handle *child_lockh)
 {
-        struct ptlrpc_request *oldreq = req->rq_export->exp_outstanding_reply;
         struct mds_export_data *med = &req->rq_export->exp_mds_data;
         struct mds_client_data *mcd = med->med_mcd;
         struct mds_obd *mds = mds_req2mds(req);
@@ -553,10 +552,11 @@ static void reconstruct_open(struct mds_update_record *rec, int offset,
                 mfd = NULL;
         }
 
-        if (oldreq != NULL) {
-                /* if we're not recovering, it had better be found */
-                LASSERT(mfd != NULL);
-        } else if (mfd == NULL) {
+#warning "XXX fixme"
+        /* Here we used to LASSERT(mfd != NULL) when exp_outstanding_reply was
+         * non-NULL.  Now that the outstanding replies are kept in a list, we
+         * simply use mfd != NULL to detect a re-open. */
+        if (mfd == NULL) {
                 mntget(mds->mds_vfsmnt);
                 CERROR("Re-opened file \n");
                 mfd = mds_dentry_open(child, mds->mds_vfsmnt,
@@ -969,7 +969,7 @@ int mds_open(struct mds_update_record *rec, int offset,
                 if (rc)
                         ldlm_lock_decref(&parent_lockh, parent_mode);
                 else
-                        ldlm_put_lock_into_req(req, &parent_lockh, parent_mode);
+                        ptlrpc_save_lock (req, &parent_lockh, parent_mode);
         }
         if (rc == 0)
                 atomic_inc(&mds->mds_open_count);
@@ -1048,7 +1048,10 @@ int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd,
                 if (req != NULL &&
                     (reply_body->valid & OBD_MD_FLEASIZE) &&
                     mds_log_op_unlink(obd, pending_child->d_inode,
-                                      req->rq_repmsg, 1) > 0) {
+                                lustre_msg_buf(req->rq_repmsg, 1, 0),
+                                req->rq_repmsg->buflens[1],
+                                lustre_msg_buf(req->rq_repmsg, 2, 0),
+                                req->rq_repmsg->buflens[2]) > 0) {
                         reply_body->valid |= OBD_MD_FLCOOKIE;
                 }
 
index dcacdcf..b44dc22 100644 (file)
@@ -270,21 +270,53 @@ int mds_fix_attr(struct inode *inode, struct mds_update_record *rec)
         RETURN(0);
 }
 
-void mds_steal_ack_locks(struct obd_export *exp, struct ptlrpc_request *req)
+void mds_steal_ack_locks(struct ptlrpc_request *req)
 {
-        unsigned long flags;
-        struct ptlrpc_request *oldrep = exp->exp_outstanding_reply;
-
-        if (oldrep == NULL)
+        struct obd_export         *exp = req->rq_export;
+        struct list_head          *tmp;
+        struct ptlrpc_reply_state *oldrep;
+        struct ptlrpc_service     *svc;
+        unsigned long              flags;
+        int                        i;
+
+        /* CAVEAT EMPTOR: spinlock order */
+        spin_lock_irqsave (&exp->exp_lock, flags);
+        list_for_each (tmp, &exp->exp_outstanding_replies) {
+                oldrep = list_entry(tmp, struct ptlrpc_reply_state,rs_exp_list);
+
+                if (oldrep->rs_xid != req->rq_xid)
+                        continue;
+
+                if (oldrep->rs_msg.opc != req->rq_reqmsg->opc)
+                        CERROR ("Resent req xid "LPX64" has mismatched opc: "
+                                "new %d old %d\n", req->rq_xid,
+                                req->rq_reqmsg->opc, oldrep->rs_msg.opc);
+
+                svc = oldrep->rs_srv_ni->sni_service;
+                spin_lock (&svc->srv_lock);
+
+                list_del_init (&oldrep->rs_exp_list);
+
+                CWARN("Stealing %d locks from rs %p x"LPD64".t"LPD64
+                      " o%d NID"LPX64"\n",
+                      oldrep->rs_nlocks, oldrep, 
+                      oldrep->rs_xid, oldrep->rs_transno, oldrep->rs_msg.opc,
+                      exp->exp_connection->c_peer.peer_nid);
+
+                for (i = 0; i < oldrep->rs_nlocks; i++)
+                        ptlrpc_save_lock(req, 
+                                         &oldrep->rs_locks[i],
+                                         oldrep->rs_modes[i]);
+                oldrep->rs_nlocks = 0;
+
+                DEBUG_REQ(D_HA, req, "stole locks for");
+                ptlrpc_schedule_difficult_reply (oldrep);
+
+                spin_unlock (&svc->srv_lock);
+                spin_unlock_irqrestore (&exp->exp_lock, flags);
                 return;
-        memcpy(req->rq_ack_locks, oldrep->rq_ack_locks,
-               sizeof req->rq_ack_locks);
-        spin_lock_irqsave(&req->rq_lock, flags);
-        oldrep->rq_resent = 1;
-        wake_up(&oldrep->rq_reply_waitq);
-        spin_unlock_irqrestore(&req->rq_lock, flags);
-        DEBUG_REQ(D_HA, oldrep, "stole locks from");
-        DEBUG_REQ(D_HA, req, "stole locks for");
+        }
+        spin_unlock_irqrestore (&exp->exp_lock, flags);
 }
 
 void mds_req_from_mcd(struct ptlrpc_request *req, struct mds_client_data *mcd)
@@ -294,8 +326,7 @@ void mds_req_from_mcd(struct ptlrpc_request *req, struct mds_client_data *mcd)
         req->rq_repmsg->transno = req->rq_transno = mcd->mcd_last_transno;
         req->rq_repmsg->status = req->rq_status = mcd->mcd_last_result;
 
-        if (req->rq_export->exp_outstanding_reply)
-                mds_steal_ack_locks(req->rq_export, req);
+        mds_steal_ack_locks(req);
 }
 
 static void reconstruct_reint_setattr(struct mds_update_record *rec,
@@ -444,7 +475,7 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset,
                         if (rc) {
                                 ldlm_lock_decref(&lockh, LCK_PW);
                         } else {
-                                ldlm_put_lock_into_req(req, &lockh, LCK_PW);
+                                ptlrpc_save_lock (req, &lockh, LCK_PW);
                         }
                 }
         case 0:
@@ -695,7 +726,7 @@ cleanup:
                 if (rc) {
                         ldlm_lock_decref(&lockh, LCK_PW);
                 } else {
-                        ldlm_put_lock_into_req(req, &lockh, LCK_PW);
+                        ptlrpc_save_lock (req, &lockh, LCK_PW);
                 }
                 l_dput(dparent);
         case 0:
@@ -1181,8 +1212,11 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
                 rc = vfs_unlink(dparent->d_inode, dchild);
 
                 if (!rc && log_unlink)
-                        if (mds_log_op_unlink(obd, child_inode, req->rq_repmsg,
-                                              offset + 1) > 0)
+                        if (mds_log_op_unlink(obd, child_inode,
+                                lustre_msg_buf(req->rq_repmsg, offset + 1, 0),
+                                req->rq_repmsg->buflens[offset + 1],
+                                lustre_msg_buf(req->rq_repmsg, offset + 2, 0),
+                                req->rq_repmsg->buflens[offset + 2]) > 0)
                                 body->valid |= OBD_MD_FLCOOKIE;
                 break;
         }
@@ -1234,14 +1268,14 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
                 if (rc)
                         ldlm_lock_decref(&child_reuse_lockh, LCK_EX);
                 else
-                        ldlm_put_lock_into_req(req, &child_reuse_lockh, LCK_EX);
+                        ptlrpc_save_lock(req, &child_reuse_lockh, LCK_EX);
         case 2: /* child lock */
                 ldlm_lock_decref(&child_lockh, LCK_EX);
         case 1: /* child and parent dentry, parent lock */
                 if (rc)
                         ldlm_lock_decref(&parent_lockh, LCK_PW);
                 else
-                        ldlm_put_lock_into_req(req, &parent_lockh, LCK_PW);
+                        ptlrpc_save_lock(req, &parent_lockh, LCK_PW);
                 l_dput(dchild);
                 l_dput(dparent);
         case 0:
@@ -1353,8 +1387,8 @@ cleanup:
                         ldlm_lock_decref(&src_lockh, LCK_EX);
                         ldlm_lock_decref(&tgt_dir_lockh, LCK_EX);
                 } else {
-                        ldlm_put_lock_into_req(req, &src_lockh, LCK_EX);
-                        ldlm_put_lock_into_req(req, &tgt_dir_lockh, LCK_EX);
+                        ptlrpc_save_lock(req, &src_lockh, LCK_EX);
+                        ptlrpc_save_lock(req, &tgt_dir_lockh, LCK_EX);
                 }
         case 2: /* target dentry */
                 l_dput(de_tgt_dir);
@@ -1720,11 +1754,11 @@ cleanup:
                         ldlm_lock_decref(&(dlm_handles[0]), LCK_PW);
                 } else {
                         if (lock_count == 4)
-                                ldlm_put_lock_into_req(req,
-                                                    &(dlm_handles[3]), LCK_EX);
-                        ldlm_put_lock_into_req(req, &(dlm_handles[2]), LCK_EX);
-                        ldlm_put_lock_into_req(req, &(dlm_handles[1]), LCK_PW);
-                        ldlm_put_lock_into_req(req, &(dlm_handles[0]), LCK_PW);
+                                ptlrpc_save_lock(req,
+                                              &(dlm_handles[3]), LCK_EX);
+                        ptlrpc_save_lock(req, &(dlm_handles[2]), LCK_EX);
+                        ptlrpc_save_lock(req, &(dlm_handles[1]), LCK_PW);
+                        ptlrpc_save_lock(req, &(dlm_handles[0]), LCK_PW);
                 }
                 l_dput(de_new);
                 l_dput(de_old);
index 56e6dcc..330be73 100644 (file)
@@ -101,28 +101,22 @@ out_lock:
 }
 
 static int mds_osc_destroy_orphan(struct mds_obd *mds,
-                                  struct ptlrpc_request *request)
+                                  struct inode *inode,
+                                  struct lov_mds_md *lmm,
+                                  int lmm_size,
+                                  struct llog_cookie *logcookies,
+                                  int log_unlink)
 {
-        struct mds_body *body;
-        struct lov_mds_md *lmm = NULL;
         struct lov_stripe_md *lsm = NULL;
         struct obd_trans_info oti = { 0 };
         struct obdo *oa;
         int rc;
         ENTRY;
 
-        body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*body));
-        if (!(body->valid & OBD_MD_FLEASIZE))
+        if (lmm_size == 0)
                 RETURN(0);
-        if (body->eadatasize == 0) {
-                CERROR("OBD_MD_FLEASIZE set but eadatasize zero\n");
-                RETURN(rc = -EPROTO);
-        }
 
-        lmm = lustre_msg_buf(request->rq_repmsg, 1, body->eadatasize);
-        LASSERT(lmm != NULL);
-
-        rc = obd_unpackmd(mds->mds_osc_exp, &lsm, lmm, body->eadatasize);
+        rc = obd_unpackmd(mds->mds_osc_exp, &lsm, lmm, lmm_size);
         if (rc < 0) {
                 CERROR("Error unpack md %p\n", lmm);
                 RETURN(rc);
@@ -135,18 +129,12 @@ static int mds_osc_destroy_orphan(struct mds_obd *mds,
         if (oa == NULL)
                 GOTO(out_free_memmd, rc = -ENOMEM);
         oa->o_id = lsm->lsm_object_id;
-        oa->o_mode = body->mode & S_IFMT;
+        oa->o_mode = inode->i_mode & S_IFMT;
         oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
 
-        if (body->valid & OBD_MD_FLCOOKIE) {
+        if (log_unlink && logcookies) {
                 oa->o_valid |= OBD_MD_FLCOOKIE;
-                oti.oti_logcookies =
-                        lustre_msg_buf(request->rq_repmsg, 2,
-                                       sizeof(struct llog_cookie) *
-                                       lsm->lsm_stripe_count);
-                if (oti.oti_logcookies == NULL)
-                        oa->o_valid &= ~OBD_MD_FLCOOKIE;
-                        body->valid &= ~OBD_MD_FLCOOKIE;
+                oti.oti_logcookies = logcookies;
         }
 
         rc = obd_destroy(mds->mds_osc_exp, oa, lsm, &oti);
@@ -163,69 +151,88 @@ static int mds_unlink_orphan(struct obd_device *obd, struct dentry *dchild,
                              struct inode *inode, struct inode *pending_dir)
 {
         struct mds_obd *mds = &obd->u.mds;
-        struct mds_body *body;
+        struct lov_mds_md *lmm = NULL;
+        struct llog_cookie *logcookies = NULL;
+        int lmm_size = 0, log_unlink = 0;
         void *handle = NULL;
-        struct ptlrpc_request *req;
-        int lengths[3] = {sizeof(struct mds_body),
-                          mds->mds_max_mdsize,
-                          mds->mds_max_cookiesize};
-        int rc;
+        int rc, err;
         ENTRY;
 
         LASSERT(mds->mds_osc_obd != NULL);
-        OBD_ALLOC(req, sizeof(*req));
-        if (!req) {
-                CERROR("request allocation out of memory\n");
-                GOTO(err_alloc_req, rc = -ENOMEM);
-        }
-        rc = lustre_pack_reply(req, 3, lengths, NULL);
-        if (rc) {
-                CERROR("cannot pack request %d\n", rc);
-                GOTO(out_free_req, rc);
-        }
-        body = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*body));
-        LASSERT(body != NULL);
 
-        mds_pack_inode2body(body, inode);
-        mds_pack_md(obd, req->rq_repmsg, 1, body, inode, 1);
+        OBD_ALLOC(lmm, mds->mds_max_mdsize);
+        if (lmm == NULL)
+                RETURN(-ENOMEM);
+
+        down(&inode->i_sem);
+        rc = fsfilt_get_md(obd, inode, lmm, mds->mds_max_mdsize);
+        up(&inode->i_sem);
+
+        if (rc < 0) {
+                CERROR("Error %d reading eadata for ino %lu\n",
+                       rc, inode->i_ino);
+                GOTO(out_free_lmm, rc);
+        } else if (rc > 0) {
+                lmm_size = rc;
+                rc = mds_convert_lov_ea(obd, inode, lmm, lmm_size);
+                if (rc > 0)
+                        lmm_size = rc;
+                rc = 0;
+        }
 
         handle = fsfilt_start(obd, pending_dir, FSFILT_OP_UNLINK_LOG, NULL);
         if (IS_ERR(handle)) {
                 rc = PTR_ERR(handle);
                 CERROR("error fsfilt_start: %d\n", rc);
                 handle = NULL;
-                GOTO(out_free_msg, rc);
+                GOTO(out_free_lmm, rc);
         }
 
-        if (S_ISDIR(inode->i_mode)) {
+        down(&inode->i_sem);
+        rc = fsfilt_get_md(obd, inode, lmm, mds->mds_max_mdsize);
+        up(&inode->i_sem);
+
+        if (rc < 0) {
+                CERROR("Error %d reading eadata for ino %lu\n",
+                       rc, inode->i_ino);
+                GOTO(out_free_lmm, rc);
+        } else if (rc > 0) {
+                lmm_size = rc;
+                rc = 0;
+        }
+
+        if (S_ISDIR(inode->i_mode))
                 rc = vfs_rmdir(pending_dir, dchild);
-        } else {
+        else
                 rc = vfs_unlink(pending_dir, dchild);
-        }
+
         if (rc)
                 CERROR("error %d unlinking orphan %*s from PENDING directory\n",
                        rc, dchild->d_name.len, dchild->d_name.name);
 
-        if ((body->valid & OBD_MD_FLEASIZE)) {
-                if (mds_log_op_unlink(obd, inode, req->rq_repmsg, 1) > 0)
-                        body->valid |= OBD_MD_FLCOOKIE;
+        if (!rc && lmm_size) {
+                OBD_ALLOC(logcookies, mds->mds_max_cookiesize);
+                if (logcookies == NULL)
+                        rc = -ENOMEM;
+                else if (mds_log_op_unlink(obd, inode, lmm,lmm_size,logcookies,
+                                           mds->mds_max_cookiesize) > 0)
+                        log_unlink = 1;
         }
-
-        if (handle) {
-                int err = fsfilt_commit(obd, pending_dir, handle, 0);
-                if (err) {
-                        CERROR("error committing orphan unlink: %d\n", err);
+        err = fsfilt_commit(obd, pending_dir, handle, 0);
+        if (err) {
+                CERROR("error committing orphan unlink: %d\n", err);
+                if (!rc)
                         rc = err;
-                        GOTO(out_free_msg, rc);
-                }
         }
-        rc = mds_osc_destroy_orphan(mds, req);
-out_free_msg:
-        OBD_FREE(req->rq_repmsg, req->rq_replen);
-        req->rq_repmsg = NULL;
-out_free_req:
-        OBD_FREE(req, sizeof(*req));
-err_alloc_req:
+        if (!rc) {
+                rc = mds_osc_destroy_orphan(mds, inode, lmm, lmm_size,
+                                            logcookies, log_unlink);
+        }
+
+        if (logcookies != NULL)
+                OBD_FREE(logcookies, mds->mds_max_cookiesize);
+out_free_lmm:
+        OBD_FREE(lmm, mds->mds_max_mdsize);
         RETURN(rc);
 }
 
index 51abb74..8743e72 100644 (file)
@@ -32,7 +32,6 @@
 #include <linux/obd_class.h>
 #include <linux/lustre_net.h>
 
-#define MGMT_NEVENTS     1024UL
 #define MGMT_NBUFS       128UL
 #define MGMT_BUFSIZE     8192
 #define MGMT_MAXREQSIZE  512
@@ -89,10 +88,11 @@ static int mgmt_setup(struct obd_device *obd, obd_count len, void *buf)
         if (mgmt_initialized)
                 RETURN(-EALREADY);
         
-        mgmt_service = ptlrpc_init_svc(MGMT_NEVENTS, MGMT_NBUFS, MGMT_BUFSIZE,
-                                       MGMT_MAXREQSIZE, MGMT_REQUEST_PORTAL,
-                                       MGMT_REPLY_PORTAL, mgmt_handler,
-                                       "mgmt", obd->obd_proc_entry);
+        mgmt_service = 
+                ptlrpc_init_svc(MGMT_NBUFS, MGMT_BUFSIZE, MGMT_MAXREQSIZE, 
+                                MGMT_REQUEST_PORTAL, MGMT_REPLY_PORTAL, 
+                                mgmt_handler, "mgmt",
+                                obd->obd_proc_entry);
         if (!mgmt_service) {
                 CERROR("Failed to start mgmt service\n");
                 RETURN(-ENOMEM);
index d7c3c1e..9fc783b 100644 (file)
@@ -1,20 +1,25 @@
 DEFS=
 MODULE = obdclass
 
-class_obd.o: lustre_build_version
-
 if LIBLUSTRE
-lib_LIBRARIES = liblustreclass.a
+
+noinst_LIBRARIES = liblustreclass.a
 liblustreclass_a_SOURCES = class_obd.c debug.c genops.c statfs_pack.c uuid.c 
 liblustreclass_a_SOURCES += lustre_handles.c lustre_peer.c lprocfs_status.c
 liblustreclass_a_SOURCES += obdo.c obd_config.c llog.c llog_obd.c llog_cat.c
 liblustreclass_a_SOURCES += llog_lvfs.c #llog_ioctl.c rbtree.c
+liblustreclass_a_CFLAGS = -fPIC
+
+class_obd.c: lustre_build_version
 
 lustre_build_version:
-       echo '#define LUSTRE_VERSION 31' > $(top_builddir)/include/linux/lustre_build_version.h
+       echo '#define LUSTRE_VERSION 32' > $(top_builddir)/include/linux/lustre_build_version.h
        echo '#define BUILD_VERSION "1"' >> $(top_builddir)/include/linux/lustre_build_version.h
 
 else
+
+class_obd.o: lustre_build_version
+
 modulefs_DATA = lustre_build_version obdclass.o llog_test.o
 EXTRA_PROGRAMS = obdclass llog_test
 
index a807fd8..17beed2 100644 (file)
 #include <portals/list.h>
 #include "llog_internal.h"
 
+#ifndef __KERNEL__
+/* liblustre workaround */
+atomic_t portal_kmemory = {0};
+#endif
+
 struct semaphore obd_conf_sem;   /* serialize configuration commands */
 struct obd_device obd_dev[MAX_OBD_DEVICES];
 struct list_head obd_types;
@@ -199,12 +204,6 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg)
                 char *buf;
                 struct lustre_cfg *lcfg;
 
-                /* FIXME hack to liblustre dump, remove when switch
-                   to zeroconf */
-#ifndef __KERNEL__
-                data->ioc_pbuf1 = data->ioc_inlbuf1;
-                data->ioc_plen1 = data->ioc_inllen1;
-#endif
                 if (!data->ioc_plen1 || !data->ioc_pbuf1) {
                         CERROR("No config buffer passed!\n");
                         GOTO(out, err = -EINVAL);
index 615c102..d3b86bf 100644 (file)
@@ -413,6 +413,7 @@ void __class_export_put(struct obd_export *exp)
                 if (exp->exp_connection)
                         ptlrpc_put_connection_superhack(exp->exp_connection);
 
+                LASSERT(list_empty(&exp->exp_outstanding_replies));
                 LASSERT(list_empty(&exp->exp_handle.h_link));
                 obd_destroy_export(exp);
 
@@ -440,6 +441,7 @@ struct obd_export *class_new_export(struct obd_device *obd)
         export->exp_conn_cnt = 0;
         atomic_set(&export->exp_refcount, 2);
         export->exp_obd = obd;
+        INIT_LIST_HEAD(&export->exp_outstanding_replies);
         /* XXX this should be in LDLM init */
         INIT_LIST_HEAD(&export->exp_ldlm_data.led_held_locks);
 
index 3271968..ec32b11 100644 (file)
@@ -672,7 +672,7 @@ static int llog_lvfs_next_block(struct llog_handle *loghandle, int *cur_idx,
         return 0;
 }
 
-static int llog_lvfs_create(struct llog_obd_ctxt *ctxt,struct llog_handle **res,
+static int llog_lvfs_create(struct llog_ctxt *ctxt, struct llog_handle **res,
                             struct llog_logid *logid, char *name)
 {
         LBUG();
index fadf05b..4a96820 100644 (file)
@@ -145,6 +145,13 @@ void lprocfs_remove(struct proc_dir_entry *root)
 
                 rm_entry = temp;
                 temp = temp->parent;
+
+                /* Memory corruption once caused this to fail, and
+                   without this LASSERT we would loop here forever. */
+                LASSERTF(strlen(rm_entry->name) == rm_entry->namelen,
+                         "0x%p  %s/%s len %d\n", rm_entry,
+                         temp->name, rm_entry->name, (int)strlen(rm_entry->name));
+
                 remove_proc_entry(rm_entry->name, rm_entry->parent);
                 if (temp == parent)
                         break;
index 46710aa..9156dc8 100644 (file)
@@ -133,7 +133,8 @@ int class_attach(struct lustre_cfg *lcfg)
         INIT_LIST_HEAD(&obd->obd_recovery_queue);
         INIT_LIST_HEAD(&obd->obd_delayed_reply_queue);
 
-        init_waitqueue_head(&obd->obd_commit_waitq);
+        spin_lock_init (&obd->obd_uncommitted_replies_lock);
+        INIT_LIST_HEAD (&obd->obd_uncommitted_replies);
 
         len = strlen(name) + 1;
         OBD_ALLOC(obd->obd_name, len);
@@ -257,10 +258,22 @@ static void dump_exports(struct obd_device *obd)
         struct obd_export *exp, *n;
 
         list_for_each_entry_safe(exp, n, &obd->obd_exports, exp_obd_chain) {
-                CERROR("%s: %p %s %d %d %p\n",
+                struct ptlrpc_reply_state *rs;
+                struct ptlrpc_reply_state *first_reply = NULL;
+                int                        nreplies = 0;
+
+                list_for_each_entry (rs, &exp->exp_outstanding_replies,
+                                     rs_exp_list) {
+                        if (nreplies == 0)
+                                first_reply = rs;
+                        nreplies++;
+                }
+
+                CERROR("%s: %p %s %d %d %d: %p %s\n",
                        obd->obd_name, exp, exp->exp_client_uuid.uuid,
                        atomic_read(&exp->exp_refcount),
-                       exp->exp_failed, exp->exp_outstanding_reply );
+                       exp->exp_failed, nreplies, first_reply,
+                       nreplies > 3 ? "..." : "");
         }
 }
 
index 08136d7..b9fa3b8 100644 (file)
@@ -6,8 +6,9 @@
 DEFS= 
 
 if LIBLUSTRE
-lib_LIBRARIES = libobdecho.a
+noinst_LIBRARIES = libobdecho.a
 libobdecho_a_SOURCES = echo_client.c 
+libobdecho_a_CFLAGS = -fPIC
 else
 MODULE = obdecho
 modulefs_DATA = obdecho.o
index c627f82..a922517 100644 (file)
@@ -351,15 +351,14 @@ int echo_commitrw(int cmd, struct obd_export *export, struct obdo *oa,
                         struct page *page = r->page;
                         void *addr;
 
-                        if (!page || !(addr = kmap(page)) ||
-                            !kern_addr_valid((unsigned long)addr)) {
-
-                                CERROR("bad page objid "LPU64":%p, buf %d/%d\n",
+                        if (page == NULL) {
+                                CERROR("null page objid "LPU64":%p, buf %d/%d\n",
                                        obj->ioo_id, page, j, obj->ioo_bufcnt);
-                                kunmap(page);
                                 GOTO(commitrw_cleanup, rc = -EFAULT);
                         }
 
+                        addr = kmap(page);
+
                         CDEBUG(D_PAGE, "$$$$ use page %p, addr %p@"LPU64"\n",
                                r->page, addr, r->offset);
 
index d3d79ad..2f15e62 100644 (file)
@@ -423,6 +423,51 @@ echo_get_stripe_off_id (struct lov_stripe_md *lsm, obd_off *offp, obd_id *idp)
         *offp = offset * stripe_size + woffset % stripe_size;
 }
 
+static void echo_page_debug_setup(struct lov_stripe_md *lsm, 
+                                  struct page *page, int rw, obd_id id, 
+                                  obd_off offset, obd_off count)
+{
+        void *addr;
+        obd_off stripe_off;
+        obd_id stripe_id;
+
+        if (id == 0)
+                return;
+
+        addr = kmap(page);
+
+        if (rw == OBD_BRW_WRITE) {
+                stripe_off = offset;
+                stripe_id = id;
+                echo_get_stripe_off_id(lsm, &stripe_off, &stripe_id);
+        } else {
+                stripe_off = 0xdeadbeef00c0ffeeULL;
+                stripe_id = 0xdeadbeef00c0ffeeULL;
+        }
+        page_debug_setup(addr, count, stripe_off, stripe_id);
+
+        kunmap(page);
+}
+
+static int echo_page_debug_check(struct lov_stripe_md *lsm, 
+                                  struct page *page, obd_id id, 
+                                  obd_off offset, obd_off count)
+{
+        obd_off stripe_off = offset;
+        obd_id stripe_id = id;
+        void *addr;
+        int rc;
+
+        if (id == 0)
+                return 0;
+
+        addr = kmap(page);
+        echo_get_stripe_off_id (lsm, &stripe_off, &stripe_id);
+        rc = page_debug_check("test_brw", addr, count, stripe_off, stripe_id);
+        kunmap(page);
+        return rc;
+}
+
 static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa,
                             struct lov_stripe_md *lsm, obd_off offset,
                             obd_size count, struct obd_trans_info *oti)
@@ -434,13 +479,12 @@ static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa,
         obd_off                 off;
         int                     i;
         int                     rc;
-        int                     verify;
+        int                     verify = 0;
         int                     gfp_mask;
 
         /* oa_id  == 0    => speed test (no verification) else...
          * oa & 1         => use HIGHMEM
          */
-        verify = (oa->o_id != 0);
         gfp_mask = ((oa->o_id & 1) == 0) ? GFP_KERNEL : GFP_HIGHUSER;
 
         LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ);
@@ -473,48 +517,26 @@ static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa,
                 pgp->off = off;
                 pgp->flag = 0;
 
-                if (verify) {
-                        void *addr = kmap(pgp->pg);
-                        obd_off      stripe_off = off;
-                        obd_id       stripe_id = oa->o_id;
-
-                        if (rw == OBD_BRW_WRITE) {
-                                echo_get_stripe_off_id(lsm, &stripe_off,
-                                                       &stripe_id);
-                                page_debug_setup(addr, pgp->count,
-                                                 stripe_off, stripe_id);
-                        } else {
-                                page_debug_setup(addr, pgp->count,
-                                                 0xdeadbeef00c0ffeeULL,
-                                                 0xdeadbeef00c0ffeeULL);
-                        }
-                        kunmap(pgp->pg);
-                }
+                echo_page_debug_setup(lsm, pgp->pg, rw, oa->o_id, off, 
+                                      pgp->count);
         }
 
         rc = obd_brw(rw, ec->ec_exp, oa, lsm, npages, pga, oti);
 
  out:
-        if (rc != 0)
-                verify = 0;
+        if (rc == 0 && rw == OBD_BRW_READ)
+                verify = 1;
 
         for (i = 0, pgp = pga; i < npages; i++, pgp++) {
                 if (pgp->pg == NULL)
                         continue;
 
                 if (verify) {
-                        void    *addr = kmap(pgp->pg);
-                        obd_off  stripe_off = pgp->off;
-                        obd_id   stripe_id  = oa->o_id;
-                        int      vrc;
-
-                        echo_get_stripe_off_id (lsm, &stripe_off, &stripe_id);
-                        vrc = page_debug_check("test_brw", addr, pgp->count,
-                                               stripe_off, stripe_id);
+                        int vrc;
+                        vrc = echo_page_debug_check(lsm, pgp->pg, oa->o_id,
+                                                    pgp->off, pgp->count);
                         if (vrc != 0 && rc == 0)
                                 rc = vrc;
-
-                        kunmap(pgp->pg);
                 }
                 __free_pages(pgp->pg, 0);
         }
@@ -623,6 +645,7 @@ struct echo_async_state {
         wait_queue_head_t       eas_waitq;
         struct list_head        eas_avail;
         struct obdo             eas_oa;
+        struct lov_stripe_md    *eas_lsm;
 };
 
 static int eas_should_wake(struct echo_async_state *eas)
@@ -675,6 +698,11 @@ static void ec_ap_completion(void *data, int cmd, int rc)
                 return;
         eas = eap->eap_eas;
 
+        if (cmd == OBD_BRW_READ)
+                echo_page_debug_check(eas->eas_lsm, eap->eap_page, 
+                                      eas->eas_oa.o_id, eap->eap_off, 
+                                      PAGE_SIZE);
+
         spin_lock_irqsave(&eas->eas_lock, flags);
         if (rc && !eas->eas_rc)
                 eas->eas_rc = rc;
@@ -731,6 +759,7 @@ static int echo_client_async_page(struct obd_export *exp, int rw,
         init_waitqueue_head(&eas.eas_waitq);
         eas.eas_in_flight = 0;
         eas.eas_rc = 0;
+        eas.eas_lsm = lsm;
         INIT_LIST_HEAD(&eas.eas_avail);
 
         /* prepare the group of pages that we're going to be keeping
@@ -740,6 +769,7 @@ static int echo_client_async_page(struct obd_export *exp, int rw,
                 if (page == NULL)
                         GOTO(out, rc = -ENOMEM);
 
+                page->private = 0;
                 list_add_tail(&page->list, &pages);
 
                 OBD_ALLOC(eap, sizeof(*eap));
@@ -749,7 +779,7 @@ static int echo_client_async_page(struct obd_export *exp, int rw,
                 eap->eap_magic = EAP_MAGIC;
                 eap->eap_page = page;
                 eap->eap_eas = &eas;
-                eap->eap_cookie = ERR_PTR(-ENOENT);
+                page->private = (unsigned long)eap;
                 list_add_tail(&eap->eap_item, &eas.eas_avail);
         }
 
@@ -775,10 +805,10 @@ static int echo_client_async_page(struct obd_export *exp, int rw,
                 spin_unlock_irqrestore(&eas.eas_lock, flags);
 
                 /* unbind the eap from its old page offset */
-                if (!IS_ERR(eap->eap_cookie)) {
+                if (eap->eap_cookie != NULL) {
                         obd_teardown_async_page(exp, lsm, NULL, 
                                                 eap->eap_cookie);
-                        eap->eap_cookie = ERR_PTR(-ENOENT);
+                        eap->eap_cookie = NULL;
                 }
 
                 eas.eas_next_offset += PAGE_SIZE;
@@ -793,6 +823,10 @@ static int echo_client_async_page(struct obd_export *exp, int rw,
                         break;
                 }
 
+                if (rw == OBD_BRW_WRITE)
+                        echo_page_debug_setup(lsm, eap->eap_page, rw, oa->o_id,
+                                              eap->eap_off, PAGE_SIZE);
+
                 /* always asserts urgent, which isn't quite right */
                 rc = obd_queue_async_io(exp, lsm, NULL, eap->eap_cookie,
                                         rw, 0, PAGE_SIZE, 0,
@@ -824,9 +858,9 @@ out:
                 struct page *page = list_entry(pos, struct page, list);
 
                 list_del(&page->list);
-                if (page->private) {
+                if (page->private != 0) {
                         eap = (struct echo_async_page *)page->private;
-                        if (!IS_ERR(eap->eap_cookie))
+                        if (eap->eap_cookie != NULL)
                                 obd_teardown_async_page(exp, lsm, NULL, 
                                                         eap->eap_cookie);
                         OBD_FREE(eap, sizeof(*eap));
@@ -886,23 +920,19 @@ static int echo_client_prep_commit(struct obd_export *exp, int rw,
 
                 for (i = 0; i < npages; i++) {
                         struct page *page = lnb[i].page;
-                        void *addr;
 
                         /* read past eof? */
                         if (page == NULL && lnb[i].rc == 0) 
                                 continue;
 
-                        addr = kmap(lnb[i].page);
-
                         if (rw == OBD_BRW_WRITE) 
-                                page_debug_setup(addr, PAGE_SIZE,
-                                                 rnb[i].offset, oa->o_id);
-                        else 
-                                err = page_debug_check("prep_commit", addr, 
-                                                 PAGE_SIZE, rnb[i].offset,
-                                                 oa->o_id);
-
-                        kunmap(lnb[i].page);
+                                echo_page_debug_setup(lsm, page, rw, oa->o_id, 
+                                                      rnb[i].offset, 
+                                                      rnb[i].len);
+                        else
+                                echo_page_debug_check(lsm, page, oa->o_id, 
+                                                      rnb[i].offset, 
+                                                      rnb[i].len);
                 }
 
                 ret = obd_commitrw(rw, exp, oa, 1, &ioo, npages, lnb, oti);
index 90a5d11..0cb3bcd 100644 (file)
@@ -6,8 +6,9 @@
 DEFS=
 
 if LIBLUSTRE
-lib_LIBRARIES = libosc.a
+noinst_LIBRARIES = libosc.a
 libosc_a_SOURCES = osc_request.c osc_lib.c osc_create.c osc_internal.h
+libosc_a_CFLAGS = -fPIC
 else
 MODULE = osc
 modulefs_DATA = osc.o
index b5f6392..68a2d35 100644 (file)
@@ -65,7 +65,7 @@ void osc_wake_cache_waiters(struct client_obd *cli);
 #ifdef __KERNEL__
 int lproc_osc_attach_seqstat(struct obd_device *dev);
 #else
-static inline int lproc_osc_attach_seqstat(struct obd_device *dev) {}
+static inline int lproc_osc_attach_seqstat(struct obd_device *dev) {return 0;}
 #endif
 
 #endif /* OSC_INTERNAL_H */
index e8dd043..666de06 100644 (file)
@@ -537,7 +537,12 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa,
 {
         obd_flag bits = OBD_MD_FLBLOCKS|OBD_MD_FLGRANT;
 
-        LASSERT(!(oa->o_valid & bits));
+        /* XXX obd_brw_internal() might reuse the obdo in its loop and thus
+         * hit the following assert.  Does the assert have any real meaning?
+         * Temporarily disable it.
+         * In kernel mode the VFS will probably prevent this from happening.
+         */
+        //LASSERT(!(oa->o_valid & bits));
 
         oa->o_valid |= bits;
         spin_lock(&cli->cl_loi_list_lock);
@@ -645,34 +650,40 @@ static void handle_short_read(int nob_read, obd_count page_count,
         }
 }
 
-static int check_write_rcs(struct ptlrpc_request *request, int niocount,
+static int check_write_rcs(struct ptlrpc_request *request,
+                           int requested_nob, int niocount,
                            obd_count page_count, struct brw_page *pga)
 {
-        int    i;
-        int    *remote_rcs;
+        int    *remote_rcs, i;
 
         /* return error if any niobuf was in error */
         remote_rcs = lustre_swab_repbuf(request, 1,
                                         sizeof(*remote_rcs) * niocount, NULL);
         if (remote_rcs == NULL) {
-                CERROR ("Missing/short RC vector on BRW_WRITE reply\n");
-                return (-EPROTO);
+                CERROR("Missing/short RC vector on BRW_WRITE reply\n");
+                return(-EPROTO);
         }
-        if (lustre_msg_swabbed (request->rq_repmsg))
+        if (lustre_msg_swabbed(request->rq_repmsg))
                 for (i = 0; i < niocount; i++)
-                        __swab32s (&remote_rcs[i]);
+                        __swab32s(&remote_rcs[i]);
 
         for (i = 0; i < niocount; i++) {
                 if (remote_rcs[i] < 0)
-                        return (remote_rcs[i]);
+                        return(remote_rcs[i]);
 
                 if (remote_rcs[i] != 0) {
-                        CERROR ("rc[%d] invalid (%d) req %p\n",
+                        CERROR("rc[%d] invalid (%d) req %p\n",
                                 i, remote_rcs[i], request);
-                        return (-EPROTO);
+                        return(-EPROTO);
                 }
         }
 
+        if (request->rq_bulk->bd_nob_transferred != requested_nob) {
+                CERROR("Unexpected # bytes transferred: %d (requested %d)\n",
+                       request->rq_bulk->bd_nob_transferred, requested_nob);
+                return(-EPROTO);
+        }
+
         return (0);
 }
 
@@ -750,11 +761,11 @@ static int osc_brw_prep_request(int cmd, struct obd_import *imp,struct obdo *oa,
                 return (-ENOMEM);
 
         if (opc == OST_WRITE)
-                desc = ptlrpc_prep_bulk_imp(req, BULK_GET_SOURCE,
-                                            OST_BULK_PORTAL);
+                desc = ptlrpc_prep_bulk_imp (req, page_count,
+                                             BULK_GET_SOURCE, OST_BULK_PORTAL);
         else
-                desc = ptlrpc_prep_bulk_imp(req, BULK_PUT_SINK,
-                                            OST_BULK_PORTAL);
+                desc = ptlrpc_prep_bulk_imp (req, page_count,
+                                             BULK_PUT_SINK, OST_BULK_PORTAL);
         if (desc == NULL)
                 GOTO(out, rc = -ENOMEM);
         /* NB request now owns desc and will free it when it gets freed */
@@ -783,11 +794,8 @@ static int osc_brw_prep_request(int cmd, struct obd_import *imp,struct obdo *oa,
                          pg_prev->pg, pg_prev->pg->private, pg_prev->pg->index,
                                  pg_prev->off);
 
-                rc = ptlrpc_prep_bulk_page(desc, pg->pg, pg->off & ~PAGE_MASK,
-                                           pg->count);
-                if (rc != 0)
-                        GOTO(out, rc);
-
+                ptlrpc_prep_bulk_page(desc, pg->pg, pg->off & ~PAGE_MASK,
+                                      pg->count);
                 requested_nob += pg->count;
 
                 if (i > 0 && can_merge_pages(pg_prev, pg)) {
@@ -856,8 +864,10 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, struct obdo *oa,
                         CERROR ("Unexpected +ve rc %d\n", rc);
                         RETURN(-EPROTO);
                 }
+                LASSERT (req->rq_bulk->bd_nob == requested_nob);
 
-                RETURN(check_write_rcs(req, niocount, page_count, pga));
+                RETURN(check_write_rcs(req, requested_nob, niocount,
+                                       page_count, pga));
         }
 
         if (rc > requested_nob) {
@@ -865,6 +875,12 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, struct obdo *oa,
                 RETURN(-EPROTO);
         }
 
+        if (rc != req->rq_bulk->bd_nob_transferred) {
+                CERROR ("Unexpected rc %d (%d transferred)\n",
+                        rc, req->rq_bulk->bd_nob_transferred);
+                return (-EPROTO);
+        }
+
         if (rc < requested_nob)
                 handle_short_read(rc, page_count, pga);
 
@@ -1361,6 +1377,8 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
                 oap = list_entry(pos, struct osc_async_page, oap_pending_item);
                 ops = oap->oap_caller_ops;
 
+                LASSERT(oap->oap_magic == OAP_MAGIC);
+
                 /* in llite being 'ready' equates to the page being locked
                  * until completion unlocks it.  commit_write submits a page
                  * as not ready because its unlock will happen unconditionally
@@ -1472,6 +1490,7 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
         list_splice(&rpc_list, &aa->aa_oaps);
         INIT_LIST_HEAD(&rpc_list);
 
+#ifdef __KERNEL__
         if (cmd == OBD_BRW_READ) {
                 lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count);
                 lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_brw_in_flight);
@@ -1480,6 +1499,7 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
                 lprocfs_oh_tally(&cli->cl_write_rpc_hist,
                                  cli->cl_brw_in_flight);
         }
+#endif
 
         spin_lock(&cli->cl_loi_list_lock);
 
index dfdcf1c..f5c5579 100644 (file)
@@ -70,7 +70,8 @@ void oti_to_request(struct obd_trans_info *oti, struct ptlrpc_request *req)
         for (ack_lock = oti->oti_ack_locks, i = 0; i < 4; i++, ack_lock++) {
                 if (!ack_lock->mode)
                         break;
-                ldlm_put_lock_into_req(req, &ack_lock->lock, ack_lock->mode);
+                /* XXX not even calling target_send_reply in some cases... */
+                ptlrpc_save_lock (req, &ack_lock->lock, ack_lock->mode);
         }
 }
 
@@ -417,7 +418,8 @@ static int ost_brw_read(struct ptlrpc_request *req)
         if (local_nb == NULL)
                 GOTO(out_pp_rnb, rc = -ENOMEM);
 
-        desc = ptlrpc_prep_bulk_exp(req, BULK_PUT_SOURCE, OST_BULK_PORTAL);
+        desc = ptlrpc_prep_bulk_exp (req, npages, 
+                                     BULK_PUT_SOURCE, OST_BULK_PORTAL);
         if (desc == NULL)
                 GOTO(out_local, rc = -ENOMEM);
 
@@ -439,11 +441,9 @@ static int ost_brw_read(struct ptlrpc_request *req)
                 nob += page_rc;
                 if (page_rc != 0) {             /* some data! */
                         LASSERT (local_nb[i].page != NULL);
-                        rc = ptlrpc_prep_bulk_page(desc, local_nb[i].page,
-                                                   pp_rnb[i].offset& ~PAGE_MASK,
-                                                   page_rc);
-                        if (rc != 0)
-                                break;
+                        ptlrpc_prep_bulk_page(desc, local_nb[i].page,
+                                              pp_rnb[i].offset & (PAGE_SIZE - 1),
+                                              page_rc);
                 }
 
                 if (page_rc != pp_rnb[i].len) { /* short read */
@@ -455,16 +455,25 @@ static int ost_brw_read(struct ptlrpc_request *req)
         }
 
         if (rc == 0) {
-                rc = ptlrpc_bulk_put(desc);
+                rc = ptlrpc_start_bulk_transfer(desc);
                 if (rc == 0) {
                         lwi = LWI_TIMEOUT(obd_timeout * HZ / 4,
                                           ost_bulk_timeout, desc);
                         rc = l_wait_event(desc->bd_waitq,
-                                          ptlrpc_bulk_complete(desc), &lwi);
-                        if (rc) {
-                                LASSERT(rc == -ETIMEDOUT);
+                                          !ptlrpc_bulk_active(desc), &lwi);
+                        LASSERT(rc == 0 || rc == -ETIMEDOUT);
+                        if (rc == -ETIMEDOUT) {
                                 DEBUG_REQ(D_ERROR, req, "timeout on bulk PUT");
                                 ptlrpc_abort_bulk(desc);
+                        } else if (!desc->bd_success ||
+                                   desc->bd_nob_transferred != desc->bd_nob) {
+                                DEBUG_REQ(D_ERROR, req, "%s bulk PUT %d(%d)",
+                                          desc->bd_success ? 
+                                          "truncated" : "network error on",
+                                          desc->bd_nob_transferred, 
+                                          desc->bd_nob);
+                                /* XXX should this be a different errno? */
+                                rc = -ETIMEDOUT;
                         }
                 } else {
                         DEBUG_REQ(D_ERROR, req, "bulk PUT failed: rc %d\n", rc);
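
Both the read and write paths above now follow the same shape: start the bulk transfer, wait (with a timeout) until it is no longer active, then distinguish a timeout from a transfer that completed but was truncated or failed on the network. The schematic below uses placeholder demo_* helpers standing in for ptlrpc_start_bulk_transfer(), l_wait_event()/ptlrpc_bulk_active() and ptlrpc_abort_bulk(); it is a sketch of the control flow, not the ptlrpc API.

#include <stdio.h>

struct demo_bulk {
        int success;            /* network layer reported success */
        int nob;                /* bytes we asked to move */
        int nob_transferred;    /* bytes that actually moved */
};

static int  demo_start_bulk(struct demo_bulk *d)     { (void)d; return 0; }
static int  demo_wait_bulk_done(struct demo_bulk *d) { (void)d; return 0; } /* 0 or -ETIMEDOUT */
static void demo_abort_bulk(struct demo_bulk *d)     { (void)d; }

static int demo_do_bulk(struct demo_bulk *desc)
{
        int rc = demo_start_bulk(desc);

        if (rc != 0) {
                fprintf(stderr, "could not start bulk: %d\n", rc);
                return rc;
        }

        rc = demo_wait_bulk_done(desc);
        if (rc != 0) {                          /* timed out */
                demo_abort_bulk(desc);
                return rc;
        }

        if (!desc->success || desc->nob_transferred != desc->nob) {
                fprintf(stderr, "%s bulk: %d(%d)\n",
                        desc->success ? "truncated" : "network error on",
                        desc->nob_transferred, desc->nob);
                return -1;                      /* treated as a comms error */
        }
        return 0;
}
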
@@ -502,9 +511,9 @@ static int ost_brw_read(struct ptlrpc_request *req)
                 req->rq_status = rc;
                 ptlrpc_error(req);
         } else {
-                if (req->rq_repmsg != NULL) {
+                if (req->rq_reply_state != NULL) {
                         /* reply out callback would free */
-                        OBD_FREE(req->rq_repmsg, req->rq_replen);
+                        lustre_free_reply_state (req->rq_reply_state);
                 }
                 if (req->rq_reqmsg->conn_cnt == req->rq_export->exp_conn_cnt) {
                         CERROR("bulk IO comms error: "
@@ -545,7 +554,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
         int                      objcount, niocount, npages;
         int                      comms_error = 0;
         int                      rc, rc2, swab, i, j;
-        char                    str[PTL_NALFMT_SIZE];
+        char                     str[PTL_NALFMT_SIZE];
         ENTRY;
 
         if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK))
@@ -607,7 +616,8 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
         if (local_nb == NULL)
                 GOTO(out_pp_rnb, rc = -ENOMEM);
 
-        desc = ptlrpc_prep_bulk_exp(req, BULK_GET_SINK, OST_BULK_PORTAL);
+        desc = ptlrpc_prep_bulk_exp (req, npages, 
+                                     BULK_GET_SINK, OST_BULK_PORTAL);
         if (desc == NULL)
                 GOTO(out_local, rc = -ENOMEM);
 
@@ -618,31 +628,34 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
 
         /* NB Having prepped, we must commit... */
 
-        for (i = 0; i < npages; i++) {
-                rc = ptlrpc_prep_bulk_page(desc, local_nb[i].page,
-                                           pp_rnb[i].offset & (PAGE_SIZE - 1),
-                                           pp_rnb[i].len);
-                if (rc != 0)
-                        break;
-        }
+        for (i = 0; i < npages; i++)
+                ptlrpc_prep_bulk_page(desc, local_nb[i].page, 
+                                      pp_rnb[i].offset & (PAGE_SIZE - 1),
+                                      pp_rnb[i].len);
 
+        rc = ptlrpc_start_bulk_transfer (desc);
         if (rc == 0) {
-                rc = ptlrpc_bulk_get(desc);
-                if (rc == 0) {
-                        lwi = LWI_TIMEOUT(obd_timeout * HZ / 4,
-                                          ost_bulk_timeout, desc);
-                        rc = l_wait_event(desc->bd_waitq,
-                                          ptlrpc_bulk_complete(desc), &lwi);
-                        if (rc) {
-                                LASSERT(rc == -ETIMEDOUT);
-                                DEBUG_REQ(D_ERROR, req, "timeout on bulk GET");
-                                ptlrpc_abort_bulk(desc);
-                        }
-                } else {
-                        DEBUG_REQ(D_ERROR, req, "bulk GET failed: rc %d\n", rc);
+                lwi = LWI_TIMEOUT(obd_timeout * HZ / 4,
+                                  ost_bulk_timeout, desc);
+                rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc), 
+                                  &lwi);
+                LASSERT(rc == 0 || rc == -ETIMEDOUT);
+                if (rc == -ETIMEDOUT) {
+                        DEBUG_REQ(D_ERROR, req, "timeout on bulk GET");
+                        ptlrpc_abort_bulk(desc);
+                } else if (!desc->bd_success ||
+                           desc->bd_nob_transferred != desc->bd_nob) {
+                        DEBUG_REQ(D_ERROR, req, "%s bulk GET %d(%d)",
+                                  desc->bd_success ? 
+                                  "truncated" : "network error on",
+                                  desc->bd_nob_transferred, desc->bd_nob);
+                        /* XXX should this be a different errno? */
+                        rc = -ETIMEDOUT;
                 }
-                comms_error = rc != 0;
+        } else {
+                DEBUG_REQ(D_ERROR, req,
+                          "ptlrpc_start_bulk_transfer failed: rc %d\n", rc);
         }
+        comms_error = rc != 0;
 
         repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*repbody));
         memcpy(&repbody->oa, &body->oa, sizeof(repbody->oa));
@@ -710,9 +723,9 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                 req->rq_status = rc;
                 ptlrpc_error(req);
         } else {
-                if (req->rq_repmsg != NULL) {
+                if (req->rq_reply_state != NULL) {
                         /* reply out callback would free */
-                        OBD_FREE (req->rq_repmsg, req->rq_replen);
+                        lustre_free_reply_state (req->rq_reply_state);
                 }
                 if (req->rq_reqmsg->conn_cnt == req->rq_export->exp_conn_cnt) {
                         CERROR("bulk IO comms error: "
@@ -806,8 +819,6 @@ out_pp_rnb:
         free_per_page_niobufs(npages, pp_rnb, remote_nb);
 out:
         if (rc) {
-                OBD_FREE(req->rq_repmsg, req->rq_replen);
-                req->rq_repmsg = NULL;
                 req->rq_status = rc;
                 ptlrpc_error(req);
         } else
@@ -1122,11 +1133,11 @@ static int ost_setup(struct obd_device *obddev, obd_count len, void *buf)
         if (rc < 0)
                 RETURN(rc);
 
-        ost->ost_service = ptlrpc_init_svc(OST_NEVENTS, OST_NBUFS,
-                                           OST_BUFSIZE, OST_MAXREQSIZE,
-                                           OST_REQUEST_PORTAL, OSC_REPLY_PORTAL,
-                                           ost_handle, "ost", 
-                                           obddev->obd_proc_entry);
+        ost->ost_service = 
+                ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE,
+                                OST_REQUEST_PORTAL, OSC_REPLY_PORTAL,
+                                ost_handle, "ost",
+                                obddev->obd_proc_entry);
         if (ost->ost_service == NULL) {
                 CERROR("failed to start service\n");
                 RETURN(-ENOMEM);
@@ -1138,9 +1149,9 @@ static int ost_setup(struct obd_device *obddev, obd_count len, void *buf)
                 GOTO(out, rc = -EINVAL);
 
         ost->ost_create_service =
-                ptlrpc_init_svc(OST_NEVENTS, OST_NBUFS, OST_BUFSIZE,
-                                OST_MAXREQSIZE, OST_CREATE_PORTAL,
-                                OSC_REPLY_PORTAL, ost_handle, "ost_create",
+                ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE,
+                                OST_CREATE_PORTAL, OSC_REPLY_PORTAL,
+                                ost_handle, "ost_create",
                                 obddev->obd_proc_entry);
         if (ost->ost_create_service == NULL) {
                 CERROR("failed to start OST create service\n");
index e955c33..c06bc8a 100644 (file)
@@ -333,6 +333,7 @@ AC_SUBST(SCIMACNAL)
 CFLAGS="$KCFLAGS"
 CPPFLAGS="$KINCFLAGS $KCPPFLAGS $MFLAGS $enable_zerocopy $enable_affinity $with_quadrics $with_gm $with_scamac $with_ib"
 
+AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib)
 AC_SUBST(MOD_LINK)
 AC_SUBST(LINUX25)
 AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib)
index 817936a..08f084a 100644 (file)
@@ -50,9 +50,8 @@ typedef enum {
         PTL_IOV_TOO_SMALL   = 31,
 
        PTL_EQ_INUSE        = 32,
-       PTL_MD_INUSE        = 33,
 
-        PTL_MAX_ERRNO       = 33
+        PTL_MAX_ERRNO       = 32
 } ptl_err_t;
 /* If you change these, you must update the string table in api-errno.c */
 
index 4052c0c..0bf557e 100644 (file)
@@ -18,47 +18,60 @@ struct nal_cb_t {
        lib_ni_t ni;
        void *nal_data;
        /*
-        * send:  Sends a preformatted header and user data to a
-        * specified remote process.
-        * Can overwrite iov.
+        * send: Sends a preformatted header and payload data to a
+        * specified remote process. The payload is scattered over 'niov'
+        * fragments described by iov, starting at 'offset' for 'mlen'
+        * bytes.  
+        * NB the NAL may NOT overwrite iov.  
+        * PTL_OK on success => NAL has committed to send and will call
+        * lib_finalize on completion
         */
-       int (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie, 
-                       ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
-                       unsigned int niov, struct iovec *iov, size_t mlen);
+       ptl_err_t (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie, 
+                             ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
+                             unsigned int niov, struct iovec *iov, 
+                             size_t offset, size_t mlen);
 
        /* as send, but with a set of page fragments (NULL if not supported) */
-       int (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, 
-                             ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
-                             unsigned int niov, ptl_kiov_t *iov, size_t mlen);
+       ptl_err_t (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, 
+                                   ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
+                                   unsigned int niov, ptl_kiov_t *iov, 
+                                   size_t offset, size_t mlen);
        /*
-        * recv: Receives an incoming message from a remote process
-        * Type of iov depends on options.  Can overwrite iov.
+        * recv: Receives an incoming message from a remote process.  The
+        * payload is to be received into the scattered buffer of 'niov'
+        * fragments described by iov, starting at 'offset' for 'mlen'
+        * bytes.  Payload bytes after 'mlen' up to 'rlen' are to be
+        * discarded.  
+        * NB the NAL may NOT overwrite iov.
+        * PTL_OK on success => NAL has committed to receive and will call
+        * lib_finalize on completion
         */
-       int (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
-                       unsigned int niov, struct iovec *iov, size_t mlen,
-                       size_t rlen);
+       ptl_err_t (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+                             unsigned int niov, struct iovec *iov,
+                             size_t offset, size_t mlen, size_t rlen);
 
        /* as recv, but with a set of page fragments (NULL if not supported) */
-       int (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
-                             unsigned int niov, ptl_kiov_t *iov, size_t mlen,
-                             size_t rlen);
+       ptl_err_t (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+                                   unsigned int niov, ptl_kiov_t *iov,
+                                   size_t offset, size_t mlen, size_t rlen);
        /*
         * read: Reads a block of data from a specified user address
         */
-       int (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr,
-                       user_ptr src_addr, size_t len);
+       ptl_err_t (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr,
+                             user_ptr src_addr, size_t len);
 
        /*
         * write: Writes a block of data into a specified user address
         */
-       int (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr,
-                        void *src_addr, size_t len);
+       ptl_err_t (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr,
+                              void *src_addr, size_t len);
 
        /*
         * callback: Calls an event callback
+        * NULL => lib calls eq's callback (if any) directly.
         */
-       int (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq,
-                        ptl_event_t *ev);
+       void (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq,
+                            ptl_event_t *ev);
 
        /*
         *  malloc: Acquire a block of memory in a system independent
@@ -74,14 +87,14 @@ struct nal_cb_t {
         * type of *iov depends on options.
         * Set to NULL if not required.
         */
-       int (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, 
-                      void **addrkey);
+       ptl_err_t (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, 
+                            void **addrkey);
        void (*cb_unmap) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, 
                          void **addrkey);
 
        /* as (un)map, but with a set of page fragments */
-       int (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, 
-                            void **addrkey);
+       ptl_err_t (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, 
+                                  void **addrkey);
        void (*cb_unmap_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, 
                          void **addrkey);
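
The reworked cb_recv contract above (receive 'mlen' bytes into the fragments starting at 'offset', discard anything up to 'rlen', return PTL_OK only when committed to calling lib_finalize()) can be illustrated with a simplified single-buffer stand-in. The names and flat buffer below are assumptions for illustration; the real callback takes nal_cb_t/lib_msg_t handles and an iovec or kiov array.

/* Simplified sketch of the recv contract; not the Portals API. */
#include <stddef.h>
#include <string.h>

typedef enum { SK_OK = 0, SK_FAIL = 1 } sk_err_t;

static sk_err_t sketch_cb_recv(char *dst, size_t dst_len, size_t offset,
                               const char *wire, size_t mlen, size_t rlen)
{
        if (mlen > rlen || offset + mlen > dst_len)
                return SK_FAIL;                 /* malformed request */

        memcpy(dst + offset, wire, mlen);       /* deliver the payload */
        /* wire[mlen..rlen) is simply discarded */
        return SK_OK;                           /* committed: finalize follows */
}
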
 
index 3582b94..e9e4635 100644 (file)
@@ -19,7 +19,6 @@
 #include <portals/types.h>
 #include <linux/kp30.h>
 #include <portals/p30.h>
-#include <portals/errno.h>
 #include <portals/lib-types.h>
 #include <portals/lib-nal.h>
 #include <portals/lib-dispatch.h>
@@ -42,7 +41,7 @@ do {                                                    \
         nal->cb_sti(nal, flagsp);                       \
 }
 
-#ifdef PTL_USE_DESC_LISTS
+#ifdef PTL_USE_LIB_FREELIST
 
 #define MAX_MES         2048
 #define MAX_MDS         2048
@@ -98,7 +97,7 @@ lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
 }
 
 static inline lib_md_t *
-lib_md_alloc (nal_cb_t *nal)
+lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd)
 {
         /* NEVER called with statelock held */
         unsigned long  flags;
@@ -142,8 +141,20 @@ lib_me_free (nal_cb_t *nal, lib_me_t *me)
 static inline lib_msg_t *
 lib_msg_alloc (nal_cb_t *nal)
 {
-        /* ALWAYS called with statelock held */
-        return ((lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs));
+        /* NEVER called with statelock held */
+        unsigned long  flags;
+        lib_msg_t     *msg;
+        
+        state_lock (nal, &flags);
+        msg = (lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs);
+        state_unlock (nal, &flags);
+
+        if (msg != NULL) {
+                /* NULL pointers, clear flags etc */
+                memset (msg, 0, sizeof (*msg));
+                msg->ack_wmd = PTL_WIRE_HANDLE_NONE;
+        }
+        return(msg);
 }
 
 static inline void
@@ -155,22 +166,13 @@ lib_msg_free (nal_cb_t *nal, lib_msg_t *msg)
 
 #else
 
-extern atomic_t      md_in_use_count;
-extern atomic_t      msg_in_use_count;
-extern atomic_t      me_in_use_count;
-extern atomic_t      eq_in_use_count;
-
 static inline lib_eq_t *
 lib_eq_alloc (nal_cb_t *nal)
 {
         /* NEVER called with statelock held */
         lib_eq_t *eq;
-        PORTAL_ALLOC(eq, sizeof(*eq));
-
-        if (eq == NULL)
-                return (NULL);
 
-        atomic_inc (&eq_in_use_count);
+        PORTAL_ALLOC(eq, sizeof(*eq));
         return (eq);
 }
 
@@ -178,21 +180,34 @@ static inline void
 lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
 {
         /* ALWAYS called with statelock held */
-        atomic_dec (&eq_in_use_count);
         PORTAL_FREE(eq, sizeof(*eq));
 }
 
 static inline lib_md_t *
-lib_md_alloc (nal_cb_t *nal)
+lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd)
 {
         /* NEVER called with statelock held */
         lib_md_t *md;
-        PORTAL_ALLOC(md, sizeof(*md));
-
-        if (md == NULL)
-                return (NULL);
-
-        atomic_inc (&md_in_use_count);
+        int       size;
+        int       niov;
+
+        if ((umd->options & PTL_MD_KIOV) != 0) {
+                niov = umd->niov;
+                size = offsetof(lib_md_t, md_iov.kiov[niov]);
+        } else {
+                niov = ((umd->options & PTL_MD_IOV) != 0) ?
+                       umd->niov : 1;
+                size = offsetof(lib_md_t, md_iov.iov[niov]);
+        }
+
+        PORTAL_ALLOC(md, size);
+
+        if (md != NULL) {
+                /* Set here in case of early free */
+                md->options = umd->options;
+                md->md_niov = niov;
+        }
+        
         return (md);
 }
 
@@ -200,8 +215,14 @@ static inline void
 lib_md_free (nal_cb_t *nal, lib_md_t *md)
 {
         /* ALWAYS called with statelock held */
-        atomic_dec (&md_in_use_count);
-        PORTAL_FREE(md, sizeof(*md));
+        int       size;
+
+        if ((md->options & PTL_MD_KIOV) != 0)
+                size = offsetof(lib_md_t, md_iov.kiov[md->md_niov]);
+        else
+                size = offsetof(lib_md_t, md_iov.iov[md->md_niov]);
+
+        PORTAL_FREE(md, size);
 }
 
 static inline lib_me_t *
@@ -209,12 +230,8 @@ lib_me_alloc (nal_cb_t *nal)
 {
         /* NEVER called with statelock held */
         lib_me_t *me;
-        PORTAL_ALLOC(me, sizeof(*me));
-
-        if (me == NULL)
-                return (NULL);
 
-        atomic_inc (&me_in_use_count);
+        PORTAL_ALLOC(me, sizeof(*me));
         return (me);
 }
 
@@ -222,21 +239,21 @@ static inline void
 lib_me_free(nal_cb_t *nal, lib_me_t *me)
 {
         /* ALWAYS called with statelock held */
-        atomic_dec (&me_in_use_count);
         PORTAL_FREE(me, sizeof(*me));
 }
 
 static inline lib_msg_t *
 lib_msg_alloc(nal_cb_t *nal)
 {
-        /* ALWAYS called with statelock held */
+        /* NEVER called with statelock held */
         lib_msg_t *msg;
-        PORTAL_ALLOC_ATOMIC(msg, sizeof(*msg));
 
-        if (msg == NULL)
-                return (NULL);
-        
-        atomic_inc (&msg_in_use_count);
+        PORTAL_ALLOC(msg, sizeof(*msg));
+        if (msg != NULL) {
+                /* NULL pointers, clear flags etc */
+                memset (msg, 0, sizeof (*msg));
+                msg->ack_wmd = PTL_WIRE_HANDLE_NONE;
+        }
         return (msg);
 }
 
@@ -244,7 +261,6 @@ static inline void
 lib_msg_free(nal_cb_t *nal, lib_msg_t *msg)
 {
         /* ALWAYS called with statelock held */
-        atomic_dec (&msg_in_use_count);
         PORTAL_FREE(msg, sizeof(*msg));
 }
 #endif
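
lib_md_alloc()/lib_md_free() above now size the descriptor for exactly the number of fragments it carries, using offsetof() on the trailing md_iov union (kiov or iov, whichever the MD options select). The idiom, with simplified stand-in types, looks like this:

/* Variable-size allocation via offsetof() on a trailing array; the
 * real lib_md_t keeps a union of iovec/kiov fragments and picks the
 * branch from umd->options. */
#include <stddef.h>
#include <stdlib.h>

struct demo_frag { void *addr; size_t len; };

struct demo_md {
        unsigned int     options;
        int              md_niov;
        struct demo_frag md_iov[1];     /* really md_niov entries long */
};

static struct demo_md *demo_md_alloc(int niov)
{
        size_t size = offsetof(struct demo_md, md_iov[niov]);
        struct demo_md *md = malloc(size);

        if (md != NULL)
                md->md_niov = niov;     /* remembered so free can recompute size */
        return md;
}

static void demo_md_free(struct demo_md *md)
{
        /* the size is recomputed from md->md_niov, mirroring lib_md_free() */
        free(md);
}
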
@@ -344,26 +360,41 @@ extern char *dispatch_name(int index);
  * Call backs will be made to write events, send acks or
  * replies and so on.
  */
-extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private);
-extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg);
+extern void lib_enq_event_locked (nal_cb_t *nal, void *private,
+                                  lib_eq_t *eq, ptl_event_t *ev);
+extern void lib_finalize (nal_cb_t *nal, void *private, lib_msg_t *msg, 
+                          ptl_err_t status);
+extern void lib_parse (nal_cb_t *nal, ptl_hdr_t *hdr, void *private);
 extern lib_msg_t *lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, 
                                       lib_md_t *getmd);
-extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr);
+extern void print_hdr (nal_cb_t * nal, ptl_hdr_t * hdr);
+
 
 extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov);
-extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len);
-extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *dest, ptl_size_t len);
+extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, 
+                              ptl_size_t offset, ptl_size_t len);
+extern void lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset, 
+                              char *src, ptl_size_t len);
+extern int lib_extract_iov (int dst_niov, struct iovec *dst,
+                            int src_niov, struct iovec *src,
+                            ptl_size_t offset, ptl_size_t len);
 
 extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov);
-extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len);
-extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len);
+extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, 
+                               ptl_size_t offset, ptl_size_t len);
+extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset,
+                               char *src, ptl_size_t len);
+extern int lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, 
+                             int src_niov, ptl_kiov_t *src,
+                             ptl_size_t offset, ptl_size_t len);
+
 extern void lib_assert_wire_constants (void);
 
-extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
-                      ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
-extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
-                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
-                     lib_md_t *md, ptl_size_t offset, ptl_size_t len);
+extern ptl_err_t lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
+                           ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
+extern ptl_err_t lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
+                           ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                           lib_md_t *md, ptl_size_t offset, ptl_size_t len);
 
 extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in,
                                ptl_md_t * md_out);
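
The new ptl_size_t offset parameter threaded through lib_copy_iov2buf(), lib_copy_buf2iov() and the lib_extract_* helpers lets callers address a byte range inside a fragment list without first building a trimmed copy of it. A hedged sketch of the gather direction, with a hypothetical name and plain libc types:

/* Offset-aware gather: skip whole fragments before 'offset', then copy
 * 'len' bytes from the remainder of the iovec chain.  Sketch only. */
#include <assert.h>
#include <string.h>
#include <sys/uio.h>

static void sketch_copy_iov2buf(char *dest, int niov, struct iovec *iov,
                                size_t offset, size_t len)
{
        size_t frag;

        if (len == 0)
                return;

        /* skip complete fragments that lie entirely before 'offset' */
        while (offset >= iov->iov_len) {
                offset -= iov->iov_len;
                iov++;
                niov--;
                assert(niov > 0);
        }

        do {
                assert(niov > 0);
                frag = iov->iov_len - offset;
                if (frag > len)
                        frag = len;

                memcpy(dest, (char *)iov->iov_base + offset, frag);

                dest += frag;
                len -= frag;
                offset = 0;             /* only the first fragment is partial */
                iov++;
                niov--;
        } while (len > 0);
}
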
index d9e3c11..904204b 100644 (file)
@@ -16,7 +16,7 @@
 # include <linux/smp_lock.h>
 # include <linux/types.h>
 #else
-# define PTL_USE_DESC_LISTS
+# define PTL_USE_LIB_FREELIST
 # include <sys/types.h>
 #endif
 
@@ -139,16 +139,9 @@ typedef struct {
 
 struct lib_msg_t {
         struct list_head  msg_list;
-        int               send_ack;
         lib_md_t         *md;
-        ptl_nid_t         nid;
-        ptl_pid_t         pid;
-        ptl_event_t       ev;
         ptl_handle_wire_t ack_wmd;
-        union {
-                struct iovec  iov[PTL_MD_MAX_IOV];
-                ptl_kiov_t    kiov[PTL_MD_MAX_IOV];
-        } msg_iov;
+        ptl_event_t       ev;
 };
 
 struct lib_ptl_t {
@@ -212,9 +205,8 @@ struct lib_md_t {
 };
 
 #define PTL_MD_FLAG_UNLINK            (1 << 0)
-#define PTL_MD_FLAG_AUTO_UNLINKED     (1 << 1)
 
-#ifdef PTL_USE_DESC_LISTS
+#ifdef PTL_USE_LIB_FREELIST
 typedef struct
 {
         void             *fl_objs;             /* single contiguous array of objects */
@@ -262,7 +254,7 @@ typedef struct {
         
         struct list_head ni_test_peers;
         
-#ifdef PTL_USE_DESC_LISTS
+#ifdef PTL_USE_LIB_FREELIST
         lib_freelist_t   ni_free_mes;
         lib_freelist_t   ni_free_msgs;
         lib_freelist_t   ni_free_mds;
index a4ea39b..8b1495e 100644 (file)
@@ -21,7 +21,6 @@
 #include <portals/types.h>
 #include <portals/nal.h>
 #include <portals/api.h>
-#include <portals/errno.h>
 #include <portals/nalids.h>
 
 extern int __p30_initialized;  /* for libraries & test codes  */
index e4ccebf..7ffe797 100644 (file)
@@ -17,6 +17,8 @@ typedef u_int64_t __u64;
 # define do_gettimeofday(tv) gettimeofday(tv, NULL)
 #endif
 
+#include <portals/errno.h>
+
 typedef __u64 ptl_nid_t;
 typedef __u32 ptl_pid_t;
 typedef __u32 ptl_pt_index_t;
@@ -97,7 +99,8 @@ typedef enum {
         PTL_EVENT_PUT,
         PTL_EVENT_REPLY,
         PTL_EVENT_ACK,
-        PTL_EVENT_SENT
+        PTL_EVENT_SENT,
+       PTL_EVENT_UNLINK,
 } ptl_event_kind_t;
 
 #define PTL_SEQ_BASETYPE       long
@@ -112,15 +115,19 @@ typedef unsigned PTL_SEQ_BASETYPE ptl_seq_t;
 #pragma pack(push, 4)
 #endif
 typedef struct {
-        ptl_event_kind_t type;
-        ptl_process_id_t initiator;
-        ptl_pt_index_t portal;
-        ptl_match_bits_t match_bits;
-        ptl_size_t rlength, mlength, offset;
-        ptl_handle_me_t unlinked_me;
-        ptl_md_t mem_desc;
-        ptl_hdr_data_t hdr_data;
-        struct timeval arrival_time;
+        ptl_event_kind_t   type;
+       ptl_err_t          status;
+       int                unlinked;
+        ptl_process_id_t   initiator;
+        ptl_pt_index_t     portal;
+        ptl_match_bits_t   match_bits;
+        ptl_size_t         rlength;
+       ptl_size_t         mlength;
+       ptl_size_t         offset;
+        ptl_md_t           mem_desc;
+        ptl_hdr_data_t     hdr_data;
+        struct timeval     arrival_time;
+
         volatile ptl_seq_t sequence;
 } ptl_event_t;
 #ifdef __CYGWIN__
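
With the reworked ptl_event_t, completion status travels in ev->status and MD detachment is reported through ev->unlinked rather than through the old unlinked_me handle. An illustrative consumer, using simplified stand-in types rather than the Portals headers:

/* Stand-in event type and handler; the field names mirror the new
 * struct, everything else is hypothetical. */
#include <stddef.h>
#include <stdio.h>

typedef enum { EV_OK = 0, EV_FAIL = 1 } ev_err_t;

typedef struct {
        int      type;
        ev_err_t status;
        int      unlinked;
        size_t   mlength;
} demo_event_t;

static void demo_eq_callback(demo_event_t *ev)
{
        if (ev->status != EV_OK) {
                fprintf(stderr, "event %d failed (%d)\n", ev->type, ev->status);
        } else {
                printf("event %d delivered %lu bytes\n",
                       ev->type, (unsigned long)ev->mlength);
        }

        if (ev->unlinked) {
                /* the MD was detached with this event; drop anything
                 * still referring to it */
                printf("MD unlinked\n");
        }
}
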
index 53757ab..cdde5b7 100644 (file)
@@ -353,8 +353,6 @@ int gmnal_cb_read(nal_cb_t *, void *private, void *, user_ptr, size_t);
 
 int gmnal_cb_write(nal_cb_t *, void *private, user_ptr, void *, size_t);
 
-int gmnal_cb_callback(nal_cb_t *, void *, lib_eq_t *, ptl_event_t *);
-
 void *gmnal_cb_malloc(nal_cb_t *, size_t);
 
 void gmnal_cb_free(nal_cb_t *, void *, size_t);
@@ -384,7 +382,7 @@ void  gmnal_fini(void);
                                a->cb_recv_pages = gmnal_cb_recv_pages; \
                                a->cb_read = gmnal_cb_read; \
                                a->cb_write = gmnal_cb_write; \
-                               a->cb_callback = gmnal_cb_callback; \
+                               a->cb_callback = NULL; \
                                a->cb_malloc = gmnal_cb_malloc; \
                                a->cb_free = gmnal_cb_free; \
                                a->cb_map = NULL; \
index 6ae91db..e055242 100644 (file)
@@ -126,7 +126,6 @@ int gmnal_cb_send(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
                                niov, iov, len);
        } else {
               CDEBUG(D_ERROR, "Large message send is not supported\n");
-               lib_finalize(nal_cb, private, cookie);
                return(PTL_FAIL);
                gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, pid, 
                                niov, iov, len);
@@ -200,18 +199,6 @@ int gmnal_cb_write(nal_cb_t *nal_cb, void *private, user_ptr dst,
        return(PTL_OK);
 }
 
-int gmnal_cb_callback(nal_cb_t *nal_cb, void *private, lib_eq_t *eq, 
-                      ptl_event_t *ev)
-{
-
-       if (eq->event_callback != NULL) {
-               CDEBUG(D_INFO, "found callback\n");
-               eq->event_callback(ev);
-       }
-       
-       return(PTL_OK);
-}
-
 void *gmnal_cb_malloc(nal_cb_t *nal_cb, size_t len)
 {
        void *ptr = NULL;
index 4171df6..a0d3530 100644 (file)
@@ -321,7 +321,6 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
 
        if (!private) {
                CDEBUG(D_ERROR, "gmnal_small_rx no context\n");
-               lib_finalize(nal_cb, private, cookie);
                return(PTL_FAIL);
        }
 
@@ -343,10 +342,8 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
         *      let portals library know receive is complete
         */
        CDEBUG(D_PORTALS, "calling lib_finalize\n");
-       if (lib_finalize(nal_cb, private, cookie) != PTL_OK) {
-               /* TO DO what to do with failed lib_finalise? */
-               CDEBUG(D_INFO, "lib_finalize failed\n");
-       }
+       lib_finalize(nal_cb, private, cookie, PTL_OK);
+
        /*
         *      return buffer so it can be used again
         */
@@ -590,10 +587,8 @@ gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status)
                return;
        }
        gmnal_return_stxd(nal_data, stxd);
-       if (lib_finalize(nal_cb, stxd, cookie) != PTL_OK) {
-               CDEBUG(D_INFO, "Call to lib_finalize failed for stxd [%p]\n", 
-                      stxd);
-       }
+       lib_finalize(nal_cb, stxd, cookie, PTL_OK);
+
        return;
 }
 
@@ -817,7 +812,6 @@ gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
 
        if (!srxd) {
                CDEBUG(D_ERROR, "gmnal_large_rx no context\n");
-               lib_finalize(nal_cb, private, cookie);
                return(PTL_FAIL);
        }
 
@@ -1114,10 +1108,7 @@ gmnal_remote_get_callback(gm_port_t *gm_port, void *context,
         *      Let our client application proceed
         */     
        CDEBUG(D_ERROR, "final callback context[%p]\n", srxd);
-       if (lib_finalize(nal_cb, srxd, srxd->cookie) != PTL_OK) {
-               CDEBUG(D_INFO, "Call to lib_finalize failed for srxd [%p]\n", 
-                      srxd);
-       }
+       lib_finalize(nal_cb, srxd, srxd->cookie, PTL_OK);
 
        /*
         *      send an ack to the sender to let him know we got the data
@@ -1282,10 +1273,7 @@ gmnal_large_tx_ack_received(gmnal_data_t *nal_data, gmnal_srxd_t *srxd)
 
        CDEBUG(D_INFO, "gmnal_large_tx_ack_received stxd [%p]\n", stxd);
 
-       if (lib_finalize(nal_cb, stxd, stxd->cookie) != PTL_OK) {
-               CDEBUG(D_INFO, "Call to lib_finalize failed for stxd [%p]\n", 
-                      stxd);
-       }
+       lib_finalize(nal_cb, stxd, stxd->cookie, PTL_OK);
 
        /*
         *      extract the iovec from the stxd, deregister the memory.
index 2c07cc4..0688062 100644 (file)
@@ -306,7 +306,7 @@ kibnal_send(nal_cb_t        *nal,
           if(buf_length > MAX_MSG_SIZE) { 
              CERROR("kibnal_send:request exceeds Transmit data size (%d).\n",
                       MAX_MSG_SIZE);
-             rc = -1;
+             rc = PTL_FAIL;
              return rc;
           }
           else {
@@ -363,7 +363,7 @@ kibnal_send(nal_cb_t        *nal,
 
         PROF_FINISH(kibnal_send); // time stapm of send operation 
 
-        rc = 1;
+        rc = PTL_OK;
 
         return rc; 
 }
@@ -386,7 +386,7 @@ int kibnal_send_pages(nal_cb_t * nal,
                       ptl_kiov_t *iov, 
                       size_t mlen)
 {
-   int rc = 1;
+   int rc = PTL_FAIL;
 
    CDEBUG(D_NET, "kibnal_send_pages\n");
 
@@ -420,7 +420,7 @@ void kibnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
 //
 // do you need this 
 //
-int kibnal_callback(nal_cb_t * nal, 
+void kibnal_callback(nal_cb_t * nal, 
                            void *private, 
                            lib_eq_t *eq,
                            ptl_event_t *ev)
@@ -507,7 +507,7 @@ kibnal_recv_pages(nal_cb_t * nal,
 {
 
   CDEBUG(D_NET, "recv_pages not implemented\n");
-  return PTL_OK;
+  return PTL_FAIL;
        
 }
 
@@ -526,11 +526,12 @@ kibnal_recv(nal_cb_t     *nal,
         CDEBUG(D_NET,"kibnal_recv: mlen=%d, rlen=%d\n", mlen, rlen);
 
         /* What was actually received must be >= what sender claims to
-         * have sent.  This is an LASSERT, since lib-move doesn't
-         * check cb return code yet. */
-        LASSERT (krx->krx_len >= sizeof (ptl_hdr_t) + rlen);
+         * have sent. */
         LASSERT (mlen <= rlen);
 
+        if (krx->krx_len < sizeof (ptl_hdr_t) + rlen)
+                return (PTL_FAIL);
+
         PROF_START(kibnal_recv);
 
         if(mlen != 0) {
@@ -542,12 +543,12 @@ kibnal_recv(nal_cb_t     *nal,
 
         PROF_START(lib_finalize);
         
-        lib_finalize(nal, private, cookie);
+        lib_finalize(nal, private, cookie, PTL_OK);
         
         PROF_FINISH(lib_finalize);
         PROF_FINISH(kibnal_recv);
 
-        return rlen;
+        return PTL_OK;
 }
 
 //
index 96749cd..4c2bd6a 100644 (file)
@@ -33,7 +33,7 @@ EP_STATUSBLK  kqswnal_rpc_failed;
  *  LIB functions follow
  *
  */
-static int
+static ptl_err_t
 kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr,
              size_t len)
 {
@@ -41,10 +41,10 @@ kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr,
                 nal->ni.nid, len, src_addr, dst_addr );
         memcpy( dst_addr, src_addr, len );
 
-        return (0);
+        return (PTL_OK);
 }
 
-static int
+static ptl_err_t
 kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr,
               size_t len)
 {
@@ -52,7 +52,7 @@ kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr,
                 nal->ni.nid, len, src_addr, dst_addr );
         memcpy( dst_addr, src_addr, len );
 
-        return (0);
+        return (PTL_OK);
 }
 
 static void *
@@ -157,13 +157,12 @@ kqswnal_unmap_tx (kqswnal_tx_t *ktx)
         elan3_dvma_unload(kqswnal_data.kqn_ep->DmaState,
                           kqswnal_data.kqn_eptxdmahandle,
                           ktx->ktx_basepage, ktx->ktx_nmappedpages);
-
 #endif
         ktx->ktx_nmappedpages = 0;
 }
 
 int
-kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov)
+kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_t *kiov)
 {
         int       nfrags    = ktx->ktx_nfrag;
         int       nmapped   = ktx->ktx_nmappedpages;
@@ -188,8 +187,16 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov)
         LASSERT (niov > 0);
         LASSERT (nob > 0);
 
+        /* skip complete frags before 'offset' */
+        while (offset >= kiov->kiov_len) {
+                offset -= kiov->kiov_len;
+                kiov++;
+                niov--;
+                LASSERT (niov > 0);
+        }
+
         do {
-                int  fraglen = kiov->kiov_len;
+                int  fraglen = kiov->kiov_len - offset;
 
                 /* nob exactly spans the iovs */
                 LASSERT (fraglen <= nob);
@@ -212,7 +219,7 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov)
                 /* XXX this is really crap, but we'll have to kmap until
                  * EKC has a page (rather than vaddr) mapping interface */
 
-                ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+                ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset;
 
                 CDEBUG(D_NET,
                        "%p[%d] loading %p for %d, page %d, %d total\n",
@@ -257,6 +264,7 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov)
                 kiov++;
                 niov--;
                 nob -= fraglen;
+                offset = 0;
 
                 /* iov must not run out before end of data */
                 LASSERT (nob == 0 || niov > 0);
@@ -271,7 +279,8 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov)
 }
 
 int
-kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov)
+kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob, 
+                    int niov, struct iovec *iov)
 {
         int       nfrags    = ktx->ktx_nfrag;
         int       nmapped   = ktx->ktx_nmappedpages;
@@ -295,8 +304,16 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov)
         LASSERT (niov > 0);
         LASSERT (nob > 0);
 
+        /* skip complete frags before offset */
+        while (offset >= iov->iov_len) {
+                offset -= iov->iov_len;
+                iov++;
+                niov--;
+                LASSERT (niov > 0);
+        }
+        
         do {
-                int  fraglen = iov->iov_len;
+                int  fraglen = iov->iov_len - offset;
                 long npages  = kqswnal_pages_spanned (iov->iov_base, fraglen);
 
                 /* nob exactly spans the iovs */
@@ -317,12 +334,12 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov)
 
                 CDEBUG(D_NET,
                        "%p[%d] loading %p for %d, pages %d for %ld, %d total\n",
-                        ktx, nfrags, iov->iov_base, fraglen, basepage, npages,
-                        nmapped);
+                       ktx, nfrags, iov->iov_base + offset, fraglen, 
+                       basepage, npages, nmapped);
 
 #if MULTIRAIL_EKC
                 ep_dvma_load(kqswnal_data.kqn_ep, NULL,
-                             iov->iov_base, fraglen,
+                             iov->iov_base + offset, fraglen,
                              kqswnal_data.kqn_ep_tx_nmh, basepage,
                              &railmask, &ktx->ktx_frags[nfrags]);
 
@@ -336,7 +353,7 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov)
 #else
                 elan3_dvma_kaddr_load (kqswnal_data.kqn_ep->DmaState,
                                        kqswnal_data.kqn_eptxdmahandle,
-                                       iov->iov_base, fraglen,
+                                       iov->iov_base + offset, fraglen,
                                        basepage, &ktx->ktx_frags[nfrags].Base);
 
                 if (nfrags > 0 &&                /* previous frag mapped */
@@ -357,6 +374,7 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov)
                 iov++;
                 niov--;
                 nob -= fraglen;
+                offset = 0;
 
                 /* iov must not run out before end of data */
                 LASSERT (nob == 0 || niov > 0);
@@ -483,7 +501,7 @@ void
 kqswnal_tx_done (kqswnal_tx_t *ktx, int error)
 {
         lib_msg_t     *msg;
-        lib_msg_t     *repmsg;
+        lib_msg_t     *repmsg = NULL;
 
         switch (ktx->ktx_state) {
         case KTX_FORWARDING:       /* router asked me to forward this packet */
@@ -493,21 +511,29 @@ kqswnal_tx_done (kqswnal_tx_t *ktx, int error)
 
         case KTX_SENDING:          /* packet sourced locally */
                 lib_finalize (&kqswnal_lib, ktx->ktx_args[0],
-                              (lib_msg_t *)ktx->ktx_args[1]);
+                              (lib_msg_t *)ktx->ktx_args[1],
+                              (error == 0) ? PTL_OK : 
+                              (error == -ENOMEM) ? PTL_NOSPACE : PTL_FAIL);
                 break;
 
         case KTX_GETTING:          /* Peer has DMA-ed direct? */
                 msg = (lib_msg_t *)ktx->ktx_args[1];
-                repmsg = NULL;
 
-                if (error == 0) 
+                if (error == 0) {
                         repmsg = lib_fake_reply_msg (&kqswnal_lib, 
                                                      ktx->ktx_nid, msg->md);
+                        if (repmsg == NULL)
+                                error = -ENOMEM;
+                }
                 
-                lib_finalize (&kqswnal_lib, ktx->ktx_args[0], msg);
-
-                if (repmsg != NULL) 
-                        lib_finalize (&kqswnal_lib, NULL, repmsg);
+                if (error == 0) {
+                        lib_finalize (&kqswnal_lib, ktx->ktx_args[0], 
+                                      msg, PTL_OK);
+                        lib_finalize (&kqswnal_lib, NULL, repmsg, PTL_OK);
+                } else {
+                        lib_finalize (&kqswnal_lib, ktx->ktx_args[0], msg,
+                                      (error == -ENOMEM) ? PTL_NOSPACE : PTL_FAIL);
+                }
                 break;
 
         default:
@@ -533,7 +559,7 @@ kqswnal_txhandler(EP_TXD *txd, void *arg, int status)
                         ktx->ktx_nid, status);
 
                 kqswnal_notify_peer_down(ktx);
-                status = -EIO;
+                status = -EHOSTDOWN;
 
         } else if (ktx->ktx_state == KTX_GETTING) {
                 /* RPC completed OK; what did our peer put in the status
@@ -745,7 +771,8 @@ kqswnal_eiovs2datav (int ndv, EP_DATAVEC *dv,
 
 int
 kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag, 
-                   struct iovec *iov, ptl_kiov_t *kiov, int nob)
+                   struct iovec *iov, ptl_kiov_t *kiov, 
+                   int offset, int nob)
 {
         kqswnal_rx_t       *krx = (kqswnal_rx_t *)ktx->ktx_args[0];
         char               *buffer = (char *)page_address(krx->krx_pages[0]);
@@ -779,9 +806,9 @@ kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag,
         /* Map the source data... */
         ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0;
         if (kiov != NULL)
-                rc = kqswnal_map_tx_kiov (ktx, nob, nfrag, kiov);
+                rc = kqswnal_map_tx_kiov (ktx, offset, nob, nfrag, kiov);
         else
-                rc = kqswnal_map_tx_iov (ktx, nob, nfrag, iov);
+                rc = kqswnal_map_tx_iov (ktx, offset, nob, nfrag, iov);
 
         if (rc != 0) {
                 CERROR ("Can't map source data: %d\n", rc);
@@ -846,7 +873,7 @@ kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag,
         return (-ECONNABORTED);
 }
 
-static int
+static ptl_err_t
 kqswnal_sendmsg (nal_cb_t     *nal,
                  void         *private,
                  lib_msg_t    *libmsg,
@@ -857,6 +884,7 @@ kqswnal_sendmsg (nal_cb_t     *nal,
                  unsigned int  payload_niov,
                  struct iovec *payload_iov,
                  ptl_kiov_t   *payload_kiov,
+                 size_t        payload_offset,
                  size_t        payload_nob)
 {
         kqswnal_tx_t      *ktx;
@@ -865,6 +893,7 @@ kqswnal_sendmsg (nal_cb_t     *nal,
 #if KQSW_CHECKSUM
         int                i;
         kqsw_csum_t        csum;
+        int                sumoff;
         int                sumnob;
 #endif
         
@@ -928,9 +957,9 @@ kqswnal_sendmsg (nal_cb_t     *nal,
                 }
 
                 /* peer expects RPC completion with GET data */
-                rc = kqswnal_dma_reply (ktx,
-                                        payload_niov, payload_iov, 
-                                        payload_kiov, payload_nob);
+                rc = kqswnal_dma_reply (ktx, payload_niov, 
+                                        payload_iov, payload_kiov, 
+                                        payload_offset, payload_nob);
                 if (rc == 0)
                         return (PTL_OK);
                 
@@ -945,22 +974,39 @@ kqswnal_sendmsg (nal_cb_t     *nal,
 #if KQSW_CHECKSUM
         csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr));
         memcpy (ktx->ktx_buffer + sizeof (*hdr), &csum, sizeof (csum));
-        for (csum = 0, i = 0, sumnob = payload_nob; sumnob > 0; i++) {
+        for (csum = 0, i = 0, sumoff = payload_offset, sumnob = payload_nob; sumnob > 0; i++) {
+                LASSERT(i < niov);
                 if (payload_kiov != NULL) {
                         ptl_kiov_t *kiov = &payload_kiov[i];
-                        char       *addr = ((char *)kmap (kiov->kiov_page)) +
-                                           kiov->kiov_offset;
-                        
-                        csum = kqsw_csum (csum, addr, MIN (sumnob, kiov->kiov_len));
-                        sumnob -= kiov->kiov_len;
+
+                        if (sumoff >= kiov->kiov_len) {
+                                sumoff -= kiov->kiov_len;
+                        } else {
+                                char *addr = ((char *)kmap (kiov->kiov_page)) +
+                                             kiov->kiov_offset + sumoff;
+                                int   fragnob = kiov->kiov_len - sumoff;
+
+                                csum = kqsw_csum(csum, addr, MIN(sumnob, fragnob));
+                                sumnob -= fragnob;
+                                sumoff = 0;
+                                kunmap(kiov->kiov_page);
+                        }
                 } else {
                         struct iovec *iov = &payload_iov[i];
 
-                        csum = kqsw_csum (csum, iov->iov_base, MIN (sumnob, kiov->iov_len));
-                        sumnob -= iov->iov_len;
+                        if (sumoff > iov->iov_len) {
+                                sumoff -= iov->iov_len;
+                        } else {
+                                char *addr = iov->iov_base + sumoff;
+                                int   fragnob = iov->iov_len - sumoff;
+                                
+                                csum = kqsw_csum(csum, addr, MIN(sumnob, fragnob));
+                                sumnob -= fragnob;
+                                sumoff = 0;
+                        }
                 }
         }
-        memcpy(ktx->ktx_buffer +sizeof(*hdr) +sizeof(csum), &csum,sizeof(csum));
+        memcpy(ktx->ktx_buffer + sizeof(*hdr) + sizeof(csum), &csum, sizeof(csum));
 #endif
         
         if (kqswnal_data.kqn_optimized_gets &&
@@ -987,10 +1033,10 @@ kqswnal_sendmsg (nal_cb_t     *nal,
                 ktx->ktx_state = KTX_GETTING;
 
                 if ((libmsg->md->options & PTL_MD_KIOV) != 0) 
-                        rc = kqswnal_map_tx_kiov (ktx, md->length,
+                        rc = kqswnal_map_tx_kiov (ktx, 0, md->length,
                                                   md->md_niov, md->md_iov.kiov);
                 else
-                        rc = kqswnal_map_tx_iov (ktx, md->length,
+                        rc = kqswnal_map_tx_iov (ktx, 0, md->length,
                                                  md->md_niov, md->md_iov.iov);
 
                 if (rc < 0) {
@@ -1033,10 +1079,12 @@ kqswnal_sendmsg (nal_cb_t     *nal,
                 if (payload_nob > 0) {
                         if (payload_kiov != NULL)
                                 lib_copy_kiov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE,
-                                                   payload_niov, payload_kiov, payload_nob);
+                                                   payload_niov, payload_kiov, 
+                                                   payload_offset, payload_nob);
                         else
                                 lib_copy_iov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE,
-                                                  payload_niov, payload_iov, payload_nob);
+                                                  payload_niov, payload_iov, 
+                                                  payload_offset, payload_nob);
                 }
         } else {
 
@@ -1052,10 +1100,10 @@ kqswnal_sendmsg (nal_cb_t     *nal,
                 ktx->ktx_frags[0].Len = KQSW_HDR_SIZE;
 #endif
                 if (payload_kiov != NULL)
-                        rc = kqswnal_map_tx_kiov (ktx, payload_nob, 
+                        rc = kqswnal_map_tx_kiov (ktx, payload_offset, payload_nob, 
                                                   payload_niov, payload_kiov);
                 else
-                        rc = kqswnal_map_tx_iov (ktx, payload_nob,
+                        rc = kqswnal_map_tx_iov (ktx, payload_offset, payload_nob,
                                                  payload_niov, payload_iov);
                 if (rc != 0) {
                         kqswnal_put_idle_tx (ktx);
@@ -1078,7 +1126,7 @@ kqswnal_sendmsg (nal_cb_t     *nal,
         return (PTL_OK);
 }
 
-static int
+static ptl_err_t
 kqswnal_send (nal_cb_t     *nal,
               void         *private,
               lib_msg_t    *libmsg,
@@ -1088,13 +1136,15 @@ kqswnal_send (nal_cb_t     *nal,
               ptl_pid_t     pid,
               unsigned int  payload_niov,
               struct iovec *payload_iov,
+              size_t        payload_offset,
               size_t        payload_nob)
 {
         return (kqswnal_sendmsg (nal, private, libmsg, hdr, type, nid, pid,
-                                 payload_niov, payload_iov, NULL, payload_nob));
+                                 payload_niov, payload_iov, NULL, 
+                                 payload_offset, payload_nob));
 }
 
-static int
+static ptl_err_t
 kqswnal_send_pages (nal_cb_t     *nal,
                     void         *private,
                     lib_msg_t    *libmsg,
@@ -1104,10 +1154,12 @@ kqswnal_send_pages (nal_cb_t     *nal,
                     ptl_pid_t     pid,
                     unsigned int  payload_niov,
                     ptl_kiov_t   *payload_kiov,
+                    size_t        payload_offset,
                     size_t        payload_nob)
 {
         return (kqswnal_sendmsg (nal, private, libmsg, hdr, type, nid, pid,
-                                 payload_niov, NULL, payload_kiov, payload_nob));
+                                 payload_niov, NULL, payload_kiov, 
+                                 payload_offset, payload_nob));
 }
 
 void
@@ -1161,7 +1213,7 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
             nob <= KQSW_TX_BUFFER_SIZE) 
         {
                 /* send from ktx's pre-mapped contiguous buffer? */
-                lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, nob);
+                lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, 0, nob);
 #if MULTIRAIL_EKC
                 ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer,
                               0, nob);
@@ -1176,7 +1228,7 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
         {
                 /* zero copy */
                 ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0;
-                rc = kqswnal_map_tx_iov (ktx, nob, niov, iov);
+                rc = kqswnal_map_tx_iov (ktx, 0, nob, niov, iov);
                 if (rc != 0)
                         goto failed;
 
@@ -1231,7 +1283,8 @@ kqswnal_dma_reply_complete (EP_RXD *rxd)
         krx->krx_rpc_reply_needed = 0;
         kqswnal_rx_done (krx);
 
-        lib_finalize (&kqswnal_lib, NULL, msg);
+        lib_finalize (&kqswnal_lib, NULL, msg,
+                      (status == EP_SUCCESS) ? PTL_OK : PTL_FAIL);
         kqswnal_put_idle_tx (ktx);
 }
 
@@ -1461,13 +1514,14 @@ kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr)
 }
 #endif
 
-static int
+static ptl_err_t
 kqswnal_recvmsg (nal_cb_t     *nal,
                  void         *private,
                  lib_msg_t    *libmsg,
                  unsigned int  niov,
                  struct iovec *iov,
                  ptl_kiov_t   *kiov,
+                 size_t        offset,
                  size_t        mlen,
                  size_t        rlen)
 {
@@ -1498,10 +1552,13 @@ kqswnal_recvmsg (nal_cb_t     *nal,
 #endif
         CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", mlen, rlen);
 
-        /* What was actually received must be >= payload.
-         * This is an LASSERT, as lib_finalize() doesn't have a completion status. */
-        LASSERT (krx->krx_nob >= KQSW_HDR_SIZE + mlen);
+        /* What was actually received must be >= payload. */
         LASSERT (mlen <= rlen);
+        if (krx->krx_nob < KQSW_HDR_SIZE + mlen) {
+                CERROR("Bad message size: have %d, need %d + %d\n",
+                       krx->krx_nob, KQSW_HDR_SIZE, mlen);
+                return (PTL_FAIL);
+        }
 
         /* It must be OK to kmap() if required */
         LASSERT (kiov == NULL || !in_interrupt ());
@@ -1516,20 +1573,37 @@ kqswnal_recvmsg (nal_cb_t     *nal,
                 page_nob = PAGE_SIZE - KQSW_HDR_SIZE;
 
                 LASSERT (niov > 0);
+                
                 if (kiov != NULL) {
-                        iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
-                        iov_nob = kiov->kiov_len;
+                        /* skip complete frags */
+                        while (offset >= kiov->kiov_len) {
+                                offset -= kiov->kiov_len;
+                                kiov++;
+                                niov--;
+                                LASSERT (niov > 0);
+                        }
+                        iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset;
+                        iov_nob = kiov->kiov_len - offset;
                 } else {
-                        iov_ptr = iov->iov_base;
-                        iov_nob = iov->iov_len;
+                        /* skip complete frags */
+                        while (offset >= iov->iov_len) {
+                                offset -= iov->iov_len;
+                                iov++;
+                                niov--;
+                                LASSERT (niov > 0);
+                        }
+                        iov_ptr = iov->iov_base + offset;
+                        iov_nob = iov->iov_len - offset;
                 }
-
+                
                 for (;;)
                 {
-                        /* We expect the iov to exactly match mlen */
-                        LASSERT (iov_nob <= mlen);
-                        
-                        frag = MIN (page_nob, iov_nob);
+                        frag = mlen;
+                        if (frag > page_nob)
+                                frag = page_nob;
+                        if (frag > iov_nob)
+                                frag = iov_nob;
+
                         memcpy (iov_ptr, page_ptr, frag);
 #if KQSW_CHECKSUM
                         payload_csum = kqsw_csum (payload_csum, iov_ptr, frag);
@@ -1588,33 +1662,39 @@ kqswnal_recvmsg (nal_cb_t     *nal,
                        "csum_nob %d\n",
                         hdr_csum, payload_csum, csum_frags, csum_nob);
 #endif
-        lib_finalize(nal, private, libmsg);
+        lib_finalize(nal, private, libmsg, PTL_OK);
 
-        return (rlen);
+        return (PTL_OK);
 }
 
-static int
+static ptl_err_t
 kqswnal_recv(nal_cb_t     *nal,
              void         *private,
              lib_msg_t    *libmsg,
              unsigned int  niov,
              struct iovec *iov,
+             size_t        offset,
              size_t        mlen,
              size_t        rlen)
 {
-        return (kqswnal_recvmsg (nal, private, libmsg, niov, iov, NULL, mlen, rlen));
+        return (kqswnal_recvmsg(nal, private, libmsg, 
+                                niov, iov, NULL, 
+                                offset, mlen, rlen));
 }
 
-static int
+static ptl_err_t
 kqswnal_recv_pages (nal_cb_t     *nal,
                     void         *private,
                     lib_msg_t    *libmsg,
                     unsigned int  niov,
                     ptl_kiov_t   *kiov,
+                    size_t        offset,
                     size_t        mlen,
                     size_t        rlen)
 {
-        return (kqswnal_recvmsg (nal, private, libmsg, niov, NULL, kiov, mlen, rlen));
+        return (kqswnal_recvmsg(nal, private, libmsg, 
+                                niov, NULL, kiov, 
+                                offset, mlen, rlen));
 }
 
 int
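
The qswnal hunks above thread a new payload_offset/offset argument through the send and receive callbacks, skipping whole fragments before any data is touched. A minimal, self-contained sketch of that walk over plain struct iovec (the helper name is hypothetical, not part of this change):

#include <stddef.h>
#include <sys/uio.h>

/* Advance (*iovp, *niovp) past 'offset' bytes of payload and return the
 * residual offset into the first remaining fragment; this mirrors the
 * "skip complete frags" loops added above.  The caller must guarantee the
 * iov set covers 'offset', as the LASSERTs in the patch do. */
static size_t
skip_iov_offset(struct iovec **iovp, unsigned int *niovp, size_t offset)
{
        struct iovec *iov  = *iovp;
        unsigned int  niov = *niovp;

        while (offset >= iov->iov_len) {
                offset -= iov->iov_len;
                iov++;
                niov--;
        }

        *iovp  = iov;
        *niovp = niov;
        return offset;
}

The residual offset is then folded into the first copy or mapping, exactly as kqswnal_recvmsg() does when it computes iov_ptr and iov_nob.
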
index b31c2ea..52afb98 100644 (file)
@@ -176,7 +176,8 @@ kscimacnal_txrelease(mac_mblk_t *msg, mac_msg_status_t status, void *context)
                         break;
         }
 
-        lib_finalize(ktx->ktx_nal, ktx->ktx_private, ktx->ktx_cookie);
+        lib_finalize(ktx->ktx_nal, ktx->ktx_private, ktx->ktx_cookie,
+                     (err == 0) ? PTL_OK : PTL_FAIL);
 
         PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
 }
@@ -225,14 +226,14 @@ kscimacnal_sendmsg(nal_cb_t        *nal,
         if (buf_len > mac_get_mtusize(ksci->ksci_machandle)) {
                 CERROR("kscimacnal:request exceeds TX MTU size (%ld).\n",
                                 mac_get_mtusize(ksci->ksci_machandle));
-                return -EINVAL;
+                return PTL_FAIL;
         }
 
 
         /* save transaction info for later finalize and cleanup */
         PORTAL_ALLOC(ktx, (sizeof(kscimacnal_tx_t)));
         if (!ktx) {
-                return -ENOMEM;
+                return PTL_NOSPACE;
         }
 
         ktx->ktx_nmapped = 0; /* Start with no mapped pages :) */
@@ -247,7 +248,7 @@ kscimacnal_sendmsg(nal_cb_t        *nal,
                         kscimacnal_txrelease, ktx);
         if (!msg) {
                 PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
-                return -ENOMEM;
+                return PTL_NOSPACE;
         }
         mac_put_mblk(msg, sizeof(ptl_hdr_t));
         lastblk=msg;
@@ -284,7 +285,7 @@ kscimacnal_sendmsg(nal_cb_t        *nal,
                 if(!newblk) {
                         mac_free_msg(msg);
                         PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
-                        return -ENOMEM;
+                        return PTL_NOSPACE;
                 }
                 mac_put_mblk(newblk, nob);
                 mac_link_mblk(lastblk, newblk);
@@ -315,10 +316,10 @@ kscimacnal_sendmsg(nal_cb_t        *nal,
                 CERROR("kscimacnal: mac_send() failed, rc=%d\n", rc);
                 mac_free_msg(msg);
                 PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
-                return rc;
+                return PTL_FAIL;
         }
 
-        return 0;
+        return PTL_OK;
 }
 
 
@@ -463,12 +464,15 @@ kscimacnal_recvmsg(nal_cb_t     *nal,
                         krx->msg, mlen, rlen, niov);
 
         /* What was actually received must be >= what sender claims to have
-         * sent.  This is an LASSERT, since lib-move doesn't check cb return
-         * code yet. Also, rlen seems to be negative when mlen==0 so don't
-         * assert on that.
-         */
-        LASSERT (mlen==0 || mac_msg_size(krx->msg) >= sizeof(ptl_hdr_t)+rlen);
-        LASSERT (mlen==0 || mlen <= rlen);
+         * sent. */
+        LASSERT (mlen <= rlen); /* something is wrong if this isn't true */
+        if (mac_msg_size(krx->msg) < sizeof(ptl_hdr_t)+mlen) {
+                /* We didn't receive everything lib thinks we did */
+                CERROR("Bad message size: have %d, need %d + %d\n",
+                       mac_msg_size(krx->msg), sizeof(ptl_hdr_t), mlen);
+                return (PTL_FAIL);
+        }
+
         /* It must be OK to kmap() if required */
         LASSERT (kiov == NULL || !in_interrupt ());
         /* Either all pages or all vaddrs */
@@ -545,12 +549,12 @@ kscimacnal_recvmsg(nal_cb_t     *nal,
         CDEBUG(D_NET, "Calling lib_finalize.\n");
 
         PROF_START(lib_finalize);
-        lib_finalize(nal, private, cookie);
+        lib_finalize(nal, private, cookie, PTL_OK);
         PROF_FINISH(lib_finalize);
 
         CDEBUG(D_NET, "Done.\n");
 
-        return rlen;
+        return PTL_OK;
 }
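
The kscimacnal callbacks above stop returning -errno values and byte counts; like the other NALs in this change they now report ptl_err_t codes, and the oversize-receive LASSERT becomes a PTL_FAIL return. As a hedged sketch only (this helper does not exist in the patch, and it assumes ptl_err_t and its constants from the portals headers), the mapping being applied by hand is:

/* Hypothetical helper: allocation failures become PTL_NOSPACE, success
 * PTL_OK, anything else PTL_FAIL. */
static ptl_err_t
kscimacnal_errno2ptl(int err)
{
        if (err == 0)
                return PTL_OK;
        if (err == -ENOMEM)
                return PTL_NOSPACE;
        return PTL_FAIL;
}
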
 
 
index 9ae1c87..c47dcb4 100644 (file)
@@ -993,15 +993,11 @@ ksocknal_destroy_conn (ksock_conn_t *conn)
         /* complete current receive if any */
         switch (conn->ksnc_rx_state) {
         case SOCKNAL_RX_BODY:
-#if 0
-                lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie);
-#else
-                CERROR ("Refusing to complete a partial receive from "
-                        LPX64", ip %d.%d.%d.%d:%d\n", conn->ksnc_peer->ksnp_nid,
-                        HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
-                CERROR ("This may hang communications and "
-                        "prevent modules from unloading\n");
-#endif
+                CERROR("Completing partial receive from "LPX64
+                       ", ip %d.%d.%d.%d:%d, with error\n",
+                       conn->ksnc_peer->ksnp_nid,
+                       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+                lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_FAIL);
                 break;
         case SOCKNAL_RX_BODY_FWD:
                 ksocknal_fmb_callback (conn->ksnc_cookie, -ECONNABORTED);
index 82d4c64..3ecead1 100644 (file)
@@ -29,7 +29,7 @@
  *  LIB functions follow
  *
  */
-int
+ptl_err_t
 ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr,
               user_ptr src_addr, size_t len)
 {
@@ -37,10 +37,10 @@ ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr,
                nal->ni.nid, (long)len, src_addr, dst_addr);
 
         memcpy( dst_addr, src_addr, len );
-        return 0;
+        return PTL_OK;
 }
 
-int
+ptl_err_t
 ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr,
                void *src_addr, size_t len)
 {
@@ -48,20 +48,7 @@ ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr,
                nal->ni.nid, (long)len, src_addr, dst_addr);
 
         memcpy( dst_addr, src_addr, len );
-        return 0;
-}
-
-int
-ksocknal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq,
-                         ptl_event_t *ev)
-{
-        CDEBUG(D_NET, LPX64": callback eq %p ev %p\n",
-               nal->ni.nid, eq, ev);
-
-        if (eq->event_callback != NULL)
-                eq->event_callback(ev);
-
-        return 0;
+        return PTL_OK;
 }
 
 void *
@@ -617,7 +604,8 @@ ksocknal_tx_done (ksock_tx_t *tx, int asynch)
 
         if (tx->tx_isfwd) {             /* was a forwarded packet? */
                 kpr_fwd_done (&ksocknal_data.ksnd_router,
-                              KSOCK_TX_2_KPR_FWD_DESC (tx), 0);
+                              KSOCK_TX_2_KPR_FWD_DESC (tx), 
+                              (tx->tx_resid == 0) ? 0 : -ECONNABORTED);
                 EXIT;
                 return;
         }
@@ -625,7 +613,8 @@ ksocknal_tx_done (ksock_tx_t *tx, int asynch)
         /* local send */
         ltx = KSOCK_TX_2_KSOCK_LTX (tx);
 
-        lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie);
+        lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie,
+                      (tx->tx_resid == 0) ? PTL_OK : PTL_FAIL);
 
         ksocknal_free_ltx (ltx);
         EXIT;
@@ -694,17 +683,17 @@ ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
         LASSERT (rc < 0);
 
         if (!conn->ksnc_closing)
-                CERROR ("[%p] Error %d on write to "LPX64
-                        " ip %d.%d.%d.%d:%d\n",conn, rc, 
-                        conn->ksnc_peer->ksnp_nid,
-                        HIPQUAD(conn->ksnc_ipaddr),
-                        conn->ksnc_port);
+                CERROR("[%p] Error %d on write to "LPX64
+                       " ip %d.%d.%d.%d:%d\n", conn, rc,
+                       conn->ksnc_peer->ksnp_nid,
+                       HIPQUAD(conn->ksnc_ipaddr),
+                       conn->ksnc_port);
 
         ksocknal_close_conn_and_siblings (conn, rc);
         ksocknal_tx_launched (tx);
-        
+
         return (rc);
-} 
+}
 
 void
 ksocknal_launch_autoconnect_locked (ksock_route_t *route)
@@ -742,21 +731,21 @@ ksocknal_find_target_peer_locked (ksock_tx_t *tx, ptl_nid_t nid)
         ptl_nid_t     target_nid;
         int           rc;
         ksock_peer_t *peer = ksocknal_find_peer_locked (nid);
-        
+
         if (peer != NULL)
                 return (peer);
-        
+
         if (tx->tx_isfwd) {
                 CERROR ("Can't send packet to "LPX64
-                        " %s: routed target is not a peer\n", 
+                       " %s: routed target is not a peer\n",
                         nid, portals_nid2str(SOCKNAL, nid, ipbuf));
                 return (NULL);
         }
-        
+
         rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, tx->tx_nob,
                          &target_nid);
         if (rc != 0) {
-                CERROR ("Can't route to "LPX64" %s: router error %d\n", 
+                CERROR ("Can't route to "LPX64" %s: router error %d\n",
                         nid, portals_nid2str(SOCKNAL, nid, ipbuf), rc);
                 return (NULL);
         }
@@ -1018,7 +1007,7 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid)
         return (-EHOSTUNREACH);
 }
 
-int
+ptl_err_t
 ksocknal_sendmsg(nal_cb_t     *nal, 
                  void         *private, 
                  lib_msg_t    *cookie,
@@ -1029,6 +1018,7 @@ ksocknal_sendmsg(nal_cb_t     *nal,
                  unsigned int  payload_niov, 
                  struct iovec *payload_iov, 
                  ptl_kiov_t   *payload_kiov,
+                 size_t        payload_offset,
                  size_t        payload_nob)
 {
         ksock_ltx_t  *ltx;
@@ -1091,20 +1081,19 @@ ksocknal_sendmsg(nal_cb_t     *nal,
                 ltx->ltx_tx.tx_kiov  = NULL;
                 ltx->ltx_tx.tx_nkiov = 0;
 
-                ltx->ltx_tx.tx_niov = 1 + payload_niov;
-
-                memcpy(ltx->ltx_iov + 1, payload_iov,
-                       payload_niov * sizeof (*payload_iov));
-
+                ltx->ltx_tx.tx_niov = 
+                        1 + lib_extract_iov(payload_niov, &ltx->ltx_iov[1],
+                                            payload_niov, payload_iov,
+                                            payload_offset, payload_nob);
         } else {
                 /* payload is all pages */
-                ltx->ltx_tx.tx_kiov = ltx->ltx_kiov;
-                ltx->ltx_tx.tx_nkiov = payload_niov;
-
                 ltx->ltx_tx.tx_niov = 1;
 
-                memcpy(ltx->ltx_kiov, payload_kiov, 
-                       payload_niov * sizeof (*payload_kiov));
+                ltx->ltx_tx.tx_kiov = ltx->ltx_kiov;
+                ltx->ltx_tx.tx_nkiov =
+                        lib_extract_kiov(payload_niov, ltx->ltx_kiov,
+                                         payload_niov, payload_kiov,
+                                         payload_offset, payload_nob);
         }
 
         rc = ksocknal_launch_packet(&ltx->ltx_tx, nid);
@@ -1115,28 +1104,28 @@ ksocknal_sendmsg(nal_cb_t     *nal,
         return (PTL_FAIL);
 }
 
-int
+ptl_err_t
 ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie,
                ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
                unsigned int payload_niov, struct iovec *payload_iov,
-               size_t payload_len)
+               size_t payload_offset, size_t payload_len)
 {
         return (ksocknal_sendmsg(nal, private, cookie,
                                  hdr, type, nid, pid,
                                  payload_niov, payload_iov, NULL,
-                                 payload_len));
+                                 payload_offset, payload_len));
 }
 
-int
+ptl_err_t
 ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie, 
                      ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
                      unsigned int payload_niov, ptl_kiov_t *payload_kiov, 
-                     size_t payload_len)
+                     size_t payload_offset, size_t payload_len)
 {
         return (ksocknal_sendmsg(nal, private, cookie,
                                  hdr, type, nid, pid,
                                  payload_niov, NULL, payload_kiov,
-                                 payload_len));
+                                 payload_offset, payload_len));
 }
 
 void
@@ -1208,7 +1197,7 @@ ksocknal_fmb_callback (void *arg, int error)
 
         /* drop peer ref taken on init */
         ksocknal_put_peer (fmb->fmb_peer);
-        
+
         spin_lock_irqsave (&fmp->fmp_lock, flags);
 
         list_add (&fmb->fmb_list, &fmp->fmp_idle_fmbs);
@@ -1591,7 +1580,7 @@ ksocknal_process_receive (ksock_conn_t *conn)
 
         case SOCKNAL_RX_BODY:
                 /* payload all received */
-                lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie);
+                lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_OK);
                 /* Fall through */
 
         case SOCKNAL_RX_SLOP:
@@ -1627,9 +1616,10 @@ ksocknal_process_receive (ksock_conn_t *conn)
         return (-EINVAL);                       /* keep gcc happy */
 }
 
-int
+ptl_err_t
 ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg,
-               unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen)
+               unsigned int niov, struct iovec *iov, 
+               size_t offset, size_t mlen, size_t rlen)
 {
         ksock_conn_t *conn = (ksock_conn_t *)private;
 
@@ -1642,20 +1632,22 @@ ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg,
 
         conn->ksnc_rx_nkiov = 0;
         conn->ksnc_rx_kiov = NULL;
-        conn->ksnc_rx_niov = niov;
         conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov;
-        memcpy (conn->ksnc_rx_iov, iov, niov * sizeof (*iov));
+        conn->ksnc_rx_niov =
+                lib_extract_iov(PTL_MD_MAX_IOV, conn->ksnc_rx_iov,
+                                niov, iov, offset, mlen);
 
         LASSERT (mlen == 
                  lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
                  lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
 
-        return (rlen);
+        return (PTL_OK);
 }
 
-int
+ptl_err_t
 ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg,
-                     unsigned int niov, ptl_kiov_t *kiov, size_t mlen, size_t rlen)
+                     unsigned int niov, ptl_kiov_t *kiov, 
+                     size_t offset, size_t mlen, size_t rlen)
 {
         ksock_conn_t *conn = (ksock_conn_t *)private;
 
@@ -1668,15 +1660,16 @@ ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg,
 
         conn->ksnc_rx_niov = 0;
         conn->ksnc_rx_iov  = NULL;
-        conn->ksnc_rx_nkiov = niov;
         conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov;
-        memcpy (conn->ksnc_rx_kiov, kiov, niov * sizeof (*kiov));
+        conn->ksnc_rx_nkiov = 
+                lib_extract_kiov(PTL_MD_MAX_IOV, conn->ksnc_rx_kiov,
+                                 niov, kiov, offset, mlen);
 
         LASSERT (mlen == 
                  lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
                  lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
 
-        return (rlen);
+        return (PTL_OK);
 }
 
 int ksocknal_scheduler (void *arg)
@@ -2064,7 +2057,7 @@ ksocknal_hello (struct socket *sock, ptl_nid_t *nid, int *type, __u64 *incarnati
                         rc, *nid, portals_nid2str(SOCKNAL, *nid, ipbuf));
                 return (rc);
         }
-        
+
         if (hmv->magic != __le32_to_cpu (PORTALS_PROTO_MAGIC)) {
                 CERROR ("Bad magic %#08x (%#08x expected) from "LPX64" %s\n",
                         __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC, *nid,
@@ -2118,7 +2111,7 @@ ksocknal_hello (struct socket *sock, ptl_nid_t *nid, int *type, __u64 *incarnati
         } else if (*nid != __le64_to_cpu (hdr.src_nid)) {
                 CERROR ("Connected to nid "LPX64" %s, but expecting "LPX64" %s\n",
                         __le64_to_cpu (hdr.src_nid),
-                        portals_nid2str(SOCKNAL, 
+                        portals_nid2str(SOCKNAL,
                                         __le64_to_cpu(hdr.src_nid),
                                         ipbuf),
                         *nid, portals_nid2str(SOCKNAL, *nid, ipbuf));
@@ -2139,7 +2132,7 @@ ksocknal_hello (struct socket *sock, ptl_nid_t *nid, int *type, __u64 *incarnati
                         *type = SOCKNAL_CONN_BULK_IN;
                         break;
                 default:
-                        CERROR ("Unexpected type %d from "LPX64" %s\n", 
+                        CERROR ("Unexpected type %d from "LPX64" %s\n",
                                 *type, *nid,
                                 portals_nid2str(SOCKNAL, *nid, ipbuf));
                         return (-EPROTO);
@@ -2346,8 +2339,8 @@ ksocknal_connect_peer (ksock_route_t *route, int type)
         if (rc != 0) {
                 CERROR ("Error %d connecting to "LPX64" %s\n", rc,
                         route->ksnr_peer->ksnp_nid,
-                        portals_nid2str(SOCKNAL, 
-                                        route->ksnr_peer->ksnp_nid, 
+                        portals_nid2str(SOCKNAL,
+                                        route->ksnr_peer->ksnp_nid,
                                         ipbuf));
                 goto out;
         }
@@ -2432,7 +2425,7 @@ ksocknal_autoconnect (ksock_route_t *route)
         while (!list_empty (&zombies)) {
                 char ipbuf[PTL_NALFMT_SIZE];
                 tx = list_entry (zombies.next, ksock_tx_t, tx_list);
-                
+
                 CERROR ("Deleting packet type %d len %d ("LPX64" %s->"LPX64" %s)\n",
                         NTOH__u32 (tx->tx_hdr->type),
                         NTOH__u32 (tx->tx_hdr->payload_length),
@@ -2719,7 +2712,6 @@ nal_cb_t ksocknal_lib = {
         cb_recv_pages:   ksocknal_recv_pages,
         cb_read:         ksocknal_read,
         cb_write:        ksocknal_write,
-        cb_callback:     ksocknal_callback,
         cb_malloc:       ksocknal_malloc,
         cb_free:         ksocknal_free,
         cb_printf:       ksocknal_printf,
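
Rather than memcpy()ing the caller's whole iov/kiov array, the socknal paths above call lib_extract_iov()/lib_extract_kiov() (implemented in lib-move.c further down) to clip the source fragments to the (offset, len) window inside a bounded destination array. A standalone sketch of that behaviour, assuming the destination always has room, which the real code asserts:

#include <stddef.h>
#include <sys/uio.h>

/* Illustrative copy, not the library routine: fill 'dst' with at most
 * 'dst_niov' fragments describing bytes [offset, offset+len) of 'src',
 * and return how many destination fragments were used. */
static int
extract_iov_sketch(int dst_niov, struct iovec *dst,
                   int src_niov, const struct iovec *src,
                   size_t offset, size_t len)
{
        int n = 0;

        if (len == 0)
                return 0;

        while (offset >= src->iov_len) {        /* skip whole fragments */
                offset -= src->iov_len;
                src++;
                src_niov--;
        }

        while (len > 0 && n < dst_niov && src_niov > 0) {
                size_t frag = src->iov_len - offset;

                if (frag > len)
                        frag = len;
                dst[n].iov_base = (char *)src->iov_base + offset;
                dst[n].iov_len  = frag;
                n++;
                len -= frag;
                src++;
                src_niov--;
                offset = 0;
        }
        return n;
}

ksocknal_recv() and ksocknal_recv_pages() use the returned count as the new rx fragment count, capped at PTL_MD_MAX_IOV.
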
index 2768c8d..2f5a852 100644 (file)
@@ -812,9 +812,11 @@ EXPORT_SYMBOL(PtlMDBind);
 EXPORT_SYMBOL(lib_iov_nob);
 EXPORT_SYMBOL(lib_copy_iov2buf);
 EXPORT_SYMBOL(lib_copy_buf2iov);
+EXPORT_SYMBOL(lib_extract_iov);
 EXPORT_SYMBOL(lib_kiov_nob);
 EXPORT_SYMBOL(lib_copy_kiov2buf);
 EXPORT_SYMBOL(lib_copy_buf2kiov);
+EXPORT_SYMBOL(lib_extract_kiov);
 EXPORT_SYMBOL(lib_finalize);
 EXPORT_SYMBOL(lib_parse);
 EXPORT_SYMBOL(lib_fake_reply_msg);
index 8c03749..d17db61 100644 (file)
@@ -6,5 +6,9 @@
 
 CPPFLAGS=
 INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include
-lib_LIBRARIES= libportals.a
+noinst_LIBRARIES= libportals.a
 libportals_a_SOURCES= api-eq.c api-init.c api-me.c api-errno.c api-ni.c api-wrap.c lib-dispatch.c lib-init.c lib-me.c lib-msg.c lib-eq.c lib-md.c lib-move.c lib-ni.c lib-pid.c
+
+if LIBLUSTRE
+libportals_a_CFLAGS= -fPIC
+endif
index 9bc9c36..964b9d8 100644 (file)
@@ -81,12 +81,6 @@ int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t * ev)
 
         *ev = *new_event;
 
-        /* Set the unlinked_me interface number if there is one to pass
-         * back, since the NAL hasn't a clue what it is and therefore can't
-         * set it. */
-        if (!PtlHandleEqual (ev->unlinked_me, PTL_HANDLE_NONE))
-                ev->unlinked_me.nal_idx = eventq.nal_idx;
-        
         /* ensure event is delivered correctly despite possible 
            races with lib_finalize */
         if (eq->sequence != new_event->sequence) {
@@ -119,6 +113,7 @@ int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out)
 }
 
 #ifndef __KERNEL__
+#if 0
 static jmp_buf eq_jumpbuf;
 
 static void eq_timeout(int signal)
@@ -162,6 +157,46 @@ int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out,
 
         return rc;
 }
+#else
+#include <errno.h>
 
-#endif
+/* FIXME
+ * Here the timeout needs a trick with tcpnal; definitely unclean, but OK
+ * for the moment.
+ */
+
+/* global variables defined by tcpnal */
+extern int __tcpnal_eqwait_timeout_value;
+extern int __tcpnal_eqwait_timedout;
+
+int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out,
+                      int timeout)
+{
+        int rc;
 
+        if (!timeout)
+                return PtlEQWait(eventq_in, event_out);
+
+        __tcpnal_eqwait_timeout_value = timeout;
+
+        while ((rc = PtlEQGet(eventq_in, event_out)) == PTL_EQ_EMPTY) {
+                nal_t *nal = ptl_hndl2nal(&eventq_in);
+                
+                if (nal->yield)
+                        nal->yield(nal);
+
+                if (__tcpnal_eqwait_timedout) {
+                        if (__tcpnal_eqwait_timedout != ETIMEDOUT)
+                                printf("Warning: yield returned error %d\n",
+                                        __tcpnal_eqwait_timedout);
+                        rc = PTL_EQ_EMPTY;
+                        break;
+                }
+        }
+
+        __tcpnal_eqwait_timeout_value = 0;
+
+        return rc;
+}
+#endif
+#endif /* __KERNEL__ */
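
With the tcpnal-backed implementation above, an expired timeout surfaces to the caller as PTL_EQ_EMPTY instead of arriving via a signal and longjmp. A hedged, application-side sketch (the header path and wrapper are assumptions, not part of the patch; the timeout units are whatever tcpnal gives __tcpnal_eqwait_timeout_value):

#include <portals/p30.h>        /* assumed install path of the Portals API */

/* Wait for one event or a timeout; returns 1 if an event was delivered
 * into *ev, 0 if the wait timed out. */
static int
wait_one_event(ptl_handle_eq_t eq, ptl_event_t *ev, int timeout)
{
        int rc = PtlEQWait_timeout(eq, ev, timeout);

        if (rc == PTL_EQ_EMPTY)
                return 0;
        return 1;
}
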
index 026c93b..b5e7aa1 100644 (file)
@@ -50,6 +50,5 @@ const char *ptl_err_str[] = {
         "PTL_IOV_TOO_SMALL",
 
         "PTL_EQ_INUSE",
-        "PTL_MD_INUSE"
 };
 /* If you change these, you must update the number table in portals/errno.h */
index b2e069e..18eea91 100644 (file)
@@ -125,7 +125,7 @@ int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size,
                 if (ptl_interfaces[i] == nal) {
                         nal->refct++;
                         handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | i;
-                        fprintf(stderr, "Returning existing NAL (%d)\n", i);
+                        CDEBUG(D_OTHER, "Returning existing NAL (%d)\n", i);
                         ptl_ni_init_mutex_exit ();
                         return PTL_OK;
                 }
index e54707f..d23a6aa 100644 (file)
@@ -32,7 +32,7 @@ static int do_forward(ptl_handle_any_t any_h, int cmd, void *argbuf,
         nal_t *nal;
 
         if (!ptl_init) {
-                fprintf(stderr, "PtlGetId: Not initialized\n");
+                CERROR("Not initialized\n");
                 return PTL_NOINIT;
         }
 
@@ -262,7 +262,7 @@ static int validate_md(ptl_handle_any_t current_in, ptl_md_t md_in)
         int i;
 
         if (!ptl_init) {
-                fprintf(stderr, "PtlMDAttach/Bind/Update: Not initialized\n");
+                CERROR("PtlMDAttach/Bind/Update: Not initialized\n");
                 return PTL_NOINIT;
         }
 
index 0765498..d4d8860 100644 (file)
 # include <sys/time.h>
 #endif
 
-#ifndef PTL_USE_DESC_LISTS
-static int ptl_slab_users;
-
-atomic_t md_in_use_count = ATOMIC_INIT(0);
-atomic_t msg_in_use_count = ATOMIC_INIT(0);
-atomic_t me_in_use_count = ATOMIC_INIT(0);
-atomic_t eq_in_use_count = ATOMIC_INIT(0);
+#ifndef PTL_USE_LIB_FREELIST
 
 int
 kportal_descriptor_setup (nal_cb_t *nal)
 {
-        ptl_slab_users++;
-        RETURN(PTL_OK);
+        return PTL_OK;
 }
 
 void
 kportal_descriptor_cleanup (nal_cb_t *nal)
 {
-        if (--ptl_slab_users != 0)
-                return;
-
-        LASSERT (atomic_read (&md_in_use_count) == 0);
-        LASSERT (atomic_read (&me_in_use_count) == 0);
-        LASSERT (atomic_read (&eq_in_use_count) == 0);
-        LASSERT (atomic_read (&msg_in_use_count) == 0);
 }
 #else
 
index be6949c..a1ed583 100644 (file)
@@ -83,7 +83,7 @@ static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private,
         int           rc;
         int           i;
 
-        /* NB we are passes an allocated, but uninitialised/active md.
+        /* NB we are passed an allocated, but uninitialised/active md.
          * if we return success, caller may lib_md_unlink() it.
          * otherwise caller may only lib_md_free() it.
          */
@@ -94,9 +94,10 @@ static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private,
                         return PTL_INV_EQ;
         }
 
-        if ((md->options & PTL_MD_IOV) != 0 &&  /* discontiguous MD */
-            md->niov > PTL_MD_MAX_IOV)          /* too many fragments */
-                return PTL_IOV_TOO_MANY;
+        /* Must check this _before_ allocation.  Also, note that non-iov
+         * MDs must set md_niov to 0. */
+        LASSERT((md->options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0 ||
+                md->niov <= PTL_MD_MAX_IOV);
 
         if ((md->options & max_size_opts) != 0 && /* max size used */
             (md->max_size < 0 || md->max_size > md->length)) // illegal max_size
@@ -239,7 +240,11 @@ int do_PtlMDAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
         lib_md_t *md;
         unsigned long flags;
 
-        md = lib_md_alloc (nal);
+        if ((args->md_in.options & (PTL_MD_KIOV | PTL_MD_IOV)) != 0 &&
+            args->md_in.niov > PTL_MD_MAX_IOV) /* too many fragments */
+                return (ret->rc = PTL_IOV_TOO_MANY);
+
+        md = lib_md_alloc(nal, &args->md_in);
         if (md == NULL)
                 return (ret->rc = PTL_NOSPACE);
 
@@ -287,7 +292,11 @@ int do_PtlMDBind(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
         lib_md_t *md;
         unsigned long flags;
 
-        md = lib_md_alloc (nal);
+        if ((args->md_in.options & (PTL_MD_KIOV | PTL_MD_IOV)) != 0 &&
+            args->md_in.niov > PTL_MD_MAX_IOV) /* too many fragments */
+                return (ret->rc = PTL_IOV_TOO_MANY);
+
+        md = lib_md_alloc(nal, &args->md_in);
         if (md == NULL)
                 return (ret->rc = PTL_NOSPACE);
 
@@ -311,34 +320,43 @@ int do_PtlMDBind(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
 
 int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
 {
-        PtlMDUnlink_in *args = v_args;
+        PtlMDUnlink_in  *args = v_args;
         PtlMDUnlink_out *ret = v_ret;
-
-        lib_md_t *md;
-        unsigned long flags;
+        ptl_event_t      ev;
+        lib_md_t        *md;
+        unsigned long    flags;
 
         state_lock(nal, &flags);
 
         md = ptl_handle2md(&args->md_in, nal);
         if (md == NULL) {
-                ret->rc = PTL_INV_MD;
-        } else if (md->pending != 0) {           /* being filled/spilled */
-                ret->rc = PTL_MD_INUSE;
-        } else {
-                /* Callers attempting to unlink a busy MD which will get
-                 * unlinked once the net op completes should see INUSE,
-                 * before completion and INV_MD thereafter.  LASSERT we've
-                 * got that right... */
-                LASSERT ((md->md_flags & PTL_MD_FLAG_UNLINK) == 0);
-
-                lib_md_deconstruct(nal, md, &ret->status_out);
-                lib_md_unlink(nal, md);
-                ret->rc = PTL_OK;
+                state_unlock(nal, &flags);
+                return (ret->rc = PTL_INV_MD);
+        }
+
+        /* If the MD is busy, lib_md_unlink just marks it for deletion, and
+         * when the NAL is done, the completion event flags that the MD was
+         * unlinked.  Otherwise, we enqueue an event now... */
+
+        if (md->eq != NULL &&
+            md->pending == 0) {
+                memset(&ev, 0, sizeof(ev));
+
+                ev.type = PTL_EVENT_UNLINK;
+                ev.status = PTL_OK;
+                ev.unlinked = 1;
+                lib_md_deconstruct(nal, md, &ev.mem_desc);
+                
+                lib_enq_event_locked(nal, private, md->eq, &ev);
         }
 
+        lib_md_deconstruct(nal, md, &ret->status_out);
+        lib_md_unlink(nal, md);
+        ret->rc = PTL_OK;
+
         state_unlock(nal, &flags);
 
-        return (ret->rc);
+        return (PTL_OK);
 }
 
 int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args,
@@ -379,6 +397,23 @@ int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args,
                 goto out;
         }
 
+        /* XXX fttb, the new MD must be the same type wrt fragmentation */
+        if (((new->options ^ md->options) & 
+             (PTL_MD_IOV | PTL_MD_KIOV)) != 0) {
+                ret->rc = PTL_INV_MD;
+                goto out;
+        }
+
+        if (new->niov > md->md_niov) {
+                ret->rc = PTL_IOV_TOO_MANY;
+                goto out;
+        } 
+
+        if (new->niov < md->md_niov) {
+                ret->rc = PTL_IOV_TOO_SMALL;
+                goto out;
+        }
+
         if (!PtlHandleEqual (args->testq_in, PTL_EQ_NONE)) {
                 test_eq = ptl_handle2eq(&args->testq_in, nal);
                 if (test_eq == NULL) {
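
Together with the removal of PTL_MD_INUSE in api-errno.c above, do_PtlMDUnlink() now always succeeds: an idle MD gets an immediate PTL_EVENT_UNLINK, while a busy one is only marked and its completion event carries ev.unlinked. A hedged consumer-side sketch (hypothetical handler, assumed header path) of what replaces the old retry-on-INUSE pattern:

#include <portals/p30.h>        /* assumed install path of ptl_event_t */

/* Callers no longer poll PtlMDUnlink() until it stops reporting "in use";
 * they learn the MD is gone from the event stream itself. */
static void
handle_event(ptl_event_t *ev)
{
        if (ev->type == PTL_EVENT_UNLINK || ev->unlinked) {
                /* the MD described by ev->mem_desc is detached; release
                 * any buffers or bookkeeping tied to it here */
        }
}
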
index d844a7a..ecd543c 100644 (file)
@@ -258,55 +258,78 @@ lib_iov_nob (int niov, struct iovec *iov)
 }
 
 void
-lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len)
+lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, 
+                  ptl_size_t offset, ptl_size_t len)
 {
         ptl_size_t nob;
 
-        while (len > 0)
-        {
+        if (len == 0)
+                return;
+        
+        /* skip complete frags before 'offset' */
+        LASSERT (niov > 0);
+        while (offset >= iov->iov_len) {
+                offset -= iov->iov_len;
+                iov++;
+                niov--;
+                LASSERT (niov > 0);
+        }
+                
+        do {
                 LASSERT (niov > 0);
-                nob = MIN (iov->iov_len, len);
-                memcpy (dest, iov->iov_base, nob);
+                nob = MIN (iov->iov_len - offset, len);
+                memcpy (dest, iov->iov_base + offset, nob);
 
                 len -= nob;
                 dest += nob;
                 niov--;
                 iov++;
-        }
+                offset = 0;
+        } while (len > 0);
 }
 
 void
-lib_copy_buf2iov (int niov, struct iovec *iov, char *src, ptl_size_t len)
+lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset, 
+                  char *src, ptl_size_t len)
 {
         ptl_size_t nob;
 
-        while (len > 0)
-        {
+        if (len == 0)
+                return;
+
+        /* skip complete frags before 'offset' */
+        LASSERT (niov > 0);
+        while (offset >= iov->iov_len) {
+                offset -= iov->iov_len;
+                iov++;
+                niov--;
                 LASSERT (niov > 0);
-                nob = MIN (iov->iov_len, len);
-                memcpy (iov->iov_base, src, nob);
+        }
+        
+        do {
+                LASSERT (niov > 0);
+                nob = MIN (iov->iov_len - offset, len);
+                memcpy (iov->iov_base + offset, src, nob);
                 
                 len -= nob;
                 src += nob;
                 niov--;
                 iov++;
-        }
+                offset = 0;
+        } while (len > 0);
 }
 
-static int
-lib_extract_iov (struct iovec *dst, lib_md_t *md,
+int
+lib_extract_iov (int dst_niov, struct iovec *dst,
+                 int src_niov, struct iovec *src,
                  ptl_size_t offset, ptl_size_t len)
 {
         /* Initialise 'dst' to the subset of 'src' starting at 'offset',
          * for exactly 'len' bytes, and return the number of entries.
          * NB not destructive to 'src' */
-        int             src_niov = md->md_niov;  
-        struct iovec   *src = md->md_iov.iov;
         ptl_size_t      frag_len;
-        int             dst_niov;
+        int             niov;
 
-        LASSERT (offset + len <= md->length);
-        
         if (len == 0)                           /* no data => */
                 return (0);                     /* no frags */
 
@@ -318,17 +341,17 @@ lib_extract_iov (struct iovec *dst, lib_md_t *md,
                 LASSERT (src_niov > 0);
         }
 
-        dst_niov = 1;
+        niov = 1;
         for (;;) {
                 LASSERT (src_niov > 0);
-                LASSERT (dst_niov <= PTL_MD_MAX_IOV);
+                LASSERT (niov <= dst_niov);
                 
                 frag_len = src->iov_len - offset;
                 dst->iov_base = ((char *)src->iov_base) + offset;
 
                 if (len <= frag_len) {
                         dst->iov_len = len;
-                        return (dst_niov);
+                        return (niov);
                 }
                 
                 dst->iov_len = frag_len;
@@ -336,7 +359,7 @@ lib_extract_iov (struct iovec *dst, lib_md_t *md,
                 len -= frag_len;
                 dst++;
                 src++;
-                dst_niov++;
+                niov++;
                 src_niov--;
                 offset = 0;
         }
@@ -351,19 +374,22 @@ lib_kiov_nob (int niov, ptl_kiov_t *kiov)
 }
 
 void
-lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len)
+lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, 
+                   ptl_size_t offset, ptl_size_t len)
 {
         LASSERT (0);
 }
 
 void
-lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *dest, ptl_size_t len)
+lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset,
+                   char *src, ptl_size_t len)
 {
         LASSERT (0);
 }
 
-static int
-lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
+int
+lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, 
+                  int src_niov, ptl_kiov_t *src,
                   ptl_size_t offset, ptl_size_t len)
 {
         LASSERT (0);
@@ -383,18 +409,30 @@ lib_kiov_nob (int niov, ptl_kiov_t *kiov)
 }
 
 void
-lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len)
+lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, 
+                   ptl_size_t offset, ptl_size_t len)
 {
         ptl_size_t  nob;
         char       *addr;
+
+        if (len == 0)
+                return;
         
         LASSERT (!in_interrupt ());
-        while (len > 0)
-        {
+
+        LASSERT (niov > 0);
+        while (offset >= kiov->kiov_len) {
+                offset -= kiov->kiov_len;
+                kiov++;
+                niov--;
+                LASSERT (niov > 0);
+        }
+        
+        do {
                 LASSERT (niov > 0);
-                nob = MIN (kiov->kiov_len, len);
+                nob = MIN (kiov->kiov_len - offset, len);
                 
-                addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+                addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset;
                 memcpy (dest, addr, nob);
                 kunmap (kiov->kiov_page);
                 
@@ -402,22 +440,35 @@ lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len)
                 dest += nob;
                 niov--;
                 kiov++;
-        }
+                offset = 0;
+        } while (len > 0);
 }
 
 void
-lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *src, ptl_size_t len)
+lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset,
+                   char *src, ptl_size_t len)
 {
         ptl_size_t  nob;
         char       *addr;
 
+        if (len == 0)
+                return;
+
         LASSERT (!in_interrupt ());
-        while (len > 0)
-        {
+
+        LASSERT (niov > 0);
+        while (offset >= kiov->kiov_len) {
+                offset -= kiov->kiov_len;
+                kiov++;
+                niov--;
+                LASSERT (niov > 0);
+        }
+        
+        do {
                 LASSERT (niov > 0);
-                nob = MIN (kiov->kiov_len, len);
+                nob = MIN (kiov->kiov_len - offset, len);
                 
-                addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+                addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset;
                 memcpy (addr, src, nob);
                 kunmap (kiov->kiov_page);
                 
@@ -425,23 +476,21 @@ lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *src, ptl_size_t len)
                 src += nob;
                 niov--;
                 kiov++;
-        }
+                offset = 0;
+        } while (len > 0);
 }
 
-static int
-lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
+int
+lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, 
+                  int src_niov, ptl_kiov_t *src,
                   ptl_size_t offset, ptl_size_t len)
 {
         /* Initialise 'dst' to the subset of 'src' starting at 'offset',
          * for exactly 'len' bytes, and return the number of entries.
          * NB not destructive to 'src' */
-        int             src_niov = md->md_niov;  
-        ptl_kiov_t     *src = md->md_iov.kiov;
         ptl_size_t      frag_len;
-        int             dst_niov;
+        int             niov;
 
-        LASSERT (offset + len <= md->length);
-        
         if (len == 0)                           /* no data => */
                 return (0);                     /* no frags */
 
@@ -453,10 +502,10 @@ lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
                 LASSERT (src_niov > 0);
         }
 
-        dst_niov = 1;
+        niov = 1;
         for (;;) {
                 LASSERT (src_niov > 0);
-                LASSERT (dst_niov <= PTL_MD_MAX_IOV);
+                LASSERT (niov <= dst_niov);
                 
                 frag_len = src->kiov_len - offset;
                 dst->kiov_page = src->kiov_page;
@@ -465,7 +514,7 @@ lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
                 if (len <= frag_len) {
                         dst->kiov_len = len;
                         LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE);
-                        return (dst_niov);
+                        return (niov);
                 }
 
                 dst->kiov_len = frag_len;
@@ -474,73 +523,66 @@ lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
                 len -= frag_len;
                 dst++;
                 src++;
-                dst_niov++;
+                niov++;
                 src_niov--;
                 offset = 0;
         }
 }
 #endif
 
-void
+ptl_err_t
 lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
           ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen)
 {
-        int   niov;
-
         if (mlen == 0)
-                nal->cb_recv (nal, private, msg, 0, NULL, 0, rlen);
-        else if ((md->options & PTL_MD_KIOV) == 0) {
-                niov = lib_extract_iov (msg->msg_iov.iov, md, offset, mlen);
-                nal->cb_recv (nal, private, msg,
-                              niov, msg->msg_iov.iov, mlen, rlen);
-        } else {
-                niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, mlen);
-                nal->cb_recv_pages (nal, private, msg, 
-                                    niov, msg->msg_iov.kiov, mlen, rlen);
-        }
+                return (nal->cb_recv(nal, private, msg,
+                                     0, NULL,
+                                     offset, mlen, rlen));
+
+        if ((md->options & PTL_MD_KIOV) == 0)
+                return (nal->cb_recv(nal, private, msg,
+                                     md->md_niov, md->md_iov.iov, 
+                                     offset, mlen, rlen));
+
+        return (nal->cb_recv_pages(nal, private, msg, 
+                                   md->md_niov, md->md_iov.kiov,
+                                   offset, mlen, rlen));
 }
 
-int
+ptl_err_t
 lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
           ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
           lib_md_t *md, ptl_size_t offset, ptl_size_t len) 
 {
-        int   niov;
-
         if (len == 0)
-                return (nal->cb_send (nal, private, msg, 
-                                      hdr, type, nid, pid,
-                                      0, NULL, 0));
+                return (nal->cb_send(nal, private, msg,
+                                     hdr, type, nid, pid,
+                                     0, NULL,
+                                     offset, len));
         
-        if ((md->options & PTL_MD_KIOV) == 0) {
-                niov = lib_extract_iov (msg->msg_iov.iov, md, offset, len);
-                return (nal->cb_send (nal, private, msg, 
-                                      hdr, type, nid, pid,
-                                      niov, msg->msg_iov.iov, len));
-        }
-
-        niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, len);
-        return (nal->cb_send_pages (nal, private, msg, 
-                                    hdr, type, nid, pid,
-                                    niov, msg->msg_iov.kiov, len));
+        if ((md->options & PTL_MD_KIOV) == 0)
+                return (nal->cb_send(nal, private, msg, 
+                                     hdr, type, nid, pid,
+                                     md->md_niov, md->md_iov.iov,
+                                     offset, len));
+
+        return (nal->cb_send_pages(nal, private, msg, 
+                                   hdr, type, nid, pid,
+                                   md->md_niov, md->md_iov.kiov,
+                                   offset, len));
 }
 
-static lib_msg_t *
-get_new_msg (nal_cb_t *nal, lib_md_t *md)
+static void
+lib_commit_md (nal_cb_t *nal, lib_md_t *md, lib_msg_t *msg)
 {
         /* ALWAYS called holding the state_lock */
         lib_counters_t *counters = &nal->ni.counters;
-        lib_msg_t      *msg      = lib_msg_alloc (nal);
-
-        if (msg == NULL)
-                return (NULL);
-
-        memset (msg, 0, sizeof (*msg));
-
-        msg->send_ack = 0;
 
+        /* Here, we commit the MD to a network OP by marking it busy and
+         * decrementing its threshold.  Come what may, the network "owns"
+         * the MD until a call to lib_finalize() signals completion. */
         msg->md = md;
-        do_gettimeofday(&msg->ev.arrival_time);
+         
         md->pending++;
         if (md->threshold != PTL_MD_THRESH_INF) {
                 LASSERT (md->threshold > 0);
@@ -552,8 +594,24 @@ get_new_msg (nal_cb_t *nal, lib_md_t *md)
                 counters->msgs_max = counters->msgs_alloc;
 
         list_add (&msg->msg_list, &nal->ni.ni_active_msgs);
+}
 
-        return (msg);
+static void
+lib_drop_message (nal_cb_t *nal, void *private, ptl_hdr_t *hdr)
+{
+        unsigned long flags;
+
+        /* CAVEAT EMPTOR: this only drops messages that we've not committed
+         * to receive (init_msg() not called) and therefore can't cause an
+         * event. */
+        
+        state_lock(nal, &flags);
+        nal->ni.counters.drop_count++;
+        nal->ni.counters.drop_length += hdr->payload_length;
+        state_unlock(nal, &flags);
+
+        /* NULL msg => if NAL calls lib_finalize it will be a noop */
+        (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length);
 }
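
lib_commit_md() and lib_drop_message() split what get_new_msg() used to do: committing a matched MD to a network operation versus accounting for a message that will never be received. The NULL-msg convention matters to NAL authors; a sketch of the receive-side contract (hypothetical helper, types as in the portals lib headers, sink_and_discard() invented for illustration):

/* When lib_recv() passes msg == NULL (the drop/discard path), the NAL is
 * being asked to consume and discard 'rlen' bytes of payload; any
 * lib_finalize() it later issues against that NULL msg is a no-op. */
static ptl_err_t
nal_recv_sketch(nal_cb_t *nal, void *private, lib_msg_t *msg,
                unsigned int niov, struct iovec *iov,
                size_t offset, size_t mlen, size_t rlen)
{
        if (msg == NULL)
                return sink_and_discard(private, rlen); /* hypothetical */

        /* normal path: copy mlen bytes into iov starting at 'offset',
         * then lib_finalize(nal, private, msg, PTL_OK) when done */
        return PTL_OK;
}
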
 
 /*
@@ -563,17 +621,18 @@ get_new_msg (nal_cb_t *nal, lib_md_t *md)
  * of long messages.
  *
  */
-static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+static ptl_err_t
+parse_put(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
 {
         lib_ni_t        *ni = &nal->ni;
         ptl_size_t       mlength = 0;
         ptl_size_t       offset = 0;
         int              unlink = 0;
+        ptl_err_t        rc;
         lib_me_t        *me;
         lib_md_t        *md;
-        lib_msg_t       *msg;
         unsigned long    flags;
-
+                
         /* Convert put fields to host byte order */
         hdr->msg.put.match_bits = NTOH__u64 (hdr->msg.put.match_bits);
         hdr->msg.put.ptl_index = NTOH__u32 (hdr->msg.put.ptl_index);
@@ -586,8 +645,10 @@ static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
                          hdr->payload_length, hdr->msg.put.offset,
                          hdr->msg.put.match_bits,
                          &mlength, &offset, &unlink);
-        if (me == NULL)
-                goto drop;
+        if (me == NULL) {
+                state_unlock(nal, &flags);
+                return (PTL_FAIL);
+        }
 
         md = me->md;
         CDEBUG(D_NET, "Incoming put index %x from "LPU64"/%u of length %d/%d "
@@ -595,69 +656,46 @@ static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
                hdr->src_nid, hdr->src_pid, mlength, hdr->payload_length, 
                md->md_lh.lh_cookie, md->md_niov, offset);
 
-        msg = get_new_msg (nal, md);
-        if (msg == NULL) {
-                CERROR(LPU64": Dropping PUT from "LPU64": can't allocate msg\n",
-                       ni->nid, hdr->src_nid);
-                goto drop;
-        }
+        lib_commit_md(nal, md, msg);
+
+        msg->ev.type = PTL_EVENT_PUT;
+        msg->ev.initiator.nid = hdr->src_nid;
+        msg->ev.initiator.pid = hdr->src_pid;
+        msg->ev.portal = hdr->msg.put.ptl_index;
+        msg->ev.match_bits = hdr->msg.put.match_bits;
+        msg->ev.rlength = hdr->payload_length;
+        msg->ev.mlength = mlength;
+        msg->ev.offset = offset;
+        msg->ev.hdr_data = hdr->msg.put.hdr_data;
+
+        lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
 
         if (!ptl_is_wire_handle_none(&hdr->msg.put.ack_wmd) &&
             !(md->options & PTL_MD_ACK_DISABLE)) {
-                msg->send_ack = 1;
                 msg->ack_wmd = hdr->msg.put.ack_wmd;
-                msg->nid = hdr->src_nid;
-                msg->pid = hdr->src_pid;
-                msg->ev.match_bits = hdr->msg.put.match_bits;
-        }
-
-        if (md->eq) {
-                msg->ev.type = PTL_EVENT_PUT;
-                msg->ev.initiator.nid = hdr->src_nid;
-                msg->ev.initiator.pid = hdr->src_pid;
-                msg->ev.portal = hdr->msg.put.ptl_index;
-                msg->ev.match_bits = hdr->msg.put.match_bits;
-                msg->ev.rlength = hdr->payload_length;
-                msg->ev.mlength = mlength;
-                msg->ev.offset = offset;
-                msg->ev.hdr_data = hdr->msg.put.hdr_data;
-
-                /* NB if this match has exhausted the MD, we can't be sure
-                 * that this event will the the last one associated with
-                 * this MD in the event queue (another message already
-                 * matching this ME/MD could end up being last).  So we
-                 * remember the ME handle anyway and check again when we're
-                 * allocating our slot in the event queue.
-                 */
-                ptl_me2handle (&msg->ev.unlinked_me, me);
-
-                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
         }
 
         ni->counters.recv_count++;
         ni->counters.recv_length += mlength;
 
-        /* only unlink after MD's pending count has been bumped
-         * in get_new_msg() otherwise lib_me_unlink() will nuke it */
-        if (unlink) {
-                md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED;
+        /* only unlink after MD's pending count has been bumped in
+         * lib_commit_md() otherwise lib_me_unlink() will nuke it */
+        if (unlink)
                 lib_me_unlink (nal, me);
-        }
 
         state_unlock(nal, &flags);
 
-        lib_recv (nal, private, msg, md, offset, mlength, hdr->payload_length);
-        return 0;
+        rc = lib_recv(nal, private, msg, md, offset, mlength,
+                      hdr->payload_length);
+        if (rc != PTL_OK)
+                CERROR(LPU64": error on receiving PUT from "LPU64": %d\n",
+                       ni->nid, hdr->src_nid, rc);
 
- drop:
-        nal->ni.counters.drop_count++;
-        nal->ni.counters.drop_length += hdr->payload_length;
-        state_unlock (nal, &flags);
-        lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
-        return -1;
+        return (rc);
 }
 
-static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+static ptl_err_t
+parse_get(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
 {
         lib_ni_t        *ni = &nal->ni;
         ptl_size_t       mlength = 0;
@@ -665,7 +703,6 @@ static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
         int              unlink = 0;
         lib_me_t        *me;
         lib_md_t        *md;
-        lib_msg_t       *msg;
         ptl_hdr_t        reply;
         unsigned long    flags;
         int              rc;
@@ -683,8 +720,10 @@ static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
                          hdr->msg.get.sink_length, hdr->msg.get.src_offset,
                          hdr->msg.get.match_bits,
                          &mlength, &offset, &unlink);
-        if (me == NULL)
-                goto drop;
+        if (me == NULL) {
+                state_unlock(nal, &flags);
+                return (PTL_FAIL);
+        }
 
         md = me->md;
         CDEBUG(D_NET, "Incoming get index %d from "LPU64".%u of length %d/%d "
@@ -692,45 +731,27 @@ static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
                hdr->src_nid, hdr->src_pid, mlength, hdr->payload_length, 
                md->md_lh.lh_cookie, md->md_niov, offset);
 
-        msg = get_new_msg (nal, md);
-        if (msg == NULL) {
-                CERROR(LPU64": Dropping GET from "LPU64": can't allocate msg\n",
-                       ni->nid, hdr->src_nid);
-                goto drop;
-        }
+        lib_commit_md(nal, md, msg);
 
-        if (md->eq) {
-                msg->ev.type = PTL_EVENT_GET;
-                msg->ev.initiator.nid = hdr->src_nid;
-                msg->ev.initiator.pid = hdr->src_pid;
-                msg->ev.portal = hdr->msg.get.ptl_index;
-                msg->ev.match_bits = hdr->msg.get.match_bits;
-                msg->ev.rlength = hdr->payload_length;
-                msg->ev.mlength = mlength;
-                msg->ev.offset = offset;
-                msg->ev.hdr_data = 0;
-
-                /* NB if this match has exhausted the MD, we can't be sure
-                 * that this event will the the last one associated with
-                 * this MD in the event queue (another message already
-                 * matching this ME/MD could end up being last).  So we
-                 * remember the ME handle anyway and check again when we're
-                 * allocating our slot in the event queue.
-                 */
-                ptl_me2handle (&msg->ev.unlinked_me, me);
-
-                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
-        }
+        msg->ev.type = PTL_EVENT_GET;
+        msg->ev.initiator.nid = hdr->src_nid;
+        msg->ev.initiator.pid = hdr->src_pid;
+        msg->ev.portal = hdr->msg.get.ptl_index;
+        msg->ev.match_bits = hdr->msg.get.match_bits;
+        msg->ev.rlength = hdr->payload_length;
+        msg->ev.mlength = mlength;
+        msg->ev.offset = offset;
+        msg->ev.hdr_data = 0;
+
+        lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
 
         ni->counters.send_count++;
         ni->counters.send_length += mlength;
 
-        /* only unlink after MD's refcount has been bumped
-         * in get_new_msg() otherwise lib_me_unlink() will nuke it */
-        if (unlink) {
-                md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED;
+        /* only unlink after MD's refcount has been bumped in
+         * lib_commit_md() otherwise lib_me_unlink() will nuke it */
+        if (unlink)
                 lib_me_unlink (nal, me);
-        }
 
         state_unlock(nal, &flags);
 
@@ -749,36 +770,25 @@ static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
 
         rc = lib_send (nal, private, msg, &reply, PTL_MSG_REPLY, 
                        hdr->src_nid, hdr->src_pid, md, offset, mlength);
-        if (rc != PTL_OK) {
-                CERROR(LPU64": Dropping GET from "LPU64": send REPLY failed\n",
-                       ni->nid, hdr->src_nid);
-                /* Hmm, this will create a GET event and make believe
-                 * the reply completed, which it kind of did, only the
-                 * source won't get her reply */
-                lib_finalize (nal, private, msg);
-                state_lock (nal, &flags);
-                goto drop;
-        }
+        if (rc != PTL_OK)
+                CERROR(LPU64": Unable to send REPLY for GET from "LPU64": %d\n",
+                       ni->nid, hdr->src_nid, rc);
+
+        /* Discard any junk after the hdr */
+        (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length);
 
-        /* Complete the incoming message */
-        lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
         return (rc);
- drop:
-        ni->counters.drop_count++;
-        ni->counters.drop_length += hdr->msg.get.sink_length;
-        state_unlock(nal, &flags);
-        lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
-        return -1;
 }
 
-static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+static ptl_err_t
+parse_reply(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
 {
         lib_ni_t        *ni = &nal->ni;
         lib_md_t        *md;
         int              rlength;
         int              length;
-        lib_msg_t       *msg;
         unsigned long    flags;
+        ptl_err_t        rc;
 
         state_lock(nal, &flags);
 
@@ -790,7 +800,9 @@ static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
                         md == NULL ? "invalid" : "inactive",
                         hdr->msg.reply.dst_wmd.wh_interface_cookie,
                         hdr->msg.reply.dst_wmd.wh_object_cookie);
-                goto drop;
+
+                state_unlock(nal, &flags);
+                return (PTL_FAIL);
         }
 
         LASSERT (md->offset == 0);
@@ -804,7 +816,8 @@ static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
                                 ni->nid, hdr->src_nid, length,
                                 hdr->msg.reply.dst_wmd.wh_object_cookie,
                                 md->length);
-                        goto drop;
+                        state_unlock(nal, &flags);
+                        return (PTL_FAIL);
                 }
                 length = md->length;
         }
@@ -813,46 +826,36 @@ static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
                hdr->src_nid, length, rlength, 
                hdr->msg.reply.dst_wmd.wh_object_cookie);
 
-        msg = get_new_msg (nal, md);
-        if (msg == NULL) {
-                CERROR(LPU64": Dropping REPLY from "LPU64": can't "
-                       "allocate msg\n", ni->nid, hdr->src_nid);
-                goto drop;
-        }
+        lib_commit_md(nal, md, msg);
 
-        if (md->eq) {
-                msg->ev.type = PTL_EVENT_REPLY;
-                msg->ev.initiator.nid = hdr->src_nid;
-                msg->ev.initiator.pid = hdr->src_pid;
-                msg->ev.rlength = rlength;
-                msg->ev.mlength = length;
-                msg->ev.offset = 0;
+        msg->ev.type = PTL_EVENT_REPLY;
+        msg->ev.initiator.nid = hdr->src_nid;
+        msg->ev.initiator.pid = hdr->src_pid;
+        msg->ev.rlength = rlength;
+        msg->ev.mlength = length;
+        msg->ev.offset = 0;
 
-                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
-        }
+        lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
 
         ni->counters.recv_count++;
         ni->counters.recv_length += length;
 
         state_unlock(nal, &flags);
 
-        lib_recv (nal, private, msg, md, 0, length, rlength);
-        return 0;
+        rc = lib_recv(nal, private, msg, md, 0, length, rlength);
+        if (rc != PTL_OK)
+                CERROR(LPU64": error on receiving REPLY from "LPU64": %d\n",
+                       ni->nid, hdr->src_nid, rc);
 
- drop:
-        nal->ni.counters.drop_count++;
-        nal->ni.counters.drop_length += hdr->payload_length;
-        state_unlock (nal, &flags);
-        lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
-        return -1;
+        return (rc);
 }
 
-static int parse_ack(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+static ptl_err_t
+parse_ack(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
 {
-        lib_ni_t *ni = &nal->ni;
-        lib_md_t *md;
-        lib_msg_t *msg = NULL;
-        unsigned long flags;
+        lib_ni_t      *ni = &nal->ni;
+        lib_md_t      *md;
+        unsigned long  flags;
 
         /* Convert ack fields to host byte order */
         hdr->msg.ack.match_bits = NTOH__u64 (hdr->msg.ack.match_bits);
@@ -868,40 +871,37 @@ static int parse_ack(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
                        (md == NULL) ? "invalid" : "inactive",
                        hdr->msg.ack.dst_wmd.wh_interface_cookie,
                        hdr->msg.ack.dst_wmd.wh_object_cookie);
-                goto drop;
+
+                state_unlock(nal, &flags);
+                return (PTL_FAIL);
         }
 
         CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n",
                ni->nid, hdr->src_nid, 
                hdr->msg.ack.dst_wmd.wh_object_cookie);
 
-        msg = get_new_msg (nal, md);
-        if (msg == NULL) {
-                CERROR(LPU64": Dropping ACK from "LPU64": can't allocate msg\n",
-                       ni->nid, hdr->src_nid);
-                goto drop;
-        }
+        lib_commit_md(nal, md, msg);
 
-        if (md->eq) {
-                msg->ev.type = PTL_EVENT_ACK;
-                msg->ev.initiator.nid = hdr->src_nid;
-                msg->ev.initiator.pid = hdr->src_pid;
-                msg->ev.mlength = hdr->msg.ack.mlength;
-                msg->ev.match_bits = hdr->msg.ack.match_bits;
+        msg->ev.type = PTL_EVENT_ACK;
+        msg->ev.initiator.nid = hdr->src_nid;
+        msg->ev.initiator.pid = hdr->src_pid;
+        msg->ev.mlength = hdr->msg.ack.mlength;
+        msg->ev.match_bits = hdr->msg.ack.match_bits;
 
-                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
-        }
+        lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
 
         ni->counters.recv_count++;
-        state_unlock(nal, &flags);
-        lib_recv (nal, private, msg, NULL, 0, 0, hdr->payload_length);
-        return 0;
 
- drop:
-        nal->ni.counters.drop_count++;
-        state_unlock (nal, &flags);
-        lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
-        return -1;
+        state_unlock(nal, &flags);
+        
+        /* We have received and matched up the ack OK, create the
+         * completion event now... */
+        lib_finalize(nal, private, msg, PTL_OK);
+
+        /* ...and now discard any junk after the hdr */
+        (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length);
+       return (PTL_OK);
 }
 
 static char *
@@ -983,10 +983,13 @@ void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr)
 }                               /* end of print_hdr() */
 
 
-int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+void 
+lib_parse(nal_cb_t *nal, ptl_hdr_t *hdr, void *private)
 {
         unsigned long  flags;
-
+        ptl_err_t      rc;
+        lib_msg_t     *msg;
+        
         /* convert common fields to host byte order */
         hdr->dest_nid = NTOH__u64 (hdr->dest_nid);
         hdr->src_nid = NTOH__u64 (hdr->src_nid);
@@ -1008,22 +1011,16 @@ int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
                         nal->ni.nid, mv->magic, 
                         mv->version_major, mv->version_minor,
                         hdr->src_nid);
-                lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
-                return (-1);
+                lib_drop_message(nal, private, hdr);
+                return;
         }
         
         if (hdr->dest_nid != nal->ni.nid) {
                 CERROR(LPU64": Dropping %s message from "LPU64" to "LPU64
                        " (not me)\n", nal->ni.nid, hdr_type_string (hdr),
                        hdr->src_nid, hdr->dest_nid);
-
-                state_lock (nal, &flags);
-                nal->ni.counters.drop_count++;
-                nal->ni.counters.drop_length += hdr->payload_length;
-                state_unlock (nal, &flags);
-
-                lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
-                return (-1);
+                lib_drop_message(nal, private, hdr);
+                return;
         }
 
         if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
@@ -1033,34 +1030,59 @@ int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
                        ": simulated failure\n",
                        nal->ni.nid, hdr_type_string (hdr), 
                        hdr->src_nid);
-                lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
-                return (-1);
+                lib_drop_message(nal, private, hdr);
+                return;
         }
-        
+
+        msg = lib_msg_alloc(nal);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping incoming %s from "LPU64
+                       ": can't allocate a lib_msg_t\n",
+                       nal->ni.nid, hdr_type_string (hdr), 
+                       hdr->src_nid);
+                lib_drop_message(nal, private, hdr);
+                return;
+        }
+
+        do_gettimeofday(&msg->ev.arrival_time);
+
         switch (hdr->type) {
         case PTL_MSG_ACK:
-                return (parse_ack(nal, hdr, private));
+                rc = parse_ack(nal, hdr, private, msg);
+                break;
         case PTL_MSG_PUT:
-                return (parse_put(nal, hdr, private));
+                rc = parse_put(nal, hdr, private, msg);
                 break;
         case PTL_MSG_GET:
-                return (parse_get(nal, hdr, private));
+                rc = parse_get(nal, hdr, private, msg);
                 break;
         case PTL_MSG_REPLY:
-                return (parse_reply(nal, hdr, private));
+                rc = parse_reply(nal, hdr, private, msg);
                 break;
         default:
                 CERROR(LPU64": Dropping <unknown> message from "LPU64
                        ": Bad type=0x%x\n",  nal->ni.nid, hdr->src_nid,
                        hdr->type);
-
-                lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
-                return (-1);
+                rc = PTL_FAIL;
+                break;
+        }
+                
+        if (rc != PTL_OK) {
+                if (msg->md != NULL) {
+                        /* committed... */
+                        lib_finalize(nal, private, msg, rc);
+                } else {
+                        state_lock(nal, &flags);
+                        lib_msg_free(nal, msg); /* expects state_lock held */
+                        state_unlock(nal, &flags);
+
+                        lib_drop_message(nal, private, hdr);
+                }
         }
 }
 
-
-int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+int 
+do_PtlPut(nal_cb_t *nal, void *private, void *v_args, void *v_ret)
 {
         /*
          * Incoming:
@@ -1075,16 +1097,15 @@ int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
          * Outgoing:
          */
 
-        PtlPut_in *args = v_args;
-        PtlPut_out *ret = v_ret;
-        ptl_hdr_t hdr;
-
-        lib_ni_t *ni = &nal->ni;
-        lib_md_t *md;
-        lib_msg_t *msg = NULL;
+        PtlPut_in        *args = v_args;
         ptl_process_id_t *id = &args->target_in;
-        unsigned long flags;
-        int           rc;
+        PtlPut_out       *ret = v_ret;
+        lib_ni_t         *ni = &nal->ni;
+        lib_msg_t        *msg;
+        ptl_hdr_t         hdr;
+        lib_md_t         *md;
+        unsigned long     flags;
+        int               rc;
         
         if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
             fail_peer (nal, id->nid, 1))           /* shall we now? */
@@ -1093,13 +1114,22 @@ int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
                        nal->ni.nid, id->nid);
                 return (ret->rc = PTL_INV_PROC);
         }
-        
-        ret->rc = PTL_OK;
+
+        msg = lib_msg_alloc(nal);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping PUT to "LPU64": ENOMEM on lib_msg_t\n",
+                       ni->nid, id->nid);
+                return (ret->rc = PTL_NOSPACE);
+        }
+
         state_lock(nal, &flags);
+
         md = ptl_handle2md(&args->md_in, nal);
-        if (md == NULL || !md->threshold) {
+        if (md == NULL || md->threshold == 0) {
+                lib_msg_free(nal, msg);
                 state_unlock(nal, &flags);
-                return ret->rc = PTL_INV_MD;
+        
+                return (ret->rc = PTL_INV_MD);
         }
 
         CDEBUG(D_NET, "PtlPut -> %Lu: %lu\n", (unsigned long long)id->nid,
@@ -1126,57 +1156,39 @@ int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
         hdr.msg.put.offset = HTON__u32 (args->offset_in);
         hdr.msg.put.hdr_data = args->hdr_data_in;
 
+        lib_commit_md(nal, md, msg);
+        
+        msg->ev.type = PTL_EVENT_SENT;
+        msg->ev.initiator.nid = ni->nid;
+        msg->ev.initiator.pid = ni->pid;
+        msg->ev.portal = args->portal_in;
+        msg->ev.match_bits = args->match_bits_in;
+        msg->ev.rlength = md->length;
+        msg->ev.mlength = md->length;
+        msg->ev.offset = args->offset_in;
+        msg->ev.hdr_data = args->hdr_data_in;
+
+        lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+
         ni->counters.send_count++;
         ni->counters.send_length += md->length;
 
-        msg = get_new_msg (nal, md);
-        if (msg == NULL) {
-                CERROR("BAD: could not allocate msg!\n");
-                state_unlock(nal, &flags);
-                return ret->rc = PTL_NOSPACE;
-        }
-
-        /*
-         * If this memory descriptor has an event queue associated with
-         * it we need to allocate a message state object and record the
-         * information about this operation that will be recorded into
-         * event queue once the message has been completed.
-         *
-         * NB. We're now committed to the GET, since we just marked the MD
-         * busy.  Callers who observe this (by getting PTL_MD_INUSE from
-         * PtlMDUnlink()) expect a completion event to tell them when the
-         * MD becomes idle. 
-         */
-        if (md->eq) {
-                msg->ev.type = PTL_EVENT_SENT;
-                msg->ev.initiator.nid = ni->nid;
-                msg->ev.initiator.pid = ni->pid;
-                msg->ev.portal = args->portal_in;
-                msg->ev.match_bits = args->match_bits_in;
-                msg->ev.rlength = md->length;
-                msg->ev.mlength = md->length;
-                msg->ev.offset = args->offset_in;
-                msg->ev.hdr_data = args->hdr_data_in;
-
-                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
-        }
-
         state_unlock(nal, &flags);
         
         rc = lib_send (nal, private, msg, &hdr, PTL_MSG_PUT,
                        id->nid, id->pid, md, 0, md->length);
         if (rc != PTL_OK) {
-                /* get_new_msg() committed us to sending by decrementing
-                 * md->threshold, so we have to act like we did send, but
-                 * the network dropped it. */
-                lib_finalize (nal, private, msg);
+                CERROR(LPU64": error sending PUT to "LPU64": %d\n",
+                       ni->nid, id->nid, rc);
+                lib_finalize (nal, private, msg, rc);
         }
         
+        /* completion will be signalled by an event */
         return ret->rc = PTL_OK;
 }
 
-lib_msg_t * lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, 
-                                lib_md_t *getmd)
+lib_msg_t * 
+lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, lib_md_t *getmd)
 {
         /* The NAL can DMA direct to the GET md (i.e. no REPLY msg).  This
          * returns a msg the NAL can pass to lib_finalize() so that a REPLY
@@ -1188,39 +1200,38 @@ lib_msg_t * lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid,
          * lib_finalize() of the original GET. */
 
         lib_ni_t        *ni = &nal->ni;
-        lib_msg_t       *msg;
+        lib_msg_t       *msg = lib_msg_alloc(nal);
         unsigned long    flags;
 
         state_lock(nal, &flags);
 
         LASSERT (getmd->pending > 0);
 
+        if (msg == NULL) {
+                CERROR ("Dropping REPLY from "LPU64": can't allocate msg\n",
+                        peer_nid);
+                goto drop;
+        }
+
         if (getmd->threshold == 0) {
                 CERROR ("Dropping REPLY from "LPU64" for inactive MD %p\n",
                         peer_nid, getmd);
-                goto drop;
+                goto drop_msg;
         }
 
         LASSERT (getmd->offset == 0);
 
         CDEBUG(D_NET, "Reply from "LPU64" md %p\n", peer_nid, getmd);
 
-        msg = get_new_msg (nal, getmd);
-        if (msg == NULL) {
-                CERROR("Dropping REPLY from "LPU64" md %p: can't allocate msg\n", 
-                       peer_nid, getmd);
-                goto drop;
-        }
+        lib_commit_md (nal, getmd, msg);
 
-        if (getmd->eq) {
-                msg->ev.type = PTL_EVENT_REPLY;
-                msg->ev.initiator.nid = peer_nid;
-                msg->ev.initiator.pid = 0;      /* XXX FIXME!!! */
-                msg->ev.rlength = msg->ev.mlength = getmd->length;
-                msg->ev.offset = 0;
+        msg->ev.type = PTL_EVENT_REPLY;
+        msg->ev.initiator.nid = peer_nid;
+        msg->ev.initiator.pid = 0;      /* XXX FIXME!!! */
+        msg->ev.rlength = msg->ev.mlength = getmd->length;
+        msg->ev.offset = 0;
 
-                lib_md_deconstruct(nal, getmd, &msg->ev.mem_desc);
-        }
+        lib_md_deconstruct(nal, getmd, &msg->ev.mem_desc);
 
         ni->counters.recv_count++;
         ni->counters.recv_length += getmd->length;
@@ -1228,7 +1239,9 @@ lib_msg_t * lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid,
         state_unlock(nal, &flags);
 
         return msg;
-        
+
+ drop_msg:
+        lib_msg_free(nal, msg);
  drop:
         nal->ni.counters.drop_count++;
         nal->ni.counters.drop_length += getmd->length;
@@ -1238,7 +1251,8 @@ lib_msg_t * lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid,
         return NULL;
 }
 
-int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+int 
+do_PtlGet(nal_cb_t *nal, void *private, void *v_args, void *v_ret)
 {
         /*
          * Incoming:
@@ -1252,15 +1266,15 @@ int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
          * Outgoing:
          */
 
-        PtlGet_in *args = v_args;
-        PtlGet_out *ret = v_ret;
-        ptl_hdr_t hdr;
-        lib_msg_t *msg = NULL;
-        lib_ni_t *ni = &nal->ni;
+        PtlGet_in        *args = v_args;
         ptl_process_id_t *id = &args->target_in;
-        lib_md_t *md;
-        unsigned long flags;
-        int           rc;
+        PtlGet_out       *ret = v_ret;
+        lib_ni_t         *ni = &nal->ni;
+        lib_msg_t        *msg;
+        ptl_hdr_t         hdr;
+        lib_md_t         *md;
+        unsigned long     flags;
+        int               rc;
         
         if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
             fail_peer (nal, id->nid, 1))           /* shall we now? */
@@ -1269,16 +1283,24 @@ int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
                        nal->ni.nid, id->nid);
                 return (ret->rc = PTL_INV_PROC);
         }
-        
+
+        msg = lib_msg_alloc(nal);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping GET to "LPU64": ENOMEM on lib_msg_t\n",
+                       ni->nid, id->nid);
+                return (ret->rc = PTL_NOSPACE);
+        }
+
         state_lock(nal, &flags);
+
         md = ptl_handle2md(&args->md_in, nal);
         if (md == NULL || !md->threshold) {
+                lib_msg_free(nal, msg);
                 state_unlock(nal, &flags);
+
                 return ret->rc = PTL_INV_MD;
         }
 
-        LASSERT (md->offset == 0);
-
         CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid,
                (unsigned long)id->pid);
 
@@ -1299,51 +1321,33 @@ int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
         hdr.msg.get.src_offset = HTON__u32 (args->offset_in);
         hdr.msg.get.sink_length = HTON__u32 (md->length);
 
-        ni->counters.send_count++;
+        lib_commit_md(nal, md, msg);
 
-        msg = get_new_msg (nal, md);
-        if (msg == NULL) {
-                CERROR("do_PtlGet: BAD - could not allocate cookie!\n");
-                state_unlock(nal, &flags);
-                return ret->rc = PTL_NOSPACE;
-        }
+        msg->ev.type = PTL_EVENT_SENT;
+        msg->ev.initiator.nid = ni->nid;
+        msg->ev.initiator.pid = ni->pid;
+        msg->ev.portal = args->portal_in;
+        msg->ev.match_bits = args->match_bits_in;
+        msg->ev.rlength = md->length;
+        msg->ev.mlength = md->length;
+        msg->ev.offset = args->offset_in;
+        msg->ev.hdr_data = 0;
 
-        /*
-         * If this memory descriptor has an event queue associated with
-         * it we must allocate a message state object that will record
-         * the information to be filled in once the message has been
-         * completed.  More information is in the do_PtlPut() comments.
-         *
-         * NB. We're now committed to the GET, since we just marked the MD
-         * busy.  Callers who observe this (by getting PTL_MD_INUSE from
-         * PtlMDUnlink()) expect a completion event to tell them when the
-         * MD becomes idle. 
-         */
-        if (md->eq) {
-                msg->ev.type = PTL_EVENT_SENT;
-                msg->ev.initiator.nid = ni->nid;
-                msg->ev.initiator.pid = ni->pid;
-                msg->ev.portal = args->portal_in;
-                msg->ev.match_bits = args->match_bits_in;
-                msg->ev.rlength = md->length;
-                msg->ev.mlength = md->length;
-                msg->ev.offset = args->offset_in;
-                msg->ev.hdr_data = 0;
-
-                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
-        }
+        lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+
+        ni->counters.send_count++;
 
         state_unlock(nal, &flags);
 
         rc = lib_send (nal, private, msg, &hdr, PTL_MSG_GET,
                        id->nid, id->pid, NULL, 0, 0);
         if (rc != PTL_OK) {
-                /* get_new_msg() committed us to sending by decrementing
-                 * md->threshold, so we have to act like we did send, but
-                 * the network dropped it. */
-                lib_finalize (nal, private, msg);
+                CERROR(LPU64": error sending GET to "LPU64": %d\n",
+                       ni->nid, id->nid, rc);
+                lib_finalize (nal, private, msg, rc);
         }
         
+        /* completion will be signalled by an event */
         return ret->rc = PTL_OK;
 }
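
The reworked lib_parse()/do_PtlPut()/do_PtlGet() paths above all follow the same ownership rule for the pre-allocated lib_msg_t: once lib_commit_md() has attached the message to an MD, errors are reported through lib_finalize() (so a completion event is still delivered), and only an uncommitted message may be freed and dropped. A minimal stand-alone sketch of that rule follows; xmsg, xerr_t and the parse_*_stub routines are simplified stand-ins, not the real Portals types.

/*
 * Sketch only -- not part of this change; xmsg/xerr_t stand in for
 * lib_msg_t/ptl_err_t, and the printf calls stand in for
 * lib_finalize(), lib_msg_free() and lib_drop_message().
 */
#include <stdio.h>
#include <stdlib.h>

typedef enum { XERR_OK = 0, XERR_FAIL = 1 } xerr_t;

struct xmsg { int committed; };          /* stands in for msg->md != NULL */

static xerr_t parse_committed_stub(struct xmsg *m)
{ m->committed = 1; return XERR_FAIL; }  /* fails after committing to an MD */

static xerr_t parse_uncommitted_stub(struct xmsg *m)
{ (void)m; return XERR_FAIL; }           /* fails before committing */

static void parse_stub(xerr_t (*parse)(struct xmsg *))
{
        struct xmsg *m = calloc(1, sizeof(*m));
        xerr_t rc;

        if (m == NULL)
                return;                  /* would be lib_drop_message() */

        rc = parse(m);
        if (rc != XERR_OK) {
                if (m->committed)
                        printf("finalize committed msg, status %d\n", rc);
                else
                        printf("free uncommitted msg, drop payload\n");
                free(m);
        }
        /* on success the parse routine keeps ownership (not shown) */
}

int main(void)
{
        parse_stub(parse_committed_stub);
        parse_stub(parse_uncommitted_stub);
        return 0;
}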
 
index 9840ff5..04c69b1 100644 (file)
 
 #include <portals/lib-p30.h>
 
-int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg)
+void
+lib_enq_event_locked (nal_cb_t *nal, void *private, 
+                      lib_eq_t *eq, ptl_event_t *ev)
 {
-        lib_md_t     *md;
-        lib_eq_t     *eq;
+        ptl_event_t  *eq_slot;
         int           rc;
+        
+        ev->sequence = eq->sequence++; /* Allocate the next queue slot */
+
+        /* size must be a power of 2 to handle a wrapped sequence # */
+        LASSERT (eq->size != 0 &&
+                 eq->size == LOWEST_BIT_SET (eq->size));
+        eq_slot = eq->base + (ev->sequence & (eq->size - 1));
+
+        /* Copy the event into the allocated slot, ensuring all the rest of
+         * the event's contents have been copied _before_ the sequence
+         * number gets updated.  A process 'getting' an event waits on
+         * the next queue slot's sequence to be 'new'.  When it is, _all_
+         * other event fields had better be consistent.  I assert
+         * 'sequence' is the last member, so I only need a 2-stage copy. */
+
+        LASSERT(sizeof (ptl_event_t) ==
+                offsetof(ptl_event_t, sequence) + sizeof(ev->sequence));
+
+        rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev,
+                            offsetof (ptl_event_t, sequence));
+        LASSERT (rc == PTL_OK);
+
+#ifdef __KERNEL__
+        barrier();
+#endif
+        /* Updating the sequence number is what makes the event 'new'.  NB if
+         * the cb_write below isn't atomic, this could cause a race with
+         * PtlEQGet */
+        rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence,
+                           (void *)&ev->sequence,sizeof (ev->sequence));
+        LASSERT (rc == PTL_OK);
+
+#ifdef __KERNEL__
+        barrier();
+#endif
+
+        if (nal->cb_callback != NULL)
+                nal->cb_callback(nal, private, eq, ev);
+        else if (eq->event_callback != NULL)
+                eq->event_callback(ev);
+}
+
+void 
+lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status)
+{
+        lib_md_t     *md;
+        int           unlink;
         unsigned long flags;
+        int           rc;
+        ptl_hdr_t     ack;
 
         /* ni went down while processing this message */
-        if (nal->ni.up == 0) {
-                return -1;
-        }
+        if (nal->ni.up == 0)
+                return;
 
         if (msg == NULL)
-                return 0;
+                return;
 
-        rc = 0;
-        if (msg->send_ack) {
-                ptl_hdr_t ack;
+        /* Only send an ACK if the PUT completed successfully */
+        if (status == PTL_OK &&
+            !ptl_is_wire_handle_none(&msg->ack_wmd)) {
 
-                LASSERT (!ptl_is_wire_handle_none (&msg->ack_wmd));
+                LASSERT(msg->ev.type == PTL_EVENT_PUT);
 
                 memset (&ack, 0, sizeof (ack));
                 ack.type     = HTON__u32 (PTL_MSG_ACK);
-                ack.dest_nid = HTON__u64 (msg->nid);
+                ack.dest_nid = HTON__u64 (msg->ev.initiator.nid);
                 ack.src_nid  = HTON__u64 (nal->ni.nid);
-                ack.dest_pid = HTON__u32 (msg->pid);
+                ack.dest_pid = HTON__u32 (msg->ev.initiator.pid);
                 ack.src_pid  = HTON__u32 (nal->ni.pid);
                 ack.payload_length = 0;
 
@@ -66,92 +115,35 @@ int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg)
                 ack.msg.ack.mlength = HTON__u32 (msg->ev.mlength);
 
                 rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK,
-                               msg->nid, msg->pid, NULL, 0, 0);
-                /* If this send fails, there's nothing else to clean up */
+                               msg->ev.initiator.nid, msg->ev.initiator.pid, 
+                               NULL, 0, 0);
+                if (rc != PTL_OK) {
+                        /* send failed: there's nothing else to clean up. */
+                        CERROR("Error %d sending ACK to "LPX64"\n", 
+                               rc, msg->ev.initiator.nid);
+                }
         }
 
         md = msg->md;
-        LASSERT (md->pending > 0);  /* I've not dropped my ref yet */
-        eq = md->eq;
 
         state_lock(nal, &flags);
 
-        if (eq != NULL) {
-                ptl_event_t  *ev = &msg->ev;
-                ptl_event_t  *eq_slot;
-
-                /* I have to hold the lock while I bump the sequence number
-                 * and copy the event into the queue.  If not, and I was
-                 * interrupted after bumping the sequence number, other
-                 * events could fill the queue, including the slot I just
-                 * allocated to this event.  On resuming, I would overwrite
-                 * a more 'recent' event with old event state, and
-                 * processes taking events off the queue would not detect
-                 * overflow correctly.
-                 */
-
-                ev->sequence = eq->sequence++;/* Allocate the next queue slot */
-
-                /* size must be a power of 2 to handle a wrapped sequence # */
-                LASSERT (eq->size != 0 &&
-                         eq->size == LOWEST_BIT_SET (eq->size));
-                eq_slot = eq->base + (ev->sequence & (eq->size - 1));
-
-                /* Invalidate unlinked_me unless this is the last
-                 * event for an auto-unlinked MD.  Note that if md was
-                 * auto-unlinked, md->pending can only decrease
-                 */
-                if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 || /* not auto-unlinked */
-                    md->pending != 1)                       /* not last ref */
-                        ev->unlinked_me = PTL_HANDLE_NONE;
-
-                /* Copy the event into the allocated slot, ensuring all the
-                 * rest of the event's contents have been copied _before_
-                 * the sequence number gets updated.  A processes 'getting'
-                 * an event waits on the next queue slot's sequence to be
-                 * 'new'.  When it is, _all_ other event fields had better
-                 * be consistent.  I assert 'sequence' is the last member,
-                 * so I only need a 2 stage copy.
-                 */
-                LASSERT(sizeof (ptl_event_t) ==
-                        offsetof(ptl_event_t, sequence) + sizeof(ev->sequence));
-
-                rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev,
-                                    offsetof (ptl_event_t, sequence));
-                LASSERT (rc == 0);
-
-#ifdef __KERNEL__
-                barrier();
-#endif
-                /* Updating the sequence number is what makes the event 'new' */
-
-                /* cb_write is not necessarily atomic, so this could
-                   cause a race with PtlEQGet */
-                rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence,
-                                   (void *)&ev->sequence,sizeof (ev->sequence));
-                LASSERT (rc == 0);
+        /* Now it's safe to drop my caller's ref */
+        md->pending--;
+        LASSERT (md->pending >= 0);
 
-#ifdef __KERNEL__
-                barrier();
-#endif
+        /* Should I unlink this MD? */
+        unlink = (md->pending == 0 &&           /* No other refs */
+                  (md->threshold == 0 ||        /* All ops done */
+                   md->md_flags & PTL_MD_FLAG_UNLINK) != 0); /* black spot */
 
-                /* I must also ensure that (a) callbacks are made in the
-                 * same order as the events land in the queue, and (b) the
-                 * callback occurs before the event can be removed from the
-                 * queue, so I can't drop the lock during the callback. */
-                if (nal->cb_callback != NULL)
-                        nal->cb_callback(nal, private, eq, ev);
-                else  if (eq->event_callback != NULL)
-                        (void)((eq->event_callback) (ev));
-        }
+        msg->ev.status = status;
+        msg->ev.unlinked = unlink;
 
-        LASSERT ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 ||
-                 (md->md_flags & PTL_MD_FLAG_UNLINK) != 0);
+        if (md->eq != NULL)
+                lib_enq_event_locked(nal, private, md->eq, &msg->ev);
 
-        md->pending--;
-        if (md->pending == 0 && /* no more outstanding operations on this md */
-            (md->threshold == 0 ||              /* done its business */
-             (md->md_flags & PTL_MD_FLAG_UNLINK) != 0)) /* marked for death */
+        if (unlink)
                 lib_md_unlink(nal, md);
 
         list_del (&msg->msg_list);
@@ -159,6 +151,4 @@ int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg)
         lib_msg_free(nal, msg);
 
         state_unlock(nal, &flags);
-
-        return rc;
 }
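
The new lib_enq_event_locked() above delivers events with a two-stage copy: everything except the sequence number is written first, then the sequence number last, with barriers in between, so a poller that observes the new sequence also observes a consistent event. A stand-alone sketch of the same idea follows; the simplified event struct and the compiler-only barrier are stand-ins for ptl_event_t, nal->cb_write() and the kernel's barrier().

#include <stddef.h>
#include <string.h>

struct ev {
        int           type;
        int           mlength;
        unsigned long sequence;         /* must remain the last member */
};

/* compiler barrier only; the kernel build would use barrier() */
#define xbarrier()      __asm__ __volatile__("" ::: "memory")

void publish_event(struct ev *slot, const struct ev *src)
{
        /* stage 1: all fields up to, but excluding, the sequence number */
        memcpy(slot, src, offsetof(struct ev, sequence));
        xbarrier();

        /* stage 2: store the sequence number last -- this is what makes
         * the slot look 'new' to a PtlEQGet-style poller */
        slot->sequence = src->sequence;
        xbarrier();
}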
index dc427b0..6035ca1 100644 (file)
@@ -1,5 +1,9 @@
 CPPFLAGS=
 INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir)
-lib_LIBRARIES = libtcpnal.a
+noinst_LIBRARIES = libtcpnal.a
 pkginclude_HEADERS =  pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h
 libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
+
+if LIBLUSTRE
+libtcpnal_a_CFLAGS = -fPIC
+endif
index 0b4940f..9a90ab8 100644 (file)
@@ -6,6 +6,9 @@
  *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
  */
 
+#ifndef TCPNAL_PROCBRIDGE_H
+#define TCPNAL_PROCBRIDGE_H
+
 #include <portals/lib-p30.h>
 
 typedef struct bridge {
@@ -27,3 +30,5 @@ nal_t *bridge_init(ptl_interface_t nal,
 
 typedef int (*nal_initialize)(bridge);
 extern nal_initialize nal_table[PTL_IFACE_MAX];
+
+#endif
index 29e75be..ca6999a 100644 (file)
@@ -309,7 +309,8 @@ tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation)
  */
 connection force_tcp_connection(manager m,
                                 unsigned int ip,
-                                unsigned short port)
+                                unsigned short port,
+                                procbridge pb)
 {
     connection conn;
     struct sockaddr_in addr;
@@ -357,6 +358,10 @@ connection force_tcp_connection(manager m,
             exit(-1);
 
         conn = allocate_connection(m, ip, port, fd);
+
+        /* let nal thread know this event right away */
+        if (conn)
+                procbridge_wakeup_nal(pb);
     }
 
     pthread_mutex_unlock(&m->conn_lock);
index fb1eaab..343ffa6 100644 (file)
@@ -7,6 +7,7 @@
  */
 
 #include <table.h>
+#include <procbridge.h>
 
 typedef struct manager {
     table connections;
@@ -26,7 +27,8 @@ typedef struct connection {
     manager m;
 } *connection;
 
-connection force_tcp_connection(manager m, unsigned int ip, unsigned int short);
+connection force_tcp_connection(manager m, unsigned int ip, unsigned int short,
+                                procbridge pb);
 manager init_connections(unsigned short, int (*f)(void *, void *), void *);
 void remove_connection(void *arg);
 void shutdown_connections(manager m);
index 2a3fbd8..bddfe9a 100644 (file)
 #include <stdlib.h>
 #include <unistd.h>
 #include <string.h>
+#ifndef __CYGWIN__
+#include <syscall.h>
+#endif
+#include <sys/socket.h>
 #include <procbridge.h>
 #include <pqtimer.h>
 #include <dispatch.h>
 #include <errno.h>
 
 
+/* XXX CFS workaround, to give the nal thread a chance to wake up
+ * from waiting in select()
+ */
+static int procbridge_notifier_handler(void *arg)
+{
+    static char buf[8];
+    procbridge p = (procbridge) arg;
+
+    syscall(SYS_read, p->notifier[1], buf, sizeof(buf));
+    return 1;
+}
+
+void procbridge_wakeup_nal(procbridge p)
+{
+    static char buf[8];
+    syscall(SYS_write, p->notifier[0], buf, sizeof(buf));
+}
+
 /* Function: forward
  * Arguments: nal_t *nal: pointer to my top-side nal structure
  *            id: the command to pass to the lower layer
@@ -79,6 +101,7 @@ static int procbridge_shutdown(nal_t *n, int ni)
     procbridge p=(procbridge)b->local;
 
     p->nal_flags |= NAL_FLAG_STOPPING;
+    procbridge_wakeup_nal(p);
 
     do {
         pthread_mutex_lock(&p->mutex);
@@ -104,6 +127,12 @@ static int procbridge_validate(nal_t *nal, void *base, size_t extent)
 }
 
 
+/* FIXME cfs temporary workaround! FIXME
+ * global time out value
+ */
+int __tcpnal_eqwait_timeout_value = 0;
+int __tcpnal_eqwait_timedout = 0;
+
 /* Function: yield
  * Arguments:  pid:
  *
@@ -118,7 +147,19 @@ static void procbridge_yield(nal_t *n)
     procbridge p=(procbridge)b->local;
 
     pthread_mutex_lock(&p->mutex);
-    pthread_cond_wait(&p->cond,&p->mutex);
+    if (!__tcpnal_eqwait_timeout_value) {
+        pthread_cond_wait(&p->cond,&p->mutex);
+    } else {
+        struct timeval now;
+        struct timespec timeout;
+
+        gettimeofday(&now, NULL);
+        timeout.tv_sec = now.tv_sec + __tcpnal_eqwait_timeout_value;
+        timeout.tv_nsec = now.tv_usec * 1000;
+
+        __tcpnal_eqwait_timedout =
+                pthread_cond_timedwait(&p->cond, &p->mutex, &timeout);
+    }
     pthread_mutex_unlock(&p->mutex);
 }
 
@@ -194,6 +235,19 @@ nal_t *procbridge_interface(int num_interface,
     p->nal_flags = 0;
     pthread_mutex_init(&p->nal_cb_lock, 0);
 
+    /* initialize notifier */
+    if (socketpair(AF_UNIX, SOCK_STREAM, 0, p->notifier)) {
+        perror("socketpair failed");
+        return NULL;
+    }
+
+    if (!register_io_handler(p->notifier[1], READ_HANDLER,
+                procbridge_notifier_handler, p)) {
+        perror("fail to register notifier handler");
+        return NULL;
+    }
+
+    /* create nal thread */
     if (pthread_create(&p->t, NULL, nal_thread, &args)) {
         perror("nal_init: pthread_create");
         return(NULL);
index 317e22f..965f83d 100644 (file)
@@ -25,6 +25,9 @@ typedef struct procbridge {
     pthread_cond_t cond;
     pthread_mutex_t mutex;
 
+    /* socket pair used to notify nal thread */
+    int notifier[2];
+
     int nal_flags;
 
     pthread_mutex_t nal_cb_lock;
@@ -51,5 +54,6 @@ extern nal_t *procbridge_interface(int num_interface,
                                    ptl_pt_index_t ptl_size,
                                    ptl_ac_index_t acl_size,
                                    ptl_pid_t requested_pid);
+extern void procbridge_wakeup_nal(procbridge p);
 
 #endif
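
The notifier added to procbridge above is a socketpair variant of the classic self-pipe trick: any thread (for example force_tcp_connection()) writes a junk byte to wake the nal thread out of select(), and procbridge_notifier_handler() drains it. A stand-alone sketch of the pattern, independent of the tcpnal code itself:

#include <stdio.h>
#include <sys/select.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        int    notifier[2];
        char   buf[8];
        fd_set rfds;

        if (socketpair(AF_UNIX, SOCK_STREAM, 0, notifier) != 0) {
                perror("socketpair");
                return 1;
        }

        /* "waker" side: any write makes the other end readable */
        write(notifier[0], "x", 1);

        /* "nal thread" side: select() returns at once; the read handler
         * then drains the junk byte so the next select() blocks again */
        FD_ZERO(&rfds);
        FD_SET(notifier[1], &rfds);
        select(notifier[1] + 1, &rfds, NULL, NULL, NULL);
        read(notifier[1], buf, sizeof(buf));

        printf("nal thread woken\n");
        close(notifier[0]);
        close(notifier[1]);
        return 0;
}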
index 2627253..2a5ba0d 100644 (file)
 /* the following functions are stubs to satisfy the nal definition
    without doing anything particularly useful */
 
-static int nal_write(nal_cb_t *nal,
-                     void *private,
-                     user_ptr dst_addr,
-                     void *src_addr,
-                     size_t len)
+static ptl_err_t nal_write(nal_cb_t *nal,
+                           void *private,
+                           user_ptr dst_addr,
+                           void *src_addr,
+                           size_t len)
 {
     memcpy(dst_addr, src_addr, len);
-    return 0;
+    return PTL_OK;
 }
 
-static int nal_read(nal_cb_t * nal,
-                    void *private,
-                   void *dst_addr,
-                   user_ptr src_addr,
-                   size_t len)
+static ptl_err_t nal_read(nal_cb_t * nal,
+                          void *private,
+                          void *dst_addr,
+                          user_ptr src_addr,
+                          size_t len)
 {
        memcpy(dst_addr, src_addr, len);
-       return 0;
+       return PTL_OK;
 }
 
 static void *nal_malloc(nal_cb_t *nal,
index fe24efc..c4ccae1 100644 (file)
@@ -126,15 +126,6 @@ void select_timer_block(when until)
         timeout_pointer=&timeout;
     } else timeout_pointer=0;
 
-
-    /* FIXME
-     * temporarily add timer for endless waiting problem.
-     * FIXME
-     */
-    timeout.tv_sec = 1;
-    timeout.tv_usec = 0;
-    timeout_pointer=&timeout;
-
     FD_ZERO(&fds[0]);
     FD_ZERO(&fds[1]);
     FD_ZERO(&fds[2]);
index 1041d1d..0c47f42 100644 (file)
  *
  * sends a packet to the peer, after ensuring that a connection exists
  */
-int tcpnal_send(nal_cb_t *n,
-               void *private,
-               lib_msg_t *cookie,
-               ptl_hdr_t *hdr,
-               int type,
-               ptl_nid_t nid,
-               ptl_pid_t pid,
-                unsigned int niov,
-                struct iovec *iov,
-               size_t len)
+ptl_err_t tcpnal_send(nal_cb_t *n,
+                      void *private,
+                      lib_msg_t *cookie,
+                      ptl_hdr_t *hdr,
+                      int type,
+                      ptl_nid_t nid,
+                      ptl_pid_t pid,
+                      unsigned int niov,
+                      struct iovec *iov,
+                      size_t offset,
+                      size_t len)
 {
     connection c;
     bridge b=(bridge)n->nal_data;
     struct iovec tiov[257];
     static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER;
-    int   rc;
+    ptl_err_t rc = PTL_OK;
+    int   sysrc;
     int   total;
+    int   ntiov;
     int i;
 
     if (!(c=force_tcp_connection((manager)b->lower,
                                  PNAL_IP(nid,b),
-                                 PNAL_PORT(nid,pid)))) 
-        return(1);
+                                 PNAL_PORT(nid,pid),
+                                 b->local)))
+        return(PTL_FAIL);
 
-#if 0
     /* TODO: these results should be checked. furthermore, provision
        must be made for the SIGPIPE which is delivered when
        writing on a tcp socket which has closed underneath
        the application. there is a linux flag in the sendmsg
        call which turns off the signally behaviour, but its
        nonstandard */
-    syscall(SYS_write, c->fd,hdr,sizeof(ptl_hdr_t));
-    LASSERT (niov <= 1);
-    if (len) syscall(SYS_write, c->fd,iov[0].iov_base,len);
-#else
+
     LASSERT (niov <= 256);
 
     tiov[0].iov_base = hdr;
     tiov[0].iov_len = sizeof(ptl_hdr_t);
+    ntiov = 1 + lib_extract_iov(256, &tiov[1], niov, iov, offset, len);
 
-    if (niov > 0)
-            memcpy(&tiov[1], iov, niov * sizeof(struct iovec));
     pthread_mutex_lock(&send_lock);
 #if 1
-    for (i = total = 0; i <= niov; i++)
+    for (i = total = 0; i < ntiov; i++)
             total += tiov[i].iov_len;
     
-    rc = syscall(SYS_writev, c->fd, tiov, niov+1);
-    if (rc != total) {
+    sysrc = syscall(SYS_writev, c->fd, tiov, ntiov);
+    if (sysrc != total) {
             fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
-                     rc, total, errno);
+                     sysrc, total, errno);
-            abort();
+            rc = PTL_FAIL;
     }
 #else
-    for (i = total = 0; i <= niov; i++) {
+    for (i = total = 0; i < ntiov; i++) {
             rc = send(c->fd, tiov[i].iov_base, tiov[i].iov_len, 0);
             
             if (rc != tiov[i].iov_len) {
                     fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
                              rc, tiov[i].iov_len, errno);
-                    abort();
+                    rc = PTL_FAIL;
+                    break;
             }
-            total != rc;
+            total += rc;
     }
 #endif
 #if 0
@@ -130,10 +130,14 @@ int tcpnal_send(nal_cb_t *n,
              total, niov + 1);
 #endif
     pthread_mutex_unlock(&send_lock);
-#endif
-    lib_finalize(n, private, cookie);
-        
-    return(0);
+
+    if (rc == PTL_OK) {
+            /* NB the NAL only calls lib_finalize() if it returns PTL_OK
+             * from cb_send() */
+            lib_finalize(n, private, cookie, PTL_OK);
+    }
+
+    return(rc);
 }
 
 
@@ -150,15 +154,18 @@ int tcpnal_send(nal_cb_t *n,
  * blocking read of the requested data. must drain out the
  * difference of manipulated and requested lengths from the network
  */
-int tcpnal_recv(nal_cb_t *n,
-               void *private,
-               lib_msg_t *cookie,
-                unsigned int niov,
-                struct iovec *iov,
-               size_t mlen,
-               size_t rlen)
+ptl_err_t tcpnal_recv(nal_cb_t *n,
+                      void *private,
+                      lib_msg_t *cookie,
+                      unsigned int niov,
+                      struct iovec *iov,
+                      size_t offset,
+                      size_t mlen,
+                      size_t rlen)
 
 {
+    struct iovec tiov[256];
+    int ntiov;
     int i;
 
     if (!niov)
@@ -168,16 +175,19 @@ int tcpnal_recv(nal_cb_t *n,
     LASSERT(rlen);
     LASSERT(rlen >= mlen);
 
+    ntiov = lib_extract_iov(256, tiov, niov, iov, offset, mlen);
+    
     /* FIXME
      * 1. Is this efficient enough? change to use readv() directly?
      * 2. need to check return from read_connection()
      * - MeiJia
      */
-    for (i = 0; i < niov; i++)
-        read_connection(private, iov[i].iov_base, iov[i].iov_len);
+    for (i = 0; i < ntiov; i++)
+        read_connection(private, tiov[i].iov_base, tiov[i].iov_len);
 
 finalize:
-    lib_finalize(n, private, cookie);
+    /* FIXME; we always assume success here... */
+    lib_finalize(n, private, cookie, PTL_OK);
 
     if (mlen!=rlen){
         char *trash=malloc(rlen-mlen);
@@ -187,7 +197,7 @@ finalize:
         free(trash);
     }
 
-    return(rlen);
+    return(PTL_OK);
 }
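
tcpnal_send()/tcpnal_recv() above now call lib_extract_iov() to build an iovec array covering only the byte range [offset, offset+len) of the caller's fragments before writev()/read_connection(). The real helper lives in the portals library; the following is only a sketch of what such an extraction does, under the assumption that it returns the number of destination fragments filled.

#include <assert.h>
#include <sys/uio.h>

int extract_iov_sketch(int dst_niov, struct iovec *dst,
                       int src_niov, const struct iovec *src,
                       size_t offset, size_t len)
{
        int n = 0;

        if (len == 0)
                return 0;

        /* skip whole source fragments that lie entirely before 'offset' */
        while (src_niov > 0 && offset >= src->iov_len) {
                offset -= src->iov_len;
                src++;
                src_niov--;
        }

        while (len > 0) {
                size_t frag;

                assert(src_niov > 0 && n < dst_niov);
                frag = src->iov_len - offset;
                if (frag > len)
                        frag = len;

                dst[n].iov_base = (char *)src->iov_base + offset;
                dst[n].iov_len  = frag;
                n++;

                len -= frag;
                offset = 0;
                src++;
                src_niov--;
        }

        return n;               /* destination fragments actually used */
}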
 
 
index f1878df..6c31b3d 100644 (file)
@@ -3,17 +3,18 @@
 # This code is issued under the GNU General Public License.
 # See the file COPYING in this distribution
 
-
 COMPILE = $(CC) -Wall -g -I$(srcdir)/../include
 LINK = $(CC) -o $@
 
 if LIBLUSTRE
-tmp=
+
+noinst_LIBRARIES = libuptlctl.a
+libuptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h
+libuptlctl_a_CFLAGS = -fPIC
+
 else
-tmp=gmnalnid
-endif
 
-sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck $(tmp)
+sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck gmnalnid
 lib_LIBRARIES = libptlctl.a
 
 acceptor_SOURCES = acceptor.c # -lefence
@@ -33,3 +34,4 @@ debugctl_LDADD = -L. -lptlctl -lncurses # -lefence
 debugctl_DEPENDENCIES = libptlctl.a
 
 routerstat_SOURCES = routerstat.c
+endif
index c6628ff..58a408a 100644 (file)
@@ -23,7 +23,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <syscall.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <portals/api-support.h>
 #include <portals/ptlctl.h>
 
+#ifndef __CYGWIN__
+ #include <syscall.h>
+#else
+ #include <windows.h>
+ #include <windef.h>
+#endif
+
+static ioc_handler_t  do_ioctl;                 /* forward ref */
+static ioc_handler_t *current_ioc_handler = &do_ioctl;
+
 struct ioc_dev {
        const char * dev_name;
        int dev_fd;
@@ -48,7 +57,16 @@ struct dump_hdr {
        int opc;
 };
 
-char * dump_filename;
+char *dump_filename;
+
+void
+set_ioc_handler (ioc_handler_t *handler)
+{
+        if (handler == NULL)
+                current_ioc_handler = do_ioctl;
+        else
+                current_ioc_handler = handler;
+}
 
 static int
 open_ioc_dev(int dev_id) 
@@ -115,7 +133,7 @@ dump(int dev_id, int opc, void *buf)
 {
        FILE *fp;
        struct dump_hdr dump_hdr;
-       struct portal_ioctl_hdr * ioc_hdr = (struct  portal_ioctl_hdr *) buf;
+        struct portal_ioctl_hdr * ioc_hdr = (struct  portal_ioctl_hdr *) buf;
        int rc;
        
        printf("dumping opc %x to %s\n", opc, dump_filename);
@@ -132,17 +150,17 @@ dump(int dev_id, int opc, void *buf)
                return -EINVAL;
        }
        
-       rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp);
-       if (rc == 1)
-               rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp);
-       fclose(fp);
-       if (rc != 1) {
-               fprintf(stderr, "%s: %s\n", dump_filename, 
-                       strerror(errno));
-               return -EINVAL;
-       }
-       
-       return 0;
+        rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp);
+        if (rc == 1)
+                rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp);
+        fclose(fp);
+        if (rc != 1) {
+                fprintf(stderr, "%s: %s\n", dump_filename,
+                        strerror(errno));
+                return -EINVAL;
+        }
+
+        return 0;
 }
 
 /* register a device to send ioctls to.  */
@@ -184,16 +202,17 @@ set_ioctl_dump(char * file)
                free(dump_filename);
        
        dump_filename = strdup(file);
+        if (dump_filename == NULL)
+                abort();
+
+        set_ioc_handler(&dump);
        return 0;
 }
 
 int
 l_ioctl(int dev_id, int opc, void *buf)
 {
-       if (dump_filename) 
-               return dump(dev_id, opc, buf);
-       else 
-               return do_ioctl(dev_id, opc, buf);
+        return current_ioc_handler(dev_id, opc, buf);
 }
 
 /* Read an ioctl dump file, and call the ioc_func for each ioctl buffer
@@ -207,16 +226,28 @@ l_ioctl(int dev_id, int opc, void *buf)
 int 
 parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *))
 {
-       int fd, line =0;
+       int line =0;
        struct stat st;
-       char *buf, *end;
+       char *start, *buf, *end;
+#ifndef __CYGWIN__
+        int fd;
+#else
+        HANDLE fd, hmap;
+        DWORD size;
+#endif
        
+#ifndef __CYGWIN__
        fd = syscall(SYS_open, dump_file, O_RDONLY);
+        if (fd < 0) {
+                fprintf(stderr, "couldn't open %s: %s\n", dump_file, 
+                        strerror(errno));
+                exit(1);
+        }
 
 #ifndef SYS_fstat64
-#define __SYS_fstat__ SYS_fstat
+# define __SYS_fstat__ SYS_fstat
 #else
-#define __SYS_fstat__ SYS_fstat64
+# define __SYS_fstat__ SYS_fstat64
 #endif
        if (syscall(__SYS_fstat__, fd, &st)) { 
                perror("stat fails");
@@ -228,41 +259,72 @@ parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *))
                exit(1);
        }
 
-       buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0);
-       end = buf + st.st_size;
+       start = buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0);
+       end = start + st.st_size;
        close(fd);
-       while (buf < end) {
-               struct dump_hdr *dump_hdr = (struct dump_hdr *) buf;
-               struct portal_ioctl_hdr * data;
-               char tmp[8096];
-               int rc;
-               
-               line++;
+        if (start == MAP_FAILED) {
+               fprintf(stderr, "can't create file mapping\n");
+               exit(1);
+        }
+#else
+        fd = CreateFile(dump_file, GENERIC_READ, FILE_SHARE_READ, NULL,
+                        OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+        size = GetFileSize(fd, NULL);
+        if (size < 1) {
+               fprintf(stderr, "KML is empty\n");
+               exit(1);
+       }
 
-               data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr));
-               if (buf + data->ioc_len > end ) {
-                       fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf,
-                               data->ioc_len, end);
-                       return -1;
-               }
+        hmap = CreateFileMapping(fd, NULL, PAGE_READONLY, 0,0, NULL);
+        start = buf = MapViewOfFile(hmap, FILE_MAP_READ, 0, 0, 0);
+        end = buf + size;
+        CloseHandle(fd);
+        if (start == NULL) {
+               fprintf(stderr, "can't create file mapping\n");
+               exit(1);
+        }
+#endif /* __CYGWIN__ */
+
+       while (buf < end) {
+                struct dump_hdr *dump_hdr = (struct dump_hdr *) buf;
+                struct portal_ioctl_hdr * data;
+                char tmp[8096];
+                int rc;
+
+                line++;
+
+                data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr));
+                if (buf + data->ioc_len > end ) {
+                        fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf,
+                                data->ioc_len, end);
+                        return -1;
+                }
 #if 0
-               printf ("dump_hdr: %lx data: %lx\n",
-                       (unsigned long)dump_hdr - (unsigned long)buf, (unsigned long)data - (unsigned long)buf);
-               
-               printf("%d: opcode %x len: %d  ver: %x ", line, dump_hdr->opc,
-                      data->ioc_len, data->ioc_version);
+                printf ("dump_hdr: %lx data: %lx\n",
+                        (unsigned long)dump_hdr - (unsigned long)buf, (unsigned long)data - (unsigned long)buf);
+
+                printf("%d: opcode %x len: %d  ver: %x ", line, dump_hdr->opc,
+                       data->ioc_len, data->ioc_version);
 #endif
 
-               memcpy(tmp, data, data->ioc_len);
+                memcpy(tmp, data, data->ioc_len);
 
-               rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp);
-               if (rc) {
-                       printf("failed: %d\n", rc);
-                       exit(1);
-               }
+                rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp);
+                if (rc) {
+                        printf("failed: %d\n", rc);
+                        exit(1);
+                }
 
-               buf += data->ioc_len + sizeof(*dump_hdr);
+                buf += data->ioc_len + sizeof(*dump_hdr);
        }
+
+#ifndef __CYGWIN__
+        munmap(start, end - start);
+#else
+        UnmapViewOfFile(start);
+        CloseHandle(hmap);
+#endif
+
        return 0;
 }
 
index 3c7ec20..fb031ae 100644 (file)
 #include <stdarg.h>
 #include <asm/byteorder.h>
 
+#ifdef __CYGWIN__
+
+#include <netinet/in.h>
+
+#warning assuming little endian
+
+#define __cpu_to_le64(x) ((__u64)(x))
+#define __le64_to_cpu(x) ((__u64)(x))
+#define __cpu_to_le32(x) ((__u32)(x))
+#define __le32_to_cpu(x) ((__u32)(x))
+#define __cpu_to_le16(x) ((__u16)(x))
+#define __le16_to_cpu(x) ((__u16)(x))
+
+#endif /* __CYGWIN__ */
 #include <portals/api-support.h>
 #include <portals/ptlctl.h>
 #include <portals/list.h>
@@ -94,6 +109,9 @@ pcfg_ioctl(struct portals_cfg *pcfg)
                 PORTAL_IOC_INIT (data);
                 data.ioc_pbuf1   = (char*)pcfg;
                 data.ioc_plen1   = sizeof(*pcfg);
+                /* XXX liblustre hack XXX */
+                data.ioc_nal_cmd = pcfg->pcfg_command;
+                data.ioc_nid = pcfg->pcfg_nid;
 
                 rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
         }
index 51883f2..5af3249 100644 (file)
@@ -69,19 +69,19 @@ int ptlbd_send_rw_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd,
         op->op_block_cnt = page_count;
 
         if (cmd == PTLBD_READ) 
-                desc = ptlrpc_prep_bulk_imp (req, BULK_PUT_SINK, PTLBD_BULK_PORTAL);
+                desc = ptlrpc_prep_bulk_imp (req, page_count,
+                                             BULK_PUT_SINK, PTLBD_BULK_PORTAL);
         else
-                desc = ptlrpc_prep_bulk_imp (req, BULK_GET_SOURCE, PTLBD_BULK_PORTAL);
+                desc = ptlrpc_prep_bulk_imp (req, page_count,
+                                             BULK_GET_SOURCE, PTLBD_BULK_PORTAL);
         if ( desc == NULL )
                 GOTO(out, rc = 1);              /* need to return error cnt */
         /* NB req now owns desc, and frees it when she frees herself */
         
         for ( niob = niobs, bh = first_bh ; bh ; bh = bh->b_reqnext, niob++ ) {
-                rc = ptlrpc_prep_bulk_page(desc, bh->b_page,
-                                           bh_offset (bh) & (PAGE_SIZE - 1),
-                                           bh->b_size);
-                if (rc != 0)
-                        GOTO(out, rc = 1);      /* need to return error cnt */
+                ptlrpc_prep_bulk_page(desc, bh->b_page,
+                                      bh_offset (bh) & (PAGE_SIZE - 1),
+                                      bh->b_size);
 
                 niob->n_block_nr = bh->b_blocknr;
                 niob->n_offset = bh_offset(bh);
@@ -221,6 +221,7 @@ int ptlbd_srv_rw_req(ptlbd_cmd_t cmd, __u16 index,
         if ( rsp == NULL )
                 GOTO (out, rc = -EFAULT);
 
+        /* FIXME: assumes each niobuf fits in 1 page */
         page_count = req->rq_reqmsg->buflens[1] / sizeof(struct ptlbd_niob);
         if (swab) {                             /* swab remaining niobs */
                 for (i = 1; i < page_count; i++)
@@ -232,9 +233,11 @@ int ptlbd_srv_rw_req(ptlbd_cmd_t cmd, __u16 index,
         }
         
         if (cmd == PTLBD_READ)
-                desc = ptlrpc_prep_bulk_exp (req, BULK_PUT_SOURCE, PTLBD_BULK_PORTAL);
+                desc = ptlrpc_prep_bulk_exp (req, page_count, 
+                                             BULK_PUT_SOURCE, PTLBD_BULK_PORTAL);
         else
-                desc = ptlrpc_prep_bulk_exp (req, BULK_GET_SINK, PTLBD_BULK_PORTAL);
+                desc = ptlrpc_prep_bulk_exp (req, page_count,
+                                             BULK_GET_SINK, PTLBD_BULK_PORTAL);
         if (desc == NULL) {
                 error_cnt++;
                 GOTO(out_reply, rc = -ENOMEM);
@@ -250,25 +253,20 @@ int ptlbd_srv_rw_req(ptlbd_cmd_t cmd, __u16 index,
                 }
                 list_add_tail(&page->list, &tmp_pages);
 
-                rc = ptlrpc_prep_bulk_page(desc, page,
-                                           niob->n_offset & (PAGE_SIZE - 1),
-                                           niob->n_length);
-                if (rc != 0) {
-                        error_cnt++;
-                        GOTO(out_reply, rc);
-                }
+                ptlrpc_prep_bulk_page(desc, page,
+                                      niob->n_offset & (PAGE_SIZE - 1),
+                                      niob->n_length);
         }
 
         if ( cmd == PTLBD_READ ) {
-                if ((rc = ptlbd_do_filp(filp, PTLBD_READ, niobs, 
-                                        page_count, &tmp_pages)) < 0) {
+                rc = ptlbd_do_filp(filp, PTLBD_READ, niobs, 
+                                   page_count, &tmp_pages);
+                if (rc < 0) {
                         error_cnt++;
                         GOTO(out_reply, rc);
                 }
-                rc = ptlrpc_bulk_put(desc);
-        } else {
-                rc = ptlrpc_bulk_get(desc);
         }
+        rc = ptlrpc_start_bulk_transfer(desc);
 
         if ( rc ) {
                 error_cnt++;
@@ -276,13 +274,16 @@ int ptlbd_srv_rw_req(ptlbd_cmd_t cmd, __u16 index,
         }
 
         lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, desc);
-        rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete(desc), &lwi);
+        rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc), &lwi);
         if (rc != 0) {
                 LASSERT(rc == -ETIMEDOUT);
                 ptlrpc_abort_bulk(desc);
                 error_cnt++;
                 GOTO(out_reply, rc);
         }
+
+        /* XXX do some error handling */
+        LASSERT(desc->bd_success && desc->bd_nob_transferred == desc->bd_nob);
         
         if ( cmd == PTLBD_WRITE ) {
                 if ((rc = ptlbd_do_filp(filp, PTLBD_WRITE, niobs, 
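For reference, the call sequence the reworked bulk API expects on the server side, as used in the ptlbd hunks above, looks roughly like the sketch below. This is not part of the patch: the function names come from the hunks in this diff, while pages[i], offsets[i] and lengths[i] are illustrative placeholders and error handling is elided.

        /* sketch only: server-side bulk with the reworked API */
        desc = ptlrpc_prep_bulk_exp(req, page_count,
                                    BULK_PUT_SOURCE, PTLBD_BULK_PORTAL);
        if (desc == NULL)
                GOTO(out_reply, rc = -ENOMEM);

        /* fragments are attached up front; ptlrpc_prep_bulk_page() can no
         * longer fail because the iov array was sized in prep_bulk_exp() */
        for (i = 0; i < page_count; i++)
                ptlrpc_prep_bulk_page(desc, pages[i], offsets[i], lengths[i]);

        /* one entry point replaces ptlrpc_bulk_put()/ptlrpc_bulk_get() */
        rc = ptlrpc_start_bulk_transfer(desc);

        /* completion is signalled by the descriptor going inactive */
        lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, desc);
        rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc), &lwi);
        if (rc == 0) {
                /* on success desc->bd_success is set and
                 * desc->bd_nob_transferred reports the byte count */
                LASSERT(desc->bd_success);
        }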
index e159df0..c7ac53b 100644 (file)
@@ -49,10 +49,9 @@ static int ptlbd_sv_setup(struct obd_device *obddev, obd_count len, void *buf)
                 RETURN(PTR_ERR(ptlbd->filp));
 
         ptlbd->ptlbd_service =
-                ptlrpc_init_svc(PTLBD_NEVENTS, PTLBD_NBUFS, PTLBD_BUFSIZE,
-                                PTLBD_MAXREQSIZE, PTLBD_REQUEST_PORTAL,
-                                PTLBD_REPLY_PORTAL,
-                                ptlbd_handle, "ptlbd_sv", 
+                ptlrpc_init_svc(PTLBD_NBUFS, PTLBD_BUFSIZE, PTLBD_MAXREQSIZE,
+                                PTLBD_REQUEST_PORTAL, PTLBD_REPLY_PORTAL,
+                                ptlbd_handle, "ptlbd_sv",
                                 obddev->obd_proc_entry);
 
         if (ptlbd->ptlbd_service == NULL) 
index 2c6de45..4822d33 100644 (file)
@@ -5,19 +5,21 @@
 
 DEFS=
 
-LDLMSOURCES= $(top_srcdir)/ldlm/l_lock.c $(top_srcdir)/ldlm/ldlm_lock.c \
-   $(top_srcdir)/ldlm/ldlm_resource.c $(top_srcdir)/ldlm/ldlm_lib.c     \
-   $(top_srcdir)/ldlm/ldlm_plain.c $(top_srcdir)/ldlm/ldlm_extent.c     \
-   $(top_srcdir)/ldlm/ldlm_flock.c $(top_srcdir)/ldlm/ldlm_request.c    \
-   $(top_srcdir)/ldlm/ldlm_lockd.c $(top_srcdir)/ldlm/ldlm_internal.h
+LDLM_COMM_SOURCES= $(top_srcdir)/ldlm/l_lock.c $(top_srcdir)/ldlm/ldlm_lock.c \
+    $(top_srcdir)/ldlm/ldlm_resource.c $(top_srcdir)/ldlm/ldlm_lib.c          \
+    $(top_srcdir)/ldlm/ldlm_plain.c $(top_srcdir)/ldlm/ldlm_extent.c          \
+    $(top_srcdir)/ldlm/ldlm_request.c $(top_srcdir)/ldlm/ldlm_lockd.c         \
+    $(top_srcdir)/ldlm/ldlm_internal.h
 
-COMMON_SOURCES =  client.c recover.c connection.c niobuf.c pack_generic.c \
-    events.c ptlrpc_module.c service.c pinger.c recov_thread.c llog_net.c \
-    llog_client.c import.c ptlrpcd.c $(LDLMSOURCES)
+COMMON_SOURCES =  client.c recover.c connection.c niobuf.c pack_generic.c   \
+    events.c ptlrpc_module.c service.c pinger.c recov_thread.c llog_net.c   \
+    llog_client.c llog_server.c import.c ptlrpcd.c ptlrpc_internal.h        \
+    $(LDLM_COMM_SOURCES)
 
 if LIBLUSTRE
 
-lib_LIBRARIES = libptlrpc.a
+noinst_LIBRARIES = libptlrpc.a
+libptlrpc_a_CFLAGS = -fPIC
 libptlrpc_a_SOURCES = $(COMMON_SOURCES)
 
 else
@@ -26,8 +28,9 @@ MODULE = ptlrpc
 modulefs_DATA = ptlrpc.o
 EXTRA_PROGRAMS = ptlrpc
 
-ptlrpc_SOURCES = $(COMMON_SOURCES) lproc_ptlrpc.c ptlrpc_internal.h \
-    llog_server.c
+ptlrpc_SOURCES = $(top_srcdir)/ldlm/ldlm_flock.c $(COMMON_SOURCES) \
+    lproc_ptlrpc.c
+
 endif
 
 ptlrpc_DEPENDENCIES=symlinks
index fdc1b37..84c781d 100644 (file)
@@ -82,40 +82,42 @@ void ptlrpc_readdress_connection(struct ptlrpc_connection *conn,
         return;
 }
 
-static inline struct ptlrpc_bulk_desc *new_bulk(void)
+static inline struct ptlrpc_bulk_desc *new_bulk(int npages, int type, int portal)
 {
         struct ptlrpc_bulk_desc *desc;
 
-        OBD_ALLOC(desc, sizeof(*desc));
+        OBD_ALLOC(desc, offsetof (struct ptlrpc_bulk_desc, bd_iov[npages]));
         if (!desc)
                 return NULL;
 
         spin_lock_init(&desc->bd_lock);
         init_waitqueue_head(&desc->bd_waitq);
-        INIT_LIST_HEAD(&desc->bd_page_list);
+        desc->bd_max_pages = npages;
+        desc->bd_page_count = 0;
         desc->bd_md_h = PTL_HANDLE_NONE;
-        desc->bd_me_h = PTL_HANDLE_NONE;
-
+        desc->bd_portal = portal;
+        desc->bd_type = type;
+        
         return desc;
 }
 
 struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp (struct ptlrpc_request *req,
-                                               int type, int portal)
+                                               int npages, int type, int portal)
 {
         struct obd_import *imp = req->rq_import;
         struct ptlrpc_bulk_desc *desc;
 
         LASSERT(type == BULK_PUT_SINK || type == BULK_GET_SOURCE);
-
-        desc = new_bulk();
+        desc = new_bulk(npages, type, portal);
         if (desc == NULL)
                 RETURN(NULL);
 
         desc->bd_import_generation = req->rq_import_generation;
         desc->bd_import = class_import_get(imp);
         desc->bd_req = req;
-        desc->bd_type = type;
-        desc->bd_portal = portal;
+
+        desc->bd_cbid.cbid_fn  = client_bulk_callback;
+        desc->bd_cbid.cbid_arg = desc;
 
         /* This makes req own desc, and free it when she frees herself */
         req->rq_bulk = desc;
@@ -124,21 +126,22 @@ struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp (struct ptlrpc_request *req,
 }
 
 struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp (struct ptlrpc_request *req,
-                                               int type, int portal)
+                                               int npages, int type, int portal)
 {
         struct obd_export *exp = req->rq_export;
         struct ptlrpc_bulk_desc *desc;
 
         LASSERT(type == BULK_PUT_SOURCE || type == BULK_GET_SINK);
 
-        desc = new_bulk();
+        desc = new_bulk(npages, type, portal);
         if (desc == NULL)
                 RETURN(NULL);
 
         desc->bd_export = class_export_get(exp);
         desc->bd_req = req;
-        desc->bd_type = type;
-        desc->bd_portal = portal;
+
+        desc->bd_cbid.cbid_fn  = server_bulk_callback;
+        desc->bd_cbid.cbid_arg = desc;
 
         /* NB we don't assign rq_bulk here; server-side requests are
          * re-used, and the handler frees the bulk desc explicitly. */
@@ -146,66 +149,50 @@ struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp (struct ptlrpc_request *req,
         return desc;
 }
 
-int ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
-                          struct page *page, int pageoffset, int len)
+void ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
+                           struct page *page, int pageoffset, int len)
 {
-        struct ptlrpc_bulk_page *bulk;
-
-        OBD_ALLOC(bulk, sizeof(*bulk));
-        if (bulk == NULL)
-                return -ENOMEM;
-
+#ifdef __KERNEL__
+        ptl_kiov_t *kiov = &desc->bd_iov[desc->bd_page_count];
+#else
+        struct iovec *iov = &desc->bd_iov[desc->bd_page_count];
+#endif
+        LASSERT(desc->bd_page_count < desc->bd_max_pages);
         LASSERT(page != NULL);
         LASSERT(pageoffset >= 0);
         LASSERT(len > 0);
         LASSERT(pageoffset + len <= PAGE_SIZE);
 
-        bulk->bp_page = page;
-        bulk->bp_pageoffset = pageoffset;
-        bulk->bp_buflen = len;
-
-        bulk->bp_desc = desc;
-        list_add_tail(&bulk->bp_link, &desc->bd_page_list);
+#ifdef __KERNEL__
+        kiov->kiov_page   = page;
+        kiov->kiov_offset = pageoffset;
+        kiov->kiov_len    = len;
+#else
+        iov->iov_base = page->addr + pageoffset;
+        iov->iov_len  = len;
+#endif
         desc->bd_page_count++;
-        return 0;
+        desc->bd_nob += len;
 }
 
 void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc)
 {
-        struct list_head *tmp, *next;
         ENTRY;
 
         LASSERT(desc != NULL);
         LASSERT(desc->bd_page_count != 0x5a5a5a5a); /* not freed already */
         LASSERT(!desc->bd_network_rw);         /* network hands off or */
-
-        list_for_each_safe(tmp, next, &desc->bd_page_list) {
-                struct ptlrpc_bulk_page *bulk;
-                bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link);
-                ptlrpc_free_bulk_page(bulk);
-        }
-
-        LASSERT(desc->bd_page_count == 0);
         LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL));
-
         if (desc->bd_export)
                 class_export_put(desc->bd_export);
         else
                 class_import_put(desc->bd_import);
 
-        OBD_FREE(desc, sizeof(*desc));
+        OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc, 
+                                bd_iov[desc->bd_max_pages]));
         EXIT;
 }
 
-void ptlrpc_free_bulk_page(struct ptlrpc_bulk_page *bulk)
-{
-        LASSERT(bulk != NULL);
-
-        list_del(&bulk->bp_link);
-        bulk->bp_desc->bd_page_count--;
-        OBD_FREE(bulk, sizeof(*bulk));
-}
-
 struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode,
                                        int count, int *lengths, char **bufs)
 {
@@ -235,6 +222,13 @@ struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode,
         request->rq_send_state = LUSTRE_IMP_FULL;
         request->rq_type = PTL_RPC_MSG_REQUEST;
         request->rq_import = class_import_get(imp);
+
+        request->rq_req_cbid.cbid_fn  = request_out_callback;
+        request->rq_req_cbid.cbid_arg = request;
+
+        request->rq_reply_cbid.cbid_fn  = reply_in_callback;
+        request->rq_reply_cbid.cbid_arg = request;
+        
         request->rq_phase = RQ_PHASE_NEW;
 
         /* XXX FIXME bug 249 */
@@ -462,7 +456,6 @@ static int after_reply(struct ptlrpc_request *req)
         ENTRY;
 
         LASSERT(!req->rq_receiving_reply);
-        LASSERT(req->rq_replied);
 
         /* NB Until this point, the whole of the incoming message,
          * including buflens, status etc is in the sender's byte order. */
@@ -471,7 +464,8 @@ static int after_reply(struct ptlrpc_request *req)
         /* Clear reply swab mask; this is a new reply in sender's byte order */
         req->rq_rep_swab_mask = 0;
 #endif
-        rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen);
+        LASSERT (req->rq_nob_received <= req->rq_replen);
+        rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received);
         if (rc) {
                 CERROR("unpack_rep failed: %d\n", rc);
                 RETURN(-EPROTO);
@@ -658,6 +652,11 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set)
                 if (req->rq_phase == RQ_PHASE_RPC) {
                         if (req->rq_waiting || req->rq_resend) {
                                 int status;
+
+                                LASSERT (!ptlrpc_client_receiving_reply(req));
+                                LASSERT (req->rq_bulk == NULL ||
+                                         !ptlrpc_bulk_active(req->rq_bulk));
+
                                 spin_lock_irqsave(&imp->imp_lock, flags);
 
                                 if (ptlrpc_import_delay_req(imp, req, &status)) {
@@ -686,7 +685,7 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set)
                                         ptlrpc_unregister_reply(req);
                                         if (req->rq_bulk) {
                                                 __u64 old_xid = req->rq_xid;
-                                                ptlrpc_unregister_bulk(req);
+
                                                 /* ensure previous bulk fails */
                                                 req->rq_xid = ptlrpc_next_xid();
                                                 CDEBUG(D_HA, "resend bulk "
@@ -707,13 +706,13 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set)
                                 force_timer_recalc = 1;
                         }
 
-                        /* Ensure the network callback returned */
-                        spin_lock_irqsave (&req->rq_lock, flags);
-                        if (!req->rq_replied) {
-                                spin_unlock_irqrestore (&req->rq_lock, flags);
+                        /* Still waiting for a reply? */
+                        if (ptlrpc_client_receiving_reply(req))
+                                continue;
+
+                        /* Did we actually receive a reply? */
+                        if (!ptlrpc_client_replied(req))
                                 continue;
-                        }
-                        spin_unlock_irqrestore (&req->rq_lock, flags);
 
                         spin_lock_irqsave(&imp->imp_lock, flags);
                         list_del_init(&req->rq_list);
@@ -745,9 +744,18 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set)
                 }
 
                 LASSERT(req->rq_phase == RQ_PHASE_BULK);
-                if (!ptlrpc_bulk_complete (req->rq_bulk))
+                if (ptlrpc_bulk_active(req->rq_bulk))
                         continue;
 
+                if (!req->rq_bulk->bd_success) {
+                        /* The RPC reply arrived OK, but the bulk screwed
+                         * up!  Dead weird since the server told us the RPC
+                         * was good after getting the REPLY for her GET or
+                         * the ACK for her PUT. */
+                        DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
+                        LBUG();
+                }
+
                 req->rq_phase = RQ_PHASE_INTERPRET;
 
         interpret:
@@ -796,6 +804,9 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req)
 
         ptlrpc_unregister_reply (req);
 
+        if (req->rq_bulk != NULL)
+                ptlrpc_unregister_bulk (req);
+
         if (imp == NULL) {
                 DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?");
                 RETURN(1);
@@ -926,7 +937,8 @@ int ptlrpc_set_wait(struct ptlrpc_request_set *set)
         LASSERT(!list_empty(&set->set_requests));
         list_for_each(tmp, &set->set_requests) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
-                (void)ptlrpc_send_new_req(req);
+                if (req->rq_phase == RQ_PHASE_NEW)
+                        (void)ptlrpc_send_new_req(req);
         }
 
         do {
@@ -981,6 +993,7 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
         }
 
         LASSERT(!request->rq_receiving_reply);
+        LASSERT(request->rq_rqbd == NULL);    /* client-side */
 
         /* We must take it off the imp_replay_list first.  Otherwise, we'll set
          * request->rq_reqmsg to NULL while osc_close is dereferencing it. */
@@ -1073,67 +1086,39 @@ void ptlrpc_req_finished(struct ptlrpc_request *request)
  */
 void ptlrpc_unregister_reply (struct ptlrpc_request *request)
 {
-        unsigned long flags;
-        int           rc;
-        ENTRY;
+        int                rc;
+        wait_queue_head_t *wq;
+        struct l_wait_info lwi;
 
         LASSERT(!in_interrupt ());             /* might sleep */
 
-        spin_lock_irqsave (&request->rq_lock, flags);
-        if (!request->rq_receiving_reply) {     /* not waiting for a reply */
-                spin_unlock_irqrestore (&request->rq_lock, flags);
-                EXIT;
-                /* NB reply buffer not freed here */
+        if (!ptlrpc_client_receiving_reply(request))
                 return;
-        }
-
-        LASSERT(!request->rq_replied);         /* callback hasn't completed */
-        spin_unlock_irqrestore (&request->rq_lock, flags);
 
         rc = PtlMDUnlink (request->rq_reply_md_h);
-        switch (rc) {
-        default:
-                LBUG ();
-
-        case PTL_OK:                            /* unlinked before completion */
-                LASSERT(request->rq_receiving_reply);
-                LASSERT(!request->rq_replied);
-                spin_lock_irqsave (&request->rq_lock, flags);
-                request->rq_receiving_reply = 0;
-                spin_unlock_irqrestore (&request->rq_lock, flags);
-                OBD_FREE(request->rq_repmsg, request->rq_replen);
-                request->rq_repmsg = NULL;
-                EXIT;
+        if (rc == PTL_INV_MD) {
+                LASSERT (!ptlrpc_client_receiving_reply(request));
                 return;
+        }
+        
+        LASSERT (rc == PTL_OK);
 
-        case PTL_MD_INUSE:                      /* callback in progress */
-                for (;;) {
-                        /* Network access will complete in finite time but
-                         * the timeout lets us CERROR for visibility */
-                        struct l_wait_info lwi = LWI_TIMEOUT(10*HZ, NULL, NULL);
-
-                        rc = l_wait_event (request->rq_reply_waitq,
-                                           request->rq_replied, &lwi);
-                        LASSERT(rc == 0 || rc == -ETIMEDOUT);
-                        if (rc == 0) {
-                                spin_lock_irqsave (&request->rq_lock, flags);
-                                /* Ensure the callback has completed scheduling
-                                 * me and taken its hands off the request */
-                                spin_unlock_irqrestore(&request->rq_lock,flags);
-                                break;
-                        }
-
-                        CERROR ("Unexpectedly long timeout: req %p\n", request);
-                }
-                /* fall through */
-
-        case PTL_INV_MD:                        /* callback completed */
-                LASSERT(!request->rq_receiving_reply);
-                LASSERT(request->rq_replied);
-                EXIT;
-                return;
+        if (request->rq_set != NULL)
+                wq = &request->rq_set->set_waitq;
+        else
+                wq = &request->rq_reply_waitq;
+
+        for (;;) {
+                /* Network access will complete in finite time but the HUGE
+                 * timeout lets us CWARN for visibility of sluggish NALs */
+                lwi = LWI_TIMEOUT(300 * HZ, NULL, NULL);
+                rc = l_wait_event (*wq, !ptlrpc_client_receiving_reply(request), &lwi);
+                if (rc == 0)
+                        return;
+
+                LASSERT (rc == -ETIMEDOUT);
+                DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout");
         }
-        /* Not Reached */
 }
 
 /* caller must hold imp->imp_lock */
@@ -1207,11 +1192,17 @@ void ptlrpc_resend_req(struct ptlrpc_request *req)
         spin_lock_irqsave (&req->rq_lock, flags);
         req->rq_resend = 1;
         req->rq_timedout = 0;
-        if (req->rq_set != NULL)
-                wake_up (&req->rq_set->set_waitq);
-        else
-                wake_up(&req->rq_reply_waitq);
+        if (req->rq_bulk) {
+                __u64 old_xid = req->rq_xid;
+                
+                /* ensure previous bulk fails */
+                req->rq_xid = ptlrpc_next_xid();
+                CDEBUG(D_HA, "resend bulk old x"LPU64" new x"LPU64"\n",
+                       old_xid, req->rq_xid);
+        }
+        ptlrpc_wake_client_req(req);
         spin_unlock_irqrestore (&req->rq_lock, flags);
+
 }
 
 /* XXX: this function and rq_status are currently unused */
@@ -1225,10 +1216,7 @@ void ptlrpc_restart_req(struct ptlrpc_request *req)
         spin_lock_irqsave (&req->rq_lock, flags);
         req->rq_restart = 1;
         req->rq_timedout = 0;
-        if (req->rq_set != NULL)
-                wake_up (&req->rq_set->set_waitq);
-        else
-                wake_up(&req->rq_reply_waitq);
+        ptlrpc_wake_client_req(req);
         spin_unlock_irqrestore (&req->rq_lock, flags);
 }
 
@@ -1456,15 +1444,24 @@ restart:
 
  out:
         if (req->rq_bulk != NULL) {
-                if (rc >= 0) {                  /* success so far */
+                if (rc >= 0) {                  
+                        /* success so far.  Note that anything going wrong
+                         * with bulk now is EXTREMELY strange, since the
+                         * server must have believed that the bulk
+                         * transferred OK before she replied with success to
+                         * me. */
                         lwi = LWI_TIMEOUT(timeout, NULL, NULL);
                         brc = l_wait_event(req->rq_reply_waitq,
-                                           ptlrpc_bulk_complete(req->rq_bulk),
+                                           !ptlrpc_bulk_active(req->rq_bulk),
                                            &lwi);
+                        LASSERT(brc == 0 || brc == -ETIMEDOUT);
                         if (brc != 0) {
                                 LASSERT(brc == -ETIMEDOUT);
-                                CERROR ("Timed out waiting for bulk\n");
+                                DEBUG_REQ(D_ERROR, req, "bulk timed out");
                                 rc = brc;
+                        } else if (!req->rq_bulk->bd_success) {
+                                DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
+                                rc = -EIO;
                         }
                 }
                 if (rc < 0)
@@ -1499,7 +1496,8 @@ static int ptlrpc_replay_interpret(struct ptlrpc_request *req,
         /* Clear reply swab mask; this is a new reply in sender's byte order */
         req->rq_rep_swab_mask = 0;
 #endif
-        rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen);
+        LASSERT (req->rq_nob_received <= req->rq_replen);
+        rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received);
         if (rc) {
                 CERROR("unpack_rep failed: %d\n", rc);
                 GOTO(out, rc = -EPROTO);
@@ -1607,10 +1605,7 @@ void ptlrpc_abort_inflight(struct obd_import *imp)
                 spin_lock (&req->rq_lock);
                 if (req->rq_import_generation < imp->imp_generation) {
                         req->rq_err = 1;
-                        if (req->rq_set != NULL)
-                                wake_up(&req->rq_set->set_waitq);
-                        else
-                                wake_up(&req->rq_reply_waitq);
+                        ptlrpc_wake_client_req(req);
                 }
                 spin_unlock (&req->rq_lock);
         }
@@ -1624,10 +1619,7 @@ void ptlrpc_abort_inflight(struct obd_import *imp)
                 spin_lock (&req->rq_lock);
                 if (req->rq_import_generation < imp->imp_generation) {
                         req->rq_err = 1;
-                        if (req->rq_set != NULL)
-                                wake_up(&req->rq_set->set_waitq);
-                        else
-                                wake_up(&req->rq_reply_waitq);
+                        ptlrpc_wake_client_req(req);
                 }
                 spin_unlock (&req->rq_lock);
         }
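The new_bulk() hunk above replaces the fixed-size descriptor plus bd_page_list with a single variable-length allocation sized by offsetof over bd_iov[], freed with the same expression in ptlrpc_free_bulk(). A minimal, self-contained illustration of that allocation pattern follows; it is plain C written for this note, not Lustre code, the struct and names are invented for the example, and the offsetof-with-variable-index form relies on the same GNU C behaviour the kernel code uses.

        #include <stdio.h>
        #include <stdlib.h>
        #include <stddef.h>

        struct frag { void *page; int offset; int len; };

        /* toy stand-in for ptlrpc_bulk_desc: fixed header, trailing array */
        struct bulk_desc {
                int         max_pages;
                int         page_count;
                struct frag iov[0];     /* sized at allocation time */
        };

        static struct bulk_desc *new_bulk(int npages)
        {
                /* one allocation covers the header plus npages fragments,
                 * mirroring OBD_ALLOC(desc, offsetof(..., bd_iov[npages])) */
                struct bulk_desc *desc =
                        calloc(1, offsetof(struct bulk_desc, iov[npages]));

                if (desc != NULL)
                        desc->max_pages = npages;
                return desc;
        }

        int main(void)
        {
                struct bulk_desc *desc = new_bulk(4);

                if (desc == NULL)
                        return 1;
                printf("descriptor holds up to %d fragments\n", desc->max_pages);
                /* free with the same size expression the allocation used */
                free(desc);
                return 0;
        }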
index e91d7a3..b2aa6b9 100644 (file)
 struct ptlrpc_ni  ptlrpc_interfaces[NAL_MAX_NR];
 int               ptlrpc_ninterfaces;
 
-/*
- *  Free the packet when it has gone out
+/*  
+ *  Client's outgoing request callback
  */
-static int request_out_callback(ptl_event_t *ev)
+void request_out_callback(ptl_event_t *ev)
 {
-        struct ptlrpc_request *req = ev->mem_desc.user_ptr;
+        struct ptlrpc_cb_id   *cbid = ev->mem_desc.user_ptr;
+        struct ptlrpc_request *req = cbid->cbid_arg;
+        unsigned long          flags;
         ENTRY;
 
-        /* requests always contiguous */
-        LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0);
-
-        if (ev->type != PTL_EVENT_SENT) {
-                // XXX make sure we understand all events, including ACK's
-                CERROR("Unknown event %d\n", ev->type);
-                LBUG();
-        }
+        LASSERT (ev->type == PTL_EVENT_SENT ||
+                 ev->type == PTL_EVENT_UNLINK);
+        LASSERT (ev->unlinked);
 
-        /* this balances the atomic_inc in ptl_send_rpc() */
-        ptlrpc_req_finished(req);
-        RETURN(1);
-}
+        DEBUG_REQ((ev->status == PTL_OK) ? D_NET : D_ERROR, req,
+                  "type %d, status %d", ev->type, ev->status);
 
-/*
- *  Free the packet when it has gone out
- */
-static int reply_out_callback(ptl_event_t *ev)
-{
-        struct ptlrpc_request *req = ev->mem_desc.user_ptr;
-        unsigned long          flags;
-        ENTRY;
+        if (ev->type == PTL_EVENT_UNLINK ||
+            ev->status != PTL_OK) {
 
-        /* replies always contiguous */
-        LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0);
+                /* Failed send: make it seem like the reply timed out, just
+                 * like failing sends in client.c does currently...  */
 
-        if (ev->type == PTL_EVENT_SENT) {
-                /* NB don't even know if this is the current reply! In fact
-                 * we can't touch any state in the request, since the
-                 * service handler zeros it on each incoming request. */
-                OBD_FREE(ev->mem_desc.start, ev->mem_desc.length);
-        } else if (ev->type == PTL_EVENT_ACK) {
-                LASSERT(req->rq_want_ack);
                 spin_lock_irqsave(&req->rq_lock, flags);
-                req->rq_want_ack = 0;
-                wake_up(&req->rq_reply_waitq);
+                req->rq_timeout = 0;
                 spin_unlock_irqrestore(&req->rq_lock, flags);
-        } else {
-                // XXX make sure we understand all events
-                CERROR("Unknown event %d\n", ev->type);
-                LBUG();
+                
+                ptlrpc_wake_client_req(req);
         }
 
-        RETURN(1);
+        /* this balances the atomic_inc in ptl_send_rpc() */
+        ptlrpc_req_finished(req);
+        EXIT;
 }
 
 /*
- * Wake up the thread waiting for the reply once it comes in.
+ * Client's incoming reply callback
  */
-int reply_in_callback(ptl_event_t *ev)
+void reply_in_callback(ptl_event_t *ev)
 {
-        struct ptlrpc_request *req = ev->mem_desc.user_ptr;
+        struct ptlrpc_cb_id   *cbid = ev->mem_desc.user_ptr;
+        struct ptlrpc_request *req = cbid->cbid_arg;
         unsigned long flags;
         ENTRY;
 
-        /* replies always contiguous */
-        LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0);
-
-        if (req->rq_xid == 0x5a5a5a5a5a5a5a5aULL) {
-                CERROR("Reply received for freed request!  Probably a missing "
-                       "ptlrpc_abort()\n");
-                LBUG();
-        }
+        LASSERT (ev->type == PTL_EVENT_PUT ||
+                 ev->type == PTL_EVENT_UNLINK);
+        LASSERT (ev->unlinked);
+        LASSERT (ev->mem_desc.start == req->rq_repmsg);
+        LASSERT (ev->offset == 0);
+        LASSERT (ev->mlength <= req->rq_replen);
+        
+        DEBUG_REQ((ev->status == PTL_OK) ? D_NET : D_ERROR, req,
+                  "type %d, status %d", ev->type, ev->status);
 
-        if (req->rq_xid != ev->match_bits) {
-                CERROR("Reply packet for wrong request\n");
-                LBUG();
-        }
+        spin_lock_irqsave (&req->rq_lock, flags);
 
-        if (ev->type == PTL_EVENT_PUT) {
-                /* Bug 1190: should handle non-zero offset as a protocol
-                 * error  */
-                LASSERT (ev->offset == 0);
+        LASSERT (req->rq_receiving_reply);
+        req->rq_receiving_reply = 0;
 
-                spin_lock_irqsave (&req->rq_lock, flags);
-                LASSERT (req->rq_receiving_reply);
-                req->rq_receiving_reply = 0;
+        if (ev->type == PTL_EVENT_PUT &&
+            ev->status == PTL_OK) {
                 req->rq_replied = 1;
-                if (req->rq_set != NULL)
-                        wake_up(&req->rq_set->set_waitq);
-                else
-                        wake_up(&req->rq_reply_waitq);
-                spin_unlock_irqrestore (&req->rq_lock, flags);
-        } else {
-                // XXX make sure we understand all events, including ACKs
-                CERROR("Unknown event %d\n", ev->type);
-                LBUG();
-        }
-
-        RETURN(1);
-}
-
-int request_in_callback(ptl_event_t *ev)
-{
-        struct ptlrpc_request_buffer_desc *rqbd = ev->mem_desc.user_ptr;
-        struct ptlrpc_srv_ni  *srv_ni = rqbd->rqbd_srv_ni;
-        struct ptlrpc_service *service = srv_ni->sni_service;
-
-        /* requests always contiguous */
-        LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0);
-        /* we only enable puts */
-        LASSERT(ev->type == PTL_EVENT_PUT);
-        LASSERT(atomic_read(&srv_ni->sni_nrqbds_receiving) > 0);
-        LASSERT(atomic_read(&rqbd->rqbd_refcount) > 0);
-
-        if (ev->rlength != ev->mlength)
-                CERROR("Warning: Possibly truncated rpc (%d/%d)\n",
-                       ev->mlength, ev->rlength);
-
-        if (!PtlHandleEqual (ev->unlinked_me, PTL_HANDLE_NONE)) {
-                /* This is the last request to be received into this
-                 * request buffer.  We don't bump the refcount, since the
-                 * thread servicing this event is effectively taking over
-                 * portals' reference.
-                 */
-                /* NB ev->unlinked_me.nal_idx is not set properly in a callback */
-                LASSERT(ev->unlinked_me.cookie==rqbd->rqbd_me_h.cookie);
-
-                /* we're off the air */
-                /* we'll probably start dropping packets in portals soon */
-                if (atomic_dec_and_test(&srv_ni->sni_nrqbds_receiving))
-                        CERROR("All request buffers busy\n");
-        } else {
-                /* +1 ref for service thread */
-                atomic_inc(&rqbd->rqbd_refcount);
+                req->rq_nob_received = ev->mlength;
         }
 
-        wake_up(&service->srv_waitq);
+        /* NB don't unlock till after wakeup; req can disappear under us
+         * since we don't have our own ref */
+        ptlrpc_wake_client_req(req);
 
-        return 0;
+        spin_unlock_irqrestore (&req->rq_lock, flags);
+        EXIT;
 }
 
-static int bulk_put_source_callback(ptl_event_t *ev)
+/* 
+ * Client's bulk has been written/read
+ */
+void client_bulk_callback (ptl_event_t *ev)
 {
+        struct ptlrpc_cb_id     *cbid = ev->mem_desc.user_ptr;
+        struct ptlrpc_bulk_desc *desc = cbid->cbid_arg;
         unsigned long            flags;
-        struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr;
         ENTRY;
 
-        CDEBUG(D_NET, "got %s event %d\n",
-               (ev->type == PTL_EVENT_SENT) ? "SENT" :
-               (ev->type == PTL_EVENT_ACK)  ? "ACK"  : "UNEXPECTED", ev->type);
+        LASSERT ((desc->bd_type == BULK_PUT_SINK && 
+                  ev->type == PTL_EVENT_PUT) ||
+                 (desc->bd_type == BULK_GET_SOURCE &&
+                  ev->type == PTL_EVENT_GET) ||
+                 ev->type == PTL_EVENT_UNLINK);
+        LASSERT (ev->unlinked);
 
-        LASSERT(ev->type == PTL_EVENT_SENT || ev->type == PTL_EVENT_ACK);
-
-        /* 1 fragment for each page always */
-        LASSERT(ev->mem_desc.niov == desc->bd_page_count);
+        CDEBUG((ev->status == PTL_OK) ? D_NET : D_ERROR,
+               "event type %d, status %d, desc %p\n", 
+               ev->type, ev->status, desc);
 
         spin_lock_irqsave (&desc->bd_lock, flags);
-        
-        LASSERT(desc->bd_callback_count > 0 &&
-                desc->bd_callback_count <= 2);
-        
-        if (--desc->bd_callback_count == 0) {
-                desc->bd_network_rw = 0;
-                desc->bd_complete = 1;
-                wake_up(&desc->bd_waitq);
+
+        LASSERT(desc->bd_network_rw);
+        desc->bd_network_rw = 0;
+
+        if (ev->type != PTL_EVENT_UNLINK &&
+            ev->status == PTL_OK) {
+                desc->bd_success = 1;
+                desc->bd_nob_transferred = ev->mlength;
         }
 
+        /* NB don't unlock till after wakeup; desc can disappear under us
+         * otherwise */
+        ptlrpc_wake_client_req(desc->bd_req);
+
         spin_unlock_irqrestore (&desc->bd_lock, flags);
-        RETURN(0);
+        EXIT;
 }
 
-struct ptlrpc_bulk_desc ptlrpc_bad_desc;
-ptl_event_t ptlrpc_bad_event;
-
-static int bulk_put_sink_callback(ptl_event_t *ev)
+/* 
+ * Server's incoming request callback
+ */
+void request_in_callback(ptl_event_t *ev)
 {
-        struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr;
-        unsigned long            flags;
+        struct ptlrpc_cb_id               *cbid = ev->mem_desc.user_ptr;
+        struct ptlrpc_request_buffer_desc *rqbd = cbid->cbid_arg;
+        struct ptlrpc_srv_ni              *srv_ni = rqbd->rqbd_srv_ni;
+        struct ptlrpc_service             *service = srv_ni->sni_service;
+        struct ptlrpc_request             *req;
+        long                               flags;
         ENTRY;
 
-        LASSERT(ev->type == PTL_EVENT_PUT);
-
-        /* used iovs */
-        LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) ==
-                PTL_MD_KIOV);
-        /* Honestly, it's best to find out early. */
-        if (desc->bd_page_count == 0x5a5a5a5a ||
-            desc->bd_page_count != ev->mem_desc.niov ||
-            ev->mem_desc.start != &desc->bd_iov) {
-                /* not guaranteed (don't LASSERT) but good for this bug hunt */
-                ptlrpc_bad_event = *ev;
-                ptlrpc_bad_desc = *desc;
-                CERROR ("XXX ev %p type %d portal %d match "LPX64", seq %ld\n",
-                        ev, ev->type, ev->portal, ev->match_bits, ev->sequence);
-                CERROR ("XXX desc %p, export %p import %p gen %d "
-                        " portal %d\n", 
-                        desc, desc->bd_export,
-                        desc->bd_import, desc->bd_import_generation,
-                        desc->bd_portal);
-                RETURN (0);
+        LASSERT (ev->type == PTL_EVENT_PUT ||
+                 ev->type == PTL_EVENT_UNLINK);
+        LASSERT ((char *)ev->mem_desc.start >= rqbd->rqbd_buffer);
+        LASSERT ((char *)ev->mem_desc.start + ev->offset + ev->mlength <=
+                 rqbd->rqbd_buffer + service->srv_buf_size);
+
+        CDEBUG((ev->status == PTL_OK) ? D_NET : D_ERROR,
+               "event type %d, status %d, service %s\n", 
+               ev->type, ev->status, service->srv_name);
+
+        if (ev->unlinked) {
+                /* If this is the last request message to fit in the
+                 * request buffer we can use the request object embedded in
+                 * rqbd.  Note that if we failed to allocate a request,
+                 * we'd have to re-post the rqbd, which we can't do in this
+                 * context. */
+                req = &rqbd->rqbd_req;
+                memset(req, 0, sizeof (*req));
+        } else {
+                LASSERT (ev->type == PTL_EVENT_PUT);
+                if (ev->status != PTL_OK) {
+                        /* We moaned above already... */
+                        return;
+                }
+                OBD_ALLOC_GFP(req, sizeof(*req), GFP_ATOMIC);
+                if (req == NULL) {
+                        CERROR("Can't allocate incoming request descriptor: "
+                               "Dropping %s RPC from "LPX64"\n",
+                               service->srv_name, ev->initiator.nid);
+                        return;
+                }
         }
-        
-        LASSERT(desc->bd_page_count != 0x5a5a5a5a);
-        /* 1 fragment for each page always */
-        LASSERT(ev->mem_desc.niov == desc->bd_page_count);
-        LASSERT(ev->match_bits == desc->bd_req->rq_xid);
-        
-        /* peer must put with zero offset */
-        if (ev->offset != 0) {
-                /* Bug 1190: handle this as a protocol failure */
-                CERROR ("Bad offset %d\n", ev->offset);
-                LBUG ();
+
+        /* NB we ABSOLUTELY RELY on req being zeroed, so pointers are NULL,
+         * flags are reset and scalars are zero.  We only set the message
+         * size to non-zero if this was a successful receive. */
+        req->rq_xid = ev->match_bits;
+        req->rq_reqmsg = ev->mem_desc.start + ev->offset;
+        if (ev->type == PTL_EVENT_PUT &&
+            ev->status == PTL_OK)
+                req->rq_reqlen = ev->mlength;
+        req->rq_arrival_time = ev->arrival_time;
+        req->rq_peer.peer_nid = ev->initiator.nid;
+        req->rq_peer.peer_ni = rqbd->rqbd_srv_ni->sni_ni;
+        req->rq_rqbd = rqbd;
+
+        spin_lock_irqsave (&service->srv_lock, flags);
+
+        if (ev->unlinked) {
+                srv_ni->sni_nrqbd_receiving--;
+                if (ev->type != PTL_EVENT_UNLINK &&
+                    srv_ni->sni_nrqbd_receiving == 0) {
+                        /* This service is off-air on this interface because
+                         * all its request buffers are busy.  Portals will
+                         * start dropping incoming requests until more buffers
+                         * get posted.  NB don't moan if it's because we're
+                         * tearing down the service. */
+                        CWARN("All %s %s request buffers busy\n",
+                              service->srv_name, srv_ni->sni_ni->pni_name);
+                }
+                /* req takes over the network's ref on rqbd */
+        } else {
+                /* req takes a ref on rqbd */
+                rqbd->rqbd_refcount++;
         }
 
-        /* No check for total # bytes; this could be a short read */
+        list_add_tail(&req->rq_list, &service->srv_request_queue);
+        service->srv_n_queued_reqs++;
+        rqbd->rqbd_eventcount++;
 
-        spin_lock_irqsave (&desc->bd_lock, flags);
-        desc->bd_network_rw = 0;
-        desc->bd_complete = 1;
-        if (desc->bd_req->rq_set != NULL)
-                wake_up (&desc->bd_req->rq_set->set_waitq);
-        else
-                wake_up (&desc->bd_req->rq_reply_waitq);
-        spin_unlock_irqrestore (&desc->bd_lock, flags);
+        /* NB everything can disappear under us once the request
+         * has been queued and we unlock, so do the wake now... */
+        wake_up(&service->srv_waitq);
 
-        RETURN(1);
+        spin_unlock_irqrestore(&service->srv_lock, flags);
+        EXIT;
 }
 
-static int bulk_get_source_callback(ptl_event_t *ev)
+/*  
+ *  Server's outgoing reply callback
+ */
+void reply_out_callback(ptl_event_t *ev)
 {
-        struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr;
-        struct ptlrpc_bulk_page *bulk;
-        struct list_head        *tmp;
-        unsigned long            flags;
-        ptl_size_t               total = 0;
+        struct ptlrpc_cb_id       *cbid = ev->mem_desc.user_ptr;
+        struct ptlrpc_reply_state *rs = cbid->cbid_arg;
+        struct ptlrpc_srv_ni      *sni = rs->rs_srv_ni;
+        struct ptlrpc_service     *svc = sni->sni_service;
+        unsigned long              flags;
         ENTRY;
 
-        LASSERT(ev->type == PTL_EVENT_GET);
-
-        /* used iovs */
-        LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) ==
-                PTL_MD_KIOV);
-        /* 1 fragment for each page always */
-        LASSERT(ev->mem_desc.niov == desc->bd_page_count);
-        LASSERT(ev->match_bits == desc->bd_req->rq_xid);
-
-        /* peer must get with zero offset */
-        if (ev->offset != 0) {
-                /* Bug 1190: handle this as a protocol failure */
-                CERROR ("Bad offset %d\n", ev->offset);
-                LBUG ();
+        LASSERT (ev->type == PTL_EVENT_SENT ||
+                 ev->type == PTL_EVENT_ACK ||
+                 ev->type == PTL_EVENT_UNLINK);
+
+        if (!rs->rs_difficult) {
+                /* I'm totally responsible for freeing "easy" replies */
+                LASSERT (ev->unlinked);
+                lustre_free_reply_state (rs);
+                atomic_dec (&svc->srv_outstanding_replies);
+                EXIT;
+                return;
         }
-        
-        list_for_each (tmp, &desc->bd_page_list) {
-                bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link);
 
-                total += bulk->bp_buflen;
-        }
+        LASSERT (rs->rs_on_net);
 
-        /* peer must get everything */
-        if (ev->mem_desc.length != total) {
-                /* Bug 1190: handle this as a protocol failure */
-                CERROR ("Bad length/total %d/%d\n", ev->mem_desc.length, total);
-                LBUG ();
+        if (ev->unlinked) {
+                /* Last network callback */
+                spin_lock_irqsave (&svc->srv_lock, flags);
+                rs->rs_on_net = 0;
+                ptlrpc_schedule_difficult_reply (rs);
+                spin_unlock_irqrestore (&svc->srv_lock, flags);
         }
 
-        spin_lock_irqsave (&desc->bd_lock, flags);
-        desc->bd_network_rw = 0;
-        desc->bd_complete = 1;
-        if (desc->bd_req->rq_set != NULL)
-                wake_up (&desc->bd_req->rq_set->set_waitq);
-        else
-                wake_up (&desc->bd_req->rq_reply_waitq);
-        spin_unlock_irqrestore (&desc->bd_lock, flags);
-
-        RETURN(1);
+        EXIT;
 }
 
-static int bulk_get_sink_callback(ptl_event_t *ev)
+/*
+ * Server's bulk completion callback
+ */
+void server_bulk_callback (ptl_event_t *ev)
 {
-        struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr;
+        struct ptlrpc_cb_id     *cbid = ev->mem_desc.user_ptr;
+        struct ptlrpc_bulk_desc *desc = cbid->cbid_arg;
         unsigned long            flags;
         ENTRY;
 
-        CDEBUG(D_NET, "got %s event %d desc %p\n",
-               (ev->type == PTL_EVENT_SENT) ? "SENT" :
-               (ev->type == PTL_EVENT_REPLY)  ? "REPLY"  : "UNEXPECTED",
-               ev->type, desc);
+        LASSERT (ev->type == PTL_EVENT_SENT ||
+                 ev->type == PTL_EVENT_UNLINK ||
+                 (desc->bd_type == BULK_PUT_SOURCE &&
+                  ev->type == PTL_EVENT_ACK) ||
+                 (desc->bd_type == BULK_GET_SINK &&
+                  ev->type == PTL_EVENT_REPLY));
 
-        LASSERT(ev->type == PTL_EVENT_SENT || ev->type == PTL_EVENT_REPLY);
-
-        /* 1 fragment for each page always */
-        LASSERT(ev->mem_desc.niov == desc->bd_page_count);
+        CDEBUG((ev->status == PTL_OK) ? D_NET : D_ERROR,
+               "event type %d, status %d, desc %p\n", 
+               ev->type, ev->status, desc);
 
         spin_lock_irqsave (&desc->bd_lock, flags);
-        LASSERT(desc->bd_callback_count > 0 &&
-                desc->bd_callback_count <= 2);
+        
+        if ((ev->type == PTL_EVENT_ACK ||
+             ev->type == PTL_EVENT_REPLY) &&
+            ev->status == PTL_OK) {
+                /* We heard back from the peer, so even if we get this
+                 * before the SENT event (oh yes we can), we know we
+                 * read/wrote the peer buffer and how much... */
+                desc->bd_success = 1;
+                desc->bd_nob_transferred = ev->mlength;
+        }
 
-        if (--desc->bd_callback_count == 0) {
+        if (ev->unlinked) {
+                /* This is the last callback no matter what... */
                 desc->bd_network_rw = 0;
-                desc->bd_complete = 1;
                 wake_up(&desc->bd_waitq);
         }
+
         spin_unlock_irqrestore (&desc->bd_lock, flags);
+        EXIT;
+}
+
+static int ptlrpc_master_callback(ptl_event_t *ev)
+{
+        struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr;
+        void (*callback)(ptl_event_t *ev) = cbid->cbid_fn;
 
-        RETURN(0);
+        /* Honestly, it's best to find out early. */
+        LASSERT (cbid->cbid_arg != (void *)0x5a5a5a5a5a5a5a5a);
+        LASSERT (callback == request_out_callback ||
+                 callback == reply_in_callback ||
+                 callback == client_bulk_callback ||
+                 callback == request_in_callback ||
+                 callback == reply_out_callback ||
+                 callback == server_bulk_callback);
+        
+        callback (ev);
+        return (0);
 }
 
 int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, struct ptlrpc_peer *peer)
@@ -368,14 +365,7 @@ int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, struct ptlrpc_peer *peer)
 
 void ptlrpc_ni_fini(struct ptlrpc_ni *pni)
 {
-        PtlEQFree(pni->pni_request_out_eq_h);
-        PtlEQFree(pni->pni_reply_out_eq_h);
-        PtlEQFree(pni->pni_reply_in_eq_h);
-        PtlEQFree(pni->pni_bulk_put_source_eq_h);
-        PtlEQFree(pni->pni_bulk_put_sink_eq_h);
-        PtlEQFree(pni->pni_bulk_get_source_eq_h);
-        PtlEQFree(pni->pni_bulk_get_sink_eq_h);
-
+        PtlEQFree(pni->pni_eq_h);
         kportal_put_ni (pni->pni_number);
 }
 
@@ -395,51 +385,18 @@ int ptlrpc_ni_init(int number, char *name, struct ptlrpc_ni *pni)
         pni->pni_number = number;
         pni->pni_ni_h = *nip;
 
-        pni->pni_request_out_eq_h = PTL_HANDLE_NONE;
-        pni->pni_reply_out_eq_h = PTL_HANDLE_NONE;
-        pni->pni_reply_in_eq_h = PTL_HANDLE_NONE;
-        pni->pni_bulk_put_source_eq_h = PTL_HANDLE_NONE;
-        pni->pni_bulk_put_sink_eq_h = PTL_HANDLE_NONE;
-        pni->pni_bulk_get_source_eq_h = PTL_HANDLE_NONE;
-        pni->pni_bulk_get_sink_eq_h = PTL_HANDLE_NONE;
-
-        /* NB We never actually PtlEQGet() out of these events queues since
-         * we're only interested in the event callback, so we can just let
-         * them wrap.  Their sizes aren't a big deal, apart from providing
-         * a little history for debugging... */
-
-        rc = PtlEQAlloc(pni->pni_ni_h, 1024, request_out_callback,
-                        &pni->pni_request_out_eq_h);
-        if (rc != PTL_OK)
-                GOTO (fail, rc = -ENOMEM);
-
-        rc = PtlEQAlloc(pni->pni_ni_h, 1024, reply_out_callback,
-                        &pni->pni_reply_out_eq_h);
-        if (rc != PTL_OK)
-                GOTO (fail, rc = -ENOMEM);
-
-        rc = PtlEQAlloc(pni->pni_ni_h, 1024, reply_in_callback,
-                        &pni->pni_reply_in_eq_h);
-        if (rc != PTL_OK)
-                GOTO (fail, rc = -ENOMEM);
-
-        rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_put_source_callback,
-                        &pni->pni_bulk_put_source_eq_h);
-        if (rc != PTL_OK)
-                GOTO (fail, rc = -ENOMEM);
+        pni->pni_eq_h = PTL_HANDLE_NONE;
 
-        rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_put_sink_callback,
-                        &pni->pni_bulk_put_sink_eq_h);
-        if (rc != PTL_OK)
-                GOTO (fail, rc = -ENOMEM);
-
-        rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_get_source_callback,
-                        &pni->pni_bulk_get_source_eq_h);
-        if (rc != PTL_OK)
-                GOTO (fail, rc = -ENOMEM);
-
-        rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_get_sink_callback,
-                        &pni->pni_bulk_get_sink_eq_h);
+#ifdef __KERNEL__
+        /* kernel: portals calls the callback when the event is added to the
+         * queue, so we don't care if we lose events */
+        rc = PtlEQAlloc(pni->pni_ni_h, 1024, ptlrpc_master_callback,
+                        &pni->pni_eq_h);
+#else
+        /* liblustre: no asynchronous callback, so allocate a nice big event
+         * queue to avoid dropping events... */
+        rc = PtlEQAlloc(pni->pni_ni_h, 10240, NULL, &pni->pni_eq_h);
+#endif
         if (rc != PTL_OK)
                 GOTO (fail, rc = -ENOMEM);
 
@@ -454,18 +411,42 @@ int ptlrpc_ni_init(int number, char *name, struct ptlrpc_ni *pni)
 }
 
 #ifndef __KERNEL__
+LIST_HEAD(liblustre_wait_callbacks);
+void *liblustre_services_callback;
+
+void *
+liblustre_register_wait_callback (int (*fn)(void *arg), void *arg)
+{
+        struct liblustre_wait_callback *llwc;
+        
+        OBD_ALLOC(llwc, sizeof(*llwc));
+        LASSERT (llwc != NULL);
+        
+        llwc->llwc_fn = fn;
+        llwc->llwc_arg = arg;
+        list_add_tail(&llwc->llwc_list, &liblustre_wait_callbacks);
+        
+        return (llwc);
+}
+
+void
+liblustre_deregister_wait_callback (void *opaque)
+{
+        struct liblustre_wait_callback *llwc = opaque;
+        
+        list_del(&llwc->llwc_list);
+        OBD_FREE(llwc, sizeof(*llwc));
+}
+
 int
-liblustre_check_events (int block)
+liblustre_check_events (int timeout)
 {
         ptl_event_t ev;
         int         rc;
         ENTRY;
 
-        if (block) {
-                /* XXX to accelerate recovery tests XXX */
-                if (block > 10)
-                        block = 10;
-                rc = PtlEQWait_timeout(ptlrpc_interfaces[0].pni_eq_h, &ev, block);
+        if (timeout) {
+                rc = PtlEQWait_timeout(ptlrpc_interfaces[0].pni_eq_h, &ev, timeout);
         } else {
                 rc = PtlEQGet (ptlrpc_interfaces[0].pni_eq_h, &ev);
         }
@@ -474,36 +455,58 @@ liblustre_check_events (int block)
         
         LASSERT (rc == PTL_EQ_DROPPED || rc == PTL_OK);
         
-#if PORTALS_DOES_NOT_SUPPORT_CALLBACKS
-        if (rc == PTL_EQ_DROPPED)
+#ifndef __KERNEL__
+        /* liblustre: no asynchronous callback so we can't afford to miss any
+         * events... */
+        if (rc == PTL_EQ_DROPPED) {
                 CERROR ("Dropped an event!!!\n");
+                abort();
+        }
         
         ptlrpc_master_callback (&ev);
 #endif
         RETURN(1);
 }
 
-int liblustre_wait_event(struct l_wait_info *lwi) 
+int
+liblustre_wait_event (int timeout)
 {
-        ENTRY;
-
-        /* non-blocking checks (actually we might block in a service for
-         * bulk but we won't block in a blocked service)
-         */
-        if (liblustre_check_events(0) ||
-            liblustre_check_services()) {
-                /* the condition the caller is waiting for may now hold */
-                RETURN(0);
+        struct list_head               *tmp;
+        struct liblustre_wait_callback *llwc;
+        int                             found_something = 0;
+
+        /* First check for any new events */
+        if (liblustre_check_events(0))
+                found_something = 1;
+
+        /* Now give all registered callbacks a bite at the cherry */
+        list_for_each(tmp, &liblustre_wait_callbacks) {
+                llwc = list_entry(tmp, struct liblustre_wait_callback, 
+                                  llwc_list);
+                
+                if (llwc->llwc_fn(llwc->llwc_arg))
+                        found_something = 1;
         }
-        
-        /* block for an event */
-        liblustre_check_events(lwi->lwi_timeout);
 
-        /* check it's not for some service */
-        liblustre_check_services ();
+        /* return to caller if something happened */
+        if (found_something)
+                return 1;
+        
+        /* block for an event, returning immediately on timeout */
+        if (!liblustre_check_events(timeout))
+                return 0;
+
+        /* an event occurred; let all registered callbacks progress... */
+        list_for_each(tmp, &liblustre_wait_callbacks) {
+                llwc = list_entry(tmp, struct liblustre_wait_callback, 
+                                  llwc_list);
+                
+                if (llwc->llwc_fn(llwc->llwc_arg))
+                        found_something = 1;
+        }
 
-        /* XXX check this */
-        RETURN(0);
+        /* ...and tell caller something happened */
+        return 1;
 }
 #endif
 
@@ -541,11 +544,18 @@ int ptlrpc_init_portals(void)
                        "loaded?\n");
                 return -EIO;
         }
+#ifndef __KERNEL__
+        liblustre_services_callback = 
+                liblustre_register_wait_callback(&liblustre_check_services, NULL);
+#endif
         return 0;
 }
 
 void ptlrpc_exit_portals(void)
 {
+#ifndef __KERNEL__
+        liblustre_deregister_wait_callback(liblustre_services_callback);
+#endif
         while (ptlrpc_ninterfaces > 0)
                 ptlrpc_ni_fini (&ptlrpc_interfaces[--ptlrpc_ninterfaces]);
 }
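The events.c rewrite above collapses the seven per-type event queues into a single queue per interface; every MD's user_ptr now carries a struct ptlrpc_cb_id, and ptlrpc_master_callback() recovers it and forwards to the right handler. A minimal, self-contained sketch of that dispatch pattern is below; it is plain C written for this note, using toy stand-ins rather than the real Portals types.

        #include <stdio.h>

        /* toy stand-ins for ptl_event_t and struct ptlrpc_cb_id */
        struct event { void *user_ptr; int status; };

        struct cb_id {
                void (*fn)(struct event *);     /* cbid_fn  */
                void  *arg;                     /* cbid_arg */
        };

        static void reply_in(struct event *ev)
        {
                struct cb_id *cbid = ev->user_ptr;

                printf("reply_in: arg=%p status=%d\n", cbid->arg, ev->status);
        }

        /* the one callback registered with the event queue: recover the
         * cb_id from user_ptr and forward, as ptlrpc_master_callback() does */
        static int master_callback(struct event *ev)
        {
                struct cb_id *cbid = ev->user_ptr;

                cbid->fn(ev);
                return 0;
        }

        int main(void)
        {
                int          req  = 42;
                struct cb_id cbid = { reply_in, &req };
                struct event ev   = { &cbid, 0 };

                return master_callback(&ev);
        }

In the patch the same mechanism carries the request_out/reply_in/client_bulk callbacks on the client and the request_in/reply_out/server_bulk callbacks on the server; liblustre, which has no asynchronous callbacks, drains the same queue synchronously through liblustre_check_events().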
index 2fd25ec..5bc9e3f 100644 (file)
@@ -215,6 +215,10 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid)
         if (!request)
                 GOTO(out, rc = -ENOMEM);
 
+#ifndef __KERNEL__
+        lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_LIBCLIENT);
+#endif
+
         request->rq_send_state = LUSTRE_IMP_CONNECTING;
         request->rq_replen = lustre_msg_size(0, NULL);
         request->rq_interpret_reply = ptlrpc_connect_interpret;
@@ -229,6 +233,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid)
 
         if (aa->pcaa_initial_connect)
                 imp->imp_replayable = 1;
+
         ptlrpcd_add_req(request);
         rc = 0;
 out:
@@ -349,8 +354,10 @@ finish:
  out:
         if (rc != 0) {
                 IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
-                if (aa->pcaa_initial_connect && !imp->imp_initial_recov)
+                if (aa->pcaa_initial_connect && !imp->imp_initial_recov) {
+                        ptlrpc_set_import_active(imp, 0);
                         GOTO(norecov, rc);
+                }
                 CDEBUG(D_ERROR, 
                        "recovery of %s on %s failed (%d); restarting\n",
                        imp->imp_target_uuid.uuid,
index 7fc27fc..cdd70e2 100644 (file)
@@ -137,8 +137,7 @@ EXPORT_SYMBOL(llog_initiator_connect);
 #else /* !__KERNEL__ */
 
 int llog_origin_connect(struct llog_ctxt *ctxt, int count,
-                        struct llog_logid *logid,
-                        struct llog_ctxt_gen *gen)
+                        struct llog_logid *logid, struct llog_gen *gen)
 {
         return 0;
 }
index 4d9e68c..a15f67c 100644 (file)
 #define EXPORT_SYMTAB
 #endif
 
+#ifndef __KERNEL__
+#include <liblustre.h>
+#else
 #include <linux/fs.h>
+#endif
+
 #include <linux/obd_class.h>
 #include <linux/lustre_log.h>
 #include <linux/lustre_net.h>
 #include <portals/list.h>
 #include <linux/lustre_fsfilt.h>
 
+#ifdef __KERNEL__
+
 int llog_origin_handle_create(struct ptlrpc_request *req)
 {
         struct obd_export *exp = req->rq_export;
@@ -525,3 +532,31 @@ out_free:
         OBD_FREE(buf, buf_len);
         return rc;
 }
+
+#else /* !__KERNEL__ */
+int llog_origin_handle_create(struct ptlrpc_request *req)
+{
+        LBUG();
+        return 0;
+}
+int llog_origin_handle_next_block(struct ptlrpc_request *req)
+{
+        LBUG();
+        return 0;
+}
+int llog_origin_handle_read_header(struct ptlrpc_request *req)
+{
+        LBUG();
+        return 0;
+}
+int llog_origin_handle_close(struct ptlrpc_request *req)
+{
+        LBUG();
+        return 0;
+}
+int llog_origin_handle_cancel(struct ptlrpc_request *req)
+{
+        LBUG();
+        return 0;
+}
+#endif
index 1559403..f783ebf 100644 (file)
@@ -107,8 +107,8 @@ void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir,
         struct proc_dir_entry *svc_procroot;
         struct lprocfs_stats *svc_stats;
         int i, rc;
-        unsigned int svc_counter_config = LPROCFS_CNTR_EXTERNALLOCK |
-                LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV;
+        unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX | 
+                                          LPROCFS_CNTR_STDDEV;
 
         LASSERT(*procroot_ret == NULL);
         LASSERT(*stats_ret == NULL);
@@ -123,19 +123,16 @@ void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir,
                         lprocfs_free_stats(svc_stats);
                         return;
                 }
-        } else 
+        } else {
                 svc_procroot = root;
+        }
 
         lprocfs_counter_init(svc_stats, PTLRPC_REQWAIT_CNTR,
                              svc_counter_config, "req_waittime", "usec");
-        /* Wait for b_eq branch
-        lprocfs_counter_init(svc_stats, PTLRPC_SVCEQDEPTH_CNTR,
-                             svc_counter_config, "svc_eqdepth", "reqs");
-         */
-        /* no stddev on idletime */
-        lprocfs_counter_init(svc_stats, PTLRPC_SVCIDLETIME_CNTR,
-                             (LPROCFS_CNTR_EXTERNALLOCK|LPROCFS_CNTR_AVGMINMAX),
-                             "svc_idletime", "usec");
+        lprocfs_counter_init(svc_stats, PTLRPC_REQQDEPTH_CNTR,
+                             svc_counter_config, "req_qdepth", "reqs");
+        lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR,
+                             svc_counter_config, "req_active", "reqs");
         for (i = 0; i < LUSTRE_MAX_OPCODES; i++) {
                 __u32 opcode = ll_rpc_opcode_table[i].opcode;
                 lprocfs_counter_init(svc_stats, PTLRPC_LAST_CNTR + i,
@@ -159,14 +156,14 @@ void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry,
                                      struct ptlrpc_service *svc)
 {
         ptlrpc_lprocfs_register(entry, svc->srv_name,
-                                "stats", &svc->srv_procroot, 
+                                "stats", &svc->srv_procroot,
                                 &svc->srv_stats);
 }
 
 void ptlrpc_lprocfs_register_obd(struct obd_device *obddev)
 {
-        ptlrpc_lprocfs_register(obddev->obd_proc_entry, NULL, "stats", 
-                                &obddev->obd_svc_procroot, 
+        ptlrpc_lprocfs_register(obddev->obd_proc_entry, NULL, "stats",
+                                &obddev->obd_svc_procroot,
                                 &obddev->obd_svc_stats);
 }
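The lprocfs hunks above drop the external-lock idletime counter and register per-service req_qdepth and req_active counters alongside req_waittime. A self-contained sketch of how such avg/min/max counters could be fed by a service loop; the counter struct and counter_add() are simplified stand-ins, not the real lprocfs_stats API:

#include <stdio.h>
#include <limits.h>

/* Simplified stand-in for an LPROCFS_CNTR_AVGMINMAX-style counter. */
struct counter {
        const char   *name, *units;
        unsigned long count, sum, min, max;
};

static void counter_add(struct counter *c, unsigned long val)
{
        c->count++;
        c->sum += val;
        if (val < c->min) c->min = val;
        if (val > c->max) c->max = val;
}

int main(void)
{
        /* The three service counters registered above (hypothetical feed). */
        struct counter waittime = { "req_waittime", "usec", 0, 0, ULONG_MAX, 0 };
        struct counter qdepth   = { "req_qdepth",   "reqs", 0, 0, ULONG_MAX, 0 };
        struct counter active   = { "req_active",   "reqs", 0, 0, ULONG_MAX, 0 };

        /* Each incoming request would be sampled roughly like this. */
        counter_add(&waittime, 1200);   /* time spent queued, in usec */
        counter_add(&qdepth, 3);        /* requests waiting when it was picked up */
        counter_add(&active, 2);        /* requests being handled at that moment */

        printf("%s: avg %lu %s\n", waittime.name,
               waittime.sum / waittime.count, waittime.units);
        printf("%s: avg %lu %s\n", qdepth.name,
               qdepth.sum / qdepth.count, qdepth.units);
        printf("%s: avg %lu %s\n", active.name,
               active.sum / active.count, active.units);
        return 0;
}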
 
index ab6684a..c25db89 100644 (file)
 #include <linux/obd.h>
 #include "ptlrpc_internal.h"
 
-static int ptl_send_buf(struct ptlrpc_request *request,
-                        struct ptlrpc_connection *conn, int portal)
+static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len, 
+                         ptl_ack_req_t ack, struct ptlrpc_cb_id *cbid,
+                         struct ptlrpc_connection *conn, int portal, __u64 xid)
 {
-        int rc;
-        int rc2;
         ptl_process_id_t remote_id;
-        ptl_handle_md_t md_h;
-        ptl_ack_req_t ack_req;
+        int              rc;
+        int              rc2;
+        ptl_md_t         md;
         char str[PTL_NALFMT_SIZE];
+        ENTRY;
 
         LASSERT (portal != 0);
         LASSERT (conn != NULL);
@@ -50,156 +51,82 @@ static int ptl_send_buf(struct ptlrpc_request *request,
                                 conn->c_peer.peer_nid, str),
                 conn->c_peer.peer_ni->pni_name);
 
-        request->rq_req_md.user_ptr = request;
-
-        switch (request->rq_type) {
-        case PTL_RPC_MSG_REQUEST:
-                request->rq_reqmsg->type = request->rq_type;
-                request->rq_req_md.start = request->rq_reqmsg;
-                request->rq_req_md.length = request->rq_reqlen;
-                request->rq_req_md.eventq =
-                        conn->c_peer.peer_ni->pni_request_out_eq_h;
-                LASSERT (!request->rq_want_ack);
-                break;
-        case PTL_RPC_MSG_ERR:
-        case PTL_RPC_MSG_REPLY:
-                request->rq_repmsg->type = request->rq_type;
-                request->rq_req_md.start = request->rq_repmsg;
-                request->rq_req_md.length = request->rq_replen;
-                request->rq_req_md.eventq =
-                        conn->c_peer.peer_ni->pni_reply_out_eq_h;
-                break;
-        default:
-                LBUG();
-                return -1; /* notreached */
-        }
-        if (request->rq_want_ack) {
-                request->rq_req_md.threshold = 2; /* SENT and ACK */
-                ack_req = PTL_ACK_REQ;
-        } else {
-                request->rq_req_md.threshold = 1;
-                ack_req = PTL_NOACK_REQ;
-        }
-        request->rq_req_md.options = PTL_MD_OP_PUT;
-        request->rq_req_md.user_ptr = request;
+        remote_id.nid = conn->c_peer.peer_nid;
+        remote_id.pid = 0;
 
-        if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_ACK | OBD_FAIL_ONCE)) {
-                request->rq_req_md.options |= PTL_MD_ACK_DISABLE;
+        md.start     = base;
+        md.length    = len;
+        md.threshold = (ack == PTL_ACK_REQ) ? 2 : 1;
+        md.options   = 0;
+        md.user_ptr  = cbid;
+        md.eventq    = conn->c_peer.peer_ni->pni_eq_h;
+
+        if (ack == PTL_ACK_REQ &&
+            OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_ACK | OBD_FAIL_ONCE)) {
+                /* don't ask for the ack to simulate failing client */
+                ack = PTL_NOACK_REQ;
                 obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED;
         }
 
-        /* NB if the send fails, we back out of the send and return
-         * failure; it's down to the caller to handle missing callbacks */
-
-        rc = PtlMDBind(conn->c_peer.peer_ni->pni_ni_h, request->rq_req_md,
-                       &md_h);
+        rc = PtlMDBind (conn->c_peer.peer_ni->pni_ni_h, md, mdh);
         if (rc != PTL_OK) {
-                CERROR("PtlMDBind failed: %d\n", rc);
+                CERROR ("PtlMDBind failed: %d\n", rc);
                 LASSERT (rc == PTL_NOSPACE);
                 RETURN (-ENOMEM);
         }
-        if (request->rq_type != PTL_RPC_MSG_REQUEST)
-                memcpy(&request->rq_reply_md_h, &md_h, sizeof(md_h));
-
-        remote_id.nid = conn->c_peer.peer_nid;
-        remote_id.pid = 0;
 
         CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64"\n",
-               request->rq_req_md.length, portal, request->rq_xid);
+               len, portal, xid);
 
-        rc = PtlPut(md_h, ack_req, remote_id, portal, 0, request->rq_xid, 0, 0);
+        rc = PtlPut (*mdh, ack, remote_id, portal, 0, xid, 0, 0);
         if (rc != PTL_OK) {
+                /* We're going to get an UNLINK event when I unlink below,
+                 * which will complete just like any other failed send, so
+                 * I fall through and return success here! */
                 CERROR("PtlPut("LPU64", %d, "LPD64") failed: %d\n",
-                       remote_id.nid, portal, request->rq_xid, rc);
-                rc2 = PtlMDUnlink(md_h);
+                       remote_id.nid, portal, xid, rc);
+                rc2 = PtlMDUnlink(*mdh);
                 LASSERT (rc2 == PTL_OK);
-                RETURN ((rc == PTL_NOSPACE) ? -ENOMEM : -ECOMM);
         }
 
-        return 0;
+        RETURN (0);
 }
 
-static inline ptl_kiov_t *
-ptlrpc_get_bulk_iov (struct ptlrpc_bulk_desc *desc)
+int ptlrpc_start_bulk_transfer (struct ptlrpc_bulk_desc *desc)
 {
-        ptl_kiov_t *iov;
-
-        if (desc->bd_page_count <= sizeof (desc->bd_iov)/sizeof (*iov))
-                return (desc->bd_iov);
-
-        OBD_ALLOC (iov, desc->bd_page_count * sizeof (*iov));
-        if (iov == NULL)
-                LBUG();
-
-        return (iov);
-}
-
-static inline void
-ptlrpc_put_bulk_iov (struct ptlrpc_bulk_desc *desc, ptl_kiov_t *iov)
-{
-        if (desc->bd_page_count <= sizeof (desc->bd_iov)/sizeof (*iov))
-                return;
-
-        OBD_FREE (iov, desc->bd_page_count * sizeof (*iov));
-}
-
-int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *desc)
-{
-        int rc;
-        int rc2;
+        int                 rc;
+        int                 rc2;
         struct ptlrpc_peer *peer;
-        struct list_head *tmp, *next;
-        ptl_process_id_t remote_id;
-        ptl_kiov_t *iov;
-        __u64 xid;
+        ptl_process_id_t    remote_id;
+        ptl_md_t            md;
+        __u64               xid;
         ENTRY;
 
         /* NB no locking required until desc is on the network */
         LASSERT (!desc->bd_network_rw);
-        LASSERT (desc->bd_type == BULK_PUT_SOURCE);
-        desc->bd_complete = 0;
-
-        iov = ptlrpc_get_bulk_iov (desc);
-        if (iov == NULL)
-                RETURN (-ENOMEM);
-
+        LASSERT (desc->bd_type == BULK_PUT_SOURCE ||
+                 desc->bd_type == BULK_GET_SINK);
+        desc->bd_success = 0;
         peer = &desc->bd_export->exp_connection->c_peer;
 
-        desc->bd_md.start = iov;
-        desc->bd_md.niov = 0;
-        desc->bd_md.length = 0;
-        desc->bd_md.eventq = peer->peer_ni->pni_bulk_put_source_eq_h;
-        desc->bd_md.threshold = 2; /* SENT and ACK */
-        desc->bd_md.options = PTL_MD_OP_PUT | PTL_MD_KIOV;
-        desc->bd_md.user_ptr = desc;
-
-        desc->bd_callback_count = 2;
-
-        list_for_each_safe(tmp, next, &desc->bd_page_list) {
-                struct ptlrpc_bulk_page *bulk;
-                bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link);
-
-                LASSERT(desc->bd_md.niov < desc->bd_page_count);
-
-                iov[desc->bd_md.niov].kiov_page = bulk->bp_page;
-                iov[desc->bd_md.niov].kiov_offset = bulk->bp_pageoffset;
-                iov[desc->bd_md.niov].kiov_len = bulk->bp_buflen;
-
-                LASSERT (iov[desc->bd_md.niov].kiov_offset +
-                         iov[desc->bd_md.niov].kiov_len <= PAGE_SIZE);
-                desc->bd_md.niov++;
-                desc->bd_md.length += bulk->bp_buflen;
-        }
+        md.start = &desc->bd_iov[0];
+        md.niov = desc->bd_page_count;
+        md.length = desc->bd_nob;
+        md.eventq = peer->peer_ni->pni_eq_h;
+        md.threshold = 2; /* SENT and ACK/REPLY */
+#ifdef __KERNEL__
+        md.options = PTL_MD_KIOV;
+#else
+        md.options = PTL_MD_IOV;
+#endif
+        md.user_ptr = &desc->bd_cbid;
+        LASSERT (desc->bd_cbid.cbid_fn == server_bulk_callback);
+        LASSERT (desc->bd_cbid.cbid_arg == desc);
 
         /* NB total length may be 0 for a read past EOF, so we send a 0
          * length bulk, since the client expects a bulk event. */
-        LASSERT(desc->bd_md.niov == desc->bd_page_count);
-
-        rc = PtlMDBind(peer->peer_ni->pni_ni_h, desc->bd_md,
-                       &desc->bd_md_h);
-
-        ptlrpc_put_bulk_iov (desc, iov); /*move down to reduce latency to send*/
 
+        rc = PtlMDBind(peer->peer_ni->pni_ni_h, md, &desc->bd_md_h);
         if (rc != PTL_OK) {
                 CERROR("PtlMDBind failed: %d\n", rc);
                 LASSERT (rc == PTL_NOSPACE);
@@ -211,109 +138,29 @@ int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *desc)
         remote_id.nid = peer->peer_nid;
         remote_id.pid = 0;
 
-        CDEBUG(D_NET, "Sending %u pages %u bytes to portal %d on %s "
-               "nid "LPX64" pid %d xid "LPX64"\n",
-               desc->bd_md.niov, desc->bd_md.length,
-               desc->bd_portal, peer->peer_ni->pni_name,
+        CDEBUG(D_NET, "Transferring %u pages %u bytes via portal %d on %s "
+               "nid "LPX64" pid %d xid "LPX64"\n", 
+               md.niov, md.length, desc->bd_portal, peer->peer_ni->pni_name,
                remote_id.nid, remote_id.pid, xid);
 
+        /* Network is about to get at the memory */
         desc->bd_network_rw = 1;
-        rc = PtlPut(desc->bd_md_h, PTL_ACK_REQ, remote_id,
-                    desc->bd_portal, 0, xid, 0, 0);
-        if (rc != PTL_OK) {
-                desc->bd_network_rw = 0;
-                CERROR("PtlPut("LPU64", %d, "LPX64") failed: %d\n",
-                       remote_id.nid, desc->bd_portal, xid, rc);
-                rc2 = PtlMDUnlink(desc->bd_md_h);
-                LASSERT (rc2 == PTL_OK);
-                RETURN((rc == PTL_NOSPACE) ? -ENOMEM : -ECOMM);
-        }
-
-        RETURN(0);
-}
-
-int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *desc)
-{
-        int rc;
-        int rc2;
-        struct ptlrpc_peer *peer;
-        struct list_head *tmp, *next;
-        ptl_process_id_t remote_id;
-        ptl_kiov_t *iov;
-        __u64 xid;
-        ENTRY;
-
-        /* NB no locking required until desc is on the network */
-        LASSERT (!desc->bd_network_rw);
-        LASSERT (desc->bd_type == BULK_GET_SINK);
-        desc->bd_complete = 0;
-
-        iov = ptlrpc_get_bulk_iov (desc);
-        if (iov == NULL)
-                RETURN(-ENOMEM);
-
-        peer = &desc->bd_export->exp_connection->c_peer;
-
-        desc->bd_md.start = iov;
-        desc->bd_md.niov = 0;
-        desc->bd_md.length = 0;
-        desc->bd_md.eventq = peer->peer_ni->pni_bulk_get_sink_eq_h;
-        desc->bd_md.threshold = 2; /* SENT and REPLY */
-        desc->bd_md.options = PTL_MD_OP_GET | PTL_MD_KIOV;
-        desc->bd_md.user_ptr = desc;
-
-        desc->bd_callback_count = 2;
-
-        list_for_each_safe(tmp, next, &desc->bd_page_list) {
-                struct ptlrpc_bulk_page *bulk;
-                bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link);
 
-                LASSERT(desc->bd_md.niov < desc->bd_page_count);
-
-                iov[desc->bd_md.niov].kiov_page = bulk->bp_page;
-                iov[desc->bd_md.niov].kiov_len = bulk->bp_buflen;
-                iov[desc->bd_md.niov].kiov_offset = bulk->bp_pageoffset;
-
-                LASSERT (iov[desc->bd_md.niov].kiov_offset +
-                         iov[desc->bd_md.niov].kiov_len <= PAGE_SIZE);
-                desc->bd_md.niov++;
-                desc->bd_md.length += bulk->bp_buflen;
-        }
-
-        LASSERT(desc->bd_md.niov == desc->bd_page_count);
-        LASSERT(desc->bd_md.niov != 0);
-
-        rc = PtlMDBind(peer->peer_ni->pni_ni_h, desc->bd_md, &desc->bd_md_h);
-
-        ptlrpc_put_bulk_iov(desc, iov); /*move down to reduce latency to send*/
-
-        if (rc != PTL_OK) {
-                CERROR("PtlMDBind failed: %d\n", rc);
-                LASSERT (rc == PTL_NOSPACE);
-                RETURN(-ENOMEM);
-        }
-
-        /* Client's bulk and reply matchbits are the same */
-        xid = desc->bd_req->rq_xid;
-        remote_id.nid = desc->bd_export->exp_connection->c_peer.peer_nid;
-        remote_id.pid = 0;
-
-        CDEBUG(D_NET, "Fetching %u pages %u bytes from portal %d on %s "
-               "nid "LPX64" pid %d xid "LPX64"\n",
-               desc->bd_md.niov, desc->bd_md.length, desc->bd_portal,
-               peer->peer_ni->pni_name, remote_id.nid, remote_id.pid,
-               xid);
-
-        desc->bd_network_rw = 1;
-        rc = PtlGet(desc->bd_md_h, remote_id, desc->bd_portal, 0,
-                    xid, 0);
+        if (desc->bd_type == BULK_PUT_SOURCE)
+                rc = PtlPut (desc->bd_md_h, PTL_ACK_REQ, remote_id,
+                             desc->bd_portal, 0, xid, 0, 0);
+        else
+                rc = PtlGet (desc->bd_md_h, remote_id,
+                             desc->bd_portal, 0, xid, 0);
+        
         if (rc != PTL_OK) {
-                desc->bd_network_rw = 0;
-                CERROR("PtlGet("LPU64", %d, "LPX64") failed: %d\n",
+                /* Can't send, so we unlink the MD bound above.  The UNLINK
+                 * event this creates will signal completion with failure,
+                 * so we return SUCCESS here! */
+                CERROR("Transfer("LPU64", %d, "LPX64") failed: %d\n",
                        remote_id.nid, desc->bd_portal, xid, rc);
                 rc2 = PtlMDUnlink(desc->bd_md_h);
                 LASSERT (rc2 == PTL_OK);
-                RETURN((rc == PTL_NOSPACE) ? -ENOMEM : -ECOMM);
         }
 
         RETURN(0);
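ptlrpc_bulk_put() and ptlrpc_bulk_get() are merged above into ptlrpc_start_bulk_transfer(), which binds one MD over the preset iovec and then issues either PtlPut() or PtlGet() depending on bd_type, against a single per-interface event queue. A minimal standalone sketch of that dispatch (hypothetical types, no real Portals calls):

#include <stdio.h>

enum bulk_type { BULK_PUT_SOURCE, BULK_GET_SINK };

/* Simplified stand-in for struct ptlrpc_bulk_desc. */
struct bulk_desc {
        enum bulk_type type;
        int            network_rw;      /* set once the network may touch the pages */
        unsigned int   nob;             /* bytes described by the iovec */
};

static int do_put(struct bulk_desc *d) { printf("PUT %u bytes\n", d->nob); return 0; }
static int do_get(struct bulk_desc *d) { printf("GET %u bytes\n", d->nob); return 0; }

static int start_bulk_transfer(struct bulk_desc *desc)
{
        int rc;

        desc->network_rw = 1;           /* network is about to get at the memory */
        if (desc->type == BULK_PUT_SOURCE)
                rc = do_put(desc);
        else
                rc = do_get(desc);

        if (rc != 0) {
                /* the real code unlinks the MD here; the resulting UNLINK
                 * event completes the transfer like any other failed send,
                 * so success is still returned */
        }
        return 0;
}

int main(void)
{
        struct bulk_desc put = { BULK_PUT_SOURCE, 0, 4096 };
        struct bulk_desc get = { BULK_GET_SINK,   0, 8192 };

        return start_bulk_transfer(&put) | start_bulk_transfer(&get);
}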
@@ -323,166 +170,116 @@ void ptlrpc_abort_bulk (struct ptlrpc_bulk_desc *desc)
 {
         /* Server side bulk abort. Idempotent. Not thread-safe (i.e. only
          * serialises with completion callback) */
-        unsigned long      flags;
         struct l_wait_info lwi;
-        int                callback_count;
         int                rc;
 
         LASSERT (!in_interrupt ());             /* might sleep */
 
-        /* NB. server-side bulk gets 2 events, so we have to keep trying to
-         * unlink the MD until all callbacks have happened, or
-         * PtlMDUnlink() returns OK or INVALID */
- again:
-        spin_lock_irqsave (&desc->bd_lock, flags);
-        if (!desc->bd_network_rw) {
-                /* completed or never even registered. NB holding bd_lock
-                 * guarantees callback has completed if it ran. */
-                spin_unlock_irqrestore (&desc->bd_lock, flags);
-                return;
-        }
-
-        /* sample callback count while we have the lock */
-        callback_count = desc->bd_callback_count;
-        spin_unlock_irqrestore (&desc->bd_lock, flags);
+        if (!ptlrpc_bulk_active(desc))          /* completed or */
+                return;                         /* never started */
+        
+        /* The unlink ensures the callback happens ASAP and is the last
+         * one.  If it fails, it must be because completion just
+         * happened. */
 
         rc = PtlMDUnlink (desc->bd_md_h);
-        switch (rc) {
-        default:
-                CERROR("PtlMDUnlink returned %d\n", rc);
-                LBUG ();
-        case PTL_OK:                    /* Won the race with the network */
-                LASSERT (!desc->bd_complete); /* Not all callbacks ran */
-                desc->bd_network_rw = 0;
-                return;
-
-        case PTL_MD_INUSE:              /* MD is being accessed right now */
-                for (;;) {
-                        /* Network access will complete in finite time but the
-                         * timeout lets us CERROR for visibility */
-                        lwi = LWI_TIMEOUT (10 * HZ, NULL, NULL);
-                        rc = l_wait_event(desc->bd_waitq,
-                                          desc->bd_callback_count !=
-                                          callback_count, &lwi);
-                        if (rc == -ETIMEDOUT) {
-                                CERROR("Unexpectedly long timeout: desc %p\n",
-                                       desc);
-                                continue;
-                        }
-                        LASSERT (rc == 0);
-                        break;
-                }
-                /* go back and try again... */
-                goto again;
-
-        case PTL_INV_MD:            /* Lost the race with completion */
-                LASSERT (desc->bd_complete);    /* Callbacks all ran */
-                LASSERT (!desc->bd_network_rw);
+        if (rc == PTL_INV_MD) {
+                LASSERT(!ptlrpc_bulk_active(desc));
                 return;
         }
+        
+        LASSERT (rc == PTL_OK);
+
+        for (;;) {
+                /* Network access will complete in finite time but the HUGE
+                 * timeout lets us CWARN for visibility of sluggish NALs */
+                lwi = LWI_TIMEOUT (300 * HZ, NULL, NULL);
+                rc = l_wait_event(desc->bd_waitq, 
+                                  !ptlrpc_bulk_active(desc), &lwi);
+                if (rc == 0)
+                        return;
+
+                LASSERT(rc == -ETIMEDOUT);
+                CWARN("Unexpectedly long timeout: desc %p\n", desc);
+        }
 }
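ptlrpc_abort_bulk() above (and ptlrpc_unregister_bulk() below) now share one shape: return if the descriptor is already idle, otherwise unlink the MD to force the final callback and wait in a loop with a long timeout, warning on each expiry. A standalone sketch of that pattern; active() and wait_event() stand in for ptlrpc_bulk_active() and l_wait_event():

#include <stdio.h>

static int bulk_active = 1;

static int active(void) { return bulk_active; }

static int wait_event(int timeout_secs)
{
        (void)timeout_secs;
        bulk_active = 0;        /* pretend the completion callback ran */
        return 0;               /* 0 == woken, nonzero == timed out */
}

static void abort_bulk(void)
{
        if (!active())          /* completed or never started */
                return;

        /* the MD unlink would go here; it forces the final callback ASAP */
        for (;;) {
                if (wait_event(300) == 0 && !active())
                        return;
                fprintf(stderr, "unexpectedly long timeout\n");
        }
}

int main(void) { abort_bulk(); return 0; }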
 
 int ptlrpc_register_bulk (struct ptlrpc_request *req)
 {
         struct ptlrpc_bulk_desc *desc = req->rq_bulk;
         struct ptlrpc_peer *peer;
-        struct list_head *tmp, *next;
         int rc;
         int rc2;
-        ptl_kiov_t *iov;
         ptl_process_id_t source_id;
+        ptl_handle_me_t  me_h;
+        ptl_md_t         md;
         ENTRY;
 
         /* NB no locking required until desc is on the network */
+        LASSERT (desc->bd_nob > 0);
         LASSERT (!desc->bd_network_rw);
         LASSERT (desc->bd_page_count <= PTL_MD_MAX_PAGES);
         LASSERT (desc->bd_req != NULL);
         LASSERT (desc->bd_type == BULK_PUT_SINK ||
                  desc->bd_type == BULK_GET_SOURCE);
 
-        desc->bd_complete = 0;
-
-        iov = ptlrpc_get_bulk_iov (desc);
-        if (iov == NULL)
-                return (-ENOMEM);
+        desc->bd_success = 0;
 
         peer = &desc->bd_import->imp_connection->c_peer;
 
-        desc->bd_md.start = iov;
-        desc->bd_md.niov = 0;
-        desc->bd_md.length = 0;
-        desc->bd_md.threshold = 1;
-        desc->bd_md.user_ptr = desc;
-
-        if (desc->bd_type == BULK_GET_SOURCE) {
-                desc->bd_md.options = PTL_MD_OP_GET | PTL_MD_KIOV;
-                desc->bd_md.eventq = peer->peer_ni->pni_bulk_get_source_eq_h;
-        } else {
-                desc->bd_md.options = PTL_MD_OP_PUT | PTL_MD_KIOV;
-                desc->bd_md.eventq = peer->peer_ni->pni_bulk_put_sink_eq_h;
-        }
-
-        list_for_each_safe(tmp, next, &desc->bd_page_list) {
-                struct ptlrpc_bulk_page *bulk;
-                bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link);
-
-                LASSERT(desc->bd_md.niov < desc->bd_page_count);
-
-                iov[desc->bd_md.niov].kiov_page = bulk->bp_page;
-                iov[desc->bd_md.niov].kiov_len = bulk->bp_buflen;
-                iov[desc->bd_md.niov].kiov_offset = bulk->bp_pageoffset;
-
-                LASSERT (bulk->bp_pageoffset + bulk->bp_buflen <= PAGE_SIZE);
-                desc->bd_md.niov++;
-                desc->bd_md.length += bulk->bp_buflen;
-        }
-
-        LASSERT(desc->bd_md.niov == desc->bd_page_count);
-        LASSERT(desc->bd_md.niov != 0);
+        md.start = &desc->bd_iov[0];
+        md.niov = desc->bd_page_count;
+        md.length = desc->bd_nob;
+        md.eventq = peer->peer_ni->pni_eq_h;
+        md.threshold = 1;                       /* PUT or GET */
+        md.options = (desc->bd_type == BULK_GET_SOURCE) ? 
+                     PTL_MD_OP_GET : PTL_MD_OP_PUT;
+#ifdef __KERNEL__
+        md.options |= PTL_MD_KIOV;
+#else
+        md.options |= PTL_MD_IOV;
+#endif
+        md.user_ptr = &desc->bd_cbid;
+        LASSERT (desc->bd_cbid.cbid_fn == client_bulk_callback);
+        LASSERT (desc->bd_cbid.cbid_arg == desc);
 
         /* XXX Registering the same xid on retried bulk makes my head
          * explode trying to understand how the original request's bulk
          * might interfere with the retried request -eeb */
         LASSERT (!desc->bd_registered || req->rq_xid != desc->bd_last_xid);
         desc->bd_registered = 1;
-        desc->bd_last_xid = desc->bd_last_xid;
+        desc->bd_last_xid = req->rq_xid;
 
         source_id.nid = desc->bd_import->imp_connection->c_peer.peer_nid;
         source_id.pid = PTL_PID_ANY;
 
         rc = PtlMEAttach(peer->peer_ni->pni_ni_h,
                          desc->bd_portal, source_id, req->rq_xid, 0,
-                         PTL_UNLINK, PTL_INS_AFTER, &desc->bd_me_h);
-
+                         PTL_UNLINK, PTL_INS_AFTER, &me_h);
         if (rc != PTL_OK) {
                 CERROR("PtlMEAttach failed: %d\n", rc);
                 LASSERT (rc == PTL_NOSPACE);
-                GOTO(out, rc = -ENOMEM);
+                RETURN (-ENOMEM);
         }
 
         /* About to let the network at it... */
         desc->bd_network_rw = 1;
-        rc = PtlMDAttach(desc->bd_me_h, desc->bd_md, PTL_UNLINK,
-                         &desc->bd_md_h);
+        rc = PtlMDAttach(me_h, md, PTL_UNLINK, &desc->bd_md_h);
         if (rc != PTL_OK) {
                 CERROR("PtlMDAttach failed: %d\n", rc);
                 LASSERT (rc == PTL_NOSPACE);
                 desc->bd_network_rw = 0;
-                rc2 = PtlMEUnlink (desc->bd_me_h);
+                rc2 = PtlMEUnlink (me_h);
                 LASSERT (rc2 == PTL_OK);
-                GOTO(out, rc = -ENOMEM);
+                RETURN (-ENOMEM);
         }
-        rc = 0;
 
         CDEBUG(D_NET, "Setup bulk %s buffers: %u pages %u bytes, xid "LPX64", "
                "portal %u on %s\n",
                desc->bd_type == BULK_GET_SOURCE ? "get-source" : "put-sink",
-               desc->bd_md.niov, desc->bd_md.length,
+               md.niov, md.length,
                req->rq_xid, desc->bd_portal, peer->peer_ni->pni_name);
-
- out:
-        ptlrpc_put_bulk_iov (desc, iov);
-        RETURN(rc);
+        RETURN(0);
 }
 
 void ptlrpc_unregister_bulk (struct ptlrpc_request *req)
@@ -491,101 +288,104 @@ void ptlrpc_unregister_bulk (struct ptlrpc_request *req)
          * thread-safe (i.e. only interlocks with completion callback). */
         struct ptlrpc_bulk_desc *desc = req->rq_bulk;
         wait_queue_head_t       *wq;
-        unsigned long            flags;
         struct l_wait_info       lwi;
         int                      rc;
 
         LASSERT (!in_interrupt ());             /* might sleep */
 
-        spin_lock_irqsave (&desc->bd_lock, flags);
-        if (!desc->bd_network_rw) {     /* completed or never even registered */
-                spin_unlock_irqrestore (&desc->bd_lock, flags);
-                return;
-        }
-        spin_unlock_irqrestore (&desc->bd_lock, flags);
+        if (!ptlrpc_bulk_active(desc))          /* completed or */
+                return;                         /* never registered */
+        
+        LASSERT (desc->bd_req == req);          /* bd_req NULL until registered */
 
-        LASSERT (desc->bd_req == req);     /* NB bd_req NULL until registered */
+        /* the unlink ensures the callback happens ASAP and is the last
+         * one.  If it fails, it must be because completion just
+         * happened. */
 
-        /* NB...
-         * 1. If the MD unlink is successful, the ME gets unlinked too.
-         * 2. Since client-side bulk only gets a single event and a
-         * .. threshold of 1.  If the MD was inuse at the first link
-         * .. attempt, the callback is due any minute, and the MD/ME will
-         * .. unlink themselves.
-         */
         rc = PtlMDUnlink (desc->bd_md_h);
-        switch (rc) {
-        default:
-                CERROR("PtlMDUnlink returned %d\n", rc);
-                LBUG ();
-        case PTL_OK:                          /* Won the race with completion */
-                LASSERT (!desc->bd_complete);   /* Callback hasn't happened */
-                desc->bd_network_rw = 0;
-                return;
-        case PTL_MD_INUSE:                  /* MD is being accessed right now */
-                for (;;) {
-                        /* Network access will complete in finite time but the
-                         * timeout lets us CERROR for visibility */
-                        if (desc->bd_req->rq_set != NULL)
-                                wq = &req->rq_set->set_waitq;
-                        else
-                                wq = &req->rq_reply_waitq;
-                        lwi = LWI_TIMEOUT (10 * HZ, NULL, NULL);
-                        rc = l_wait_event(*wq, ptlrpc_bulk_complete(desc), &lwi);
-                        LASSERT (rc == 0 || rc == -ETIMEDOUT);
-                        if (rc == 0)
-                                break;
-                        CERROR ("Unexpectedly long timeout: desc %p\n", desc);
-                        LBUG();
-                }
-                /* Fall through */
-        case PTL_INV_MD:                     /* Lost the race with completion */
-                LASSERT (desc->bd_complete);/* Callback has run to completion */
-                LASSERT (!desc->bd_network_rw);
+        if (rc == PTL_INV_MD) {
+                LASSERT(!ptlrpc_bulk_active(desc));
                 return;
         }
+        
+        LASSERT (rc == PTL_OK);
+        
+        if (desc->bd_req->rq_set != NULL)
+                wq = &req->rq_set->set_waitq;
+        else
+                wq = &req->rq_reply_waitq;
+
+        for (;;) {
+                /* Network access will complete in finite time but the HUGE
+                 * timeout lets us CWARN for visibility of sluggish NALs */
+                lwi = LWI_TIMEOUT (300 * HZ, NULL, NULL);
+                rc = l_wait_event(*wq, !ptlrpc_bulk_active(desc), &lwi);
+                if (rc == 0)
+                        return;
+                
+                LASSERT (rc == -ETIMEDOUT);
+                CWARN("Unexpectedly long timeout: desc %p\n", desc);
+        }
 }
 
-int ptlrpc_reply(struct ptlrpc_request *req)
+int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult)
 {
-        struct ptlrpc_connection *conn;
-        unsigned long flags;
-        int rc;
+        struct ptlrpc_service     *svc = req->rq_rqbd->rqbd_srv_ni->sni_service;
+        struct ptlrpc_reply_state *rs = req->rq_reply_state;
+        struct ptlrpc_connection  *conn;
+        int                        rc;
 
         /* We must already have a reply buffer (only ptlrpc_error() may be
          * called without one).  We must also have a request buffer which
          * is either the actual (swabbed) incoming request, or a saved copy
          * if this is a req saved in target_queue_final_reply(). */
-        LASSERT (req->rq_repmsg != NULL);
         LASSERT (req->rq_reqmsg != NULL);
+        LASSERT (rs != NULL);
+        LASSERT (req->rq_repmsg != NULL);
+        LASSERT (may_be_difficult || !rs->rs_difficult);
+        LASSERT (req->rq_repmsg == &rs->rs_msg);
+        LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback);
+        LASSERT (rs->rs_cb_id.cbid_arg == rs);
 
-        /* FIXME: we need to increment the count of handled events */
+        LASSERT (req->rq_repmsg != NULL);
         if (req->rq_type != PTL_RPC_MSG_ERR)
                 req->rq_type = PTL_RPC_MSG_REPLY;
 
+        req->rq_repmsg->type   = req->rq_type;
         req->rq_repmsg->status = req->rq_status;
-        req->rq_repmsg->opc = req->rq_reqmsg->opc;
+        req->rq_repmsg->opc    = req->rq_reqmsg->opc;
 
         if (req->rq_export == NULL) 
                 conn = ptlrpc_get_connection(&req->rq_peer, NULL);
         else
                 conn = ptlrpc_connection_addref(req->rq_export->exp_connection);
 
-        init_waitqueue_head(&req->rq_reply_waitq);
-        rc = ptl_send_buf(req, conn, 
-                          req->rq_svc->srv_rep_portal);
-        if (rc != 0) {
-                /* Do what the callback handler would have done */
-                OBD_FREE (req->rq_repmsg, req->rq_replen);
+        atomic_inc (&svc->srv_outstanding_replies);
 
-                spin_lock_irqsave (&req->rq_lock, flags);
-                req->rq_want_ack = 0;
-                spin_unlock_irqrestore (&req->rq_lock, flags);
+        rc = ptl_send_buf (&rs->rs_md_h, req->rq_repmsg, req->rq_replen,
+                           rs->rs_difficult ? PTL_ACK_REQ : PTL_NOACK_REQ,
+                           &rs->rs_cb_id, conn,
+                           svc->srv_rep_portal, req->rq_xid);
+        if (rc != 0) {
+                atomic_dec (&svc->srv_outstanding_replies);
+
+                if (!rs->rs_difficult) {
+                        /* Callers other than target_send_reply() expect me
+                         * to clean up on a comms error */
+                        lustre_free_reply_state (rs);
+                        req->rq_reply_state = NULL;
+                        req->rq_repmsg = NULL;
+                }
         }
         ptlrpc_put_connection(conn);
         return rc;
 }
 
+int ptlrpc_reply (struct ptlrpc_request *req)
+{
+        return (ptlrpc_send_reply (req, 0));
+}
+
 int ptlrpc_error(struct ptlrpc_request *req)
 {
         int rc;
@@ -597,10 +397,9 @@ int ptlrpc_error(struct ptlrpc_request *req)
                         RETURN(rc);
         }
 
-
         req->rq_type = PTL_RPC_MSG_ERR;
 
-        rc = ptlrpc_reply(req);
+        rc = ptlrpc_send_reply (req, 0);
         RETURN(rc);
 }
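With the change above, ptlrpc_reply() and ptlrpc_error() become thin wrappers over ptlrpc_send_reply(); only callers prepared to track a "difficult" (ack-requiring) reply pass may_be_difficult = 1. A small sketch of that layering with simplified stand-ins:

#include <stdio.h>

struct request { int is_error; int difficult; };

static int send_reply(struct request *req, int may_be_difficult)
{
        if (req->difficult && !may_be_difficult) {
                fprintf(stderr, "caller cannot handle a difficult reply\n");
                return -1;
        }
        printf("sending %s reply (ack %s)\n",
               req->is_error ? "error" : "normal",
               req->difficult ? "requested" : "not requested");
        return 0;
}

static int reply(struct request *req)       { return send_reply(req, 0); }
static int reply_error(struct request *req) { req->is_error = 1; return send_reply(req, 0); }

int main(void)
{
        struct request ok = { 0, 0 }, err = { 0, 0 };

        return reply(&ok) | reply_error(&err);
}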
 
@@ -612,6 +411,7 @@ int ptl_send_rpc(struct ptlrpc_request *request)
         unsigned long flags;
         ptl_process_id_t source_id;
         ptl_handle_me_t  reply_me_h;
+        ptl_md_t         reply_md;
         ENTRY;
 
         LASSERT (request->rq_type == PTL_RPC_MSG_REQUEST);
@@ -629,6 +429,7 @@ int ptl_send_rpc(struct ptlrpc_request *request)
         }
 
         request->rq_reqmsg->handle = request->rq_import->imp_remote_handle;
+        request->rq_reqmsg->type = PTL_RPC_MSG_REQUEST;
         request->rq_reqmsg->conn_cnt = request->rq_import->imp_conn_cnt;
 
         source_id.nid = connection->c_peer.peer_nid;
@@ -639,7 +440,7 @@ int ptl_send_rpc(struct ptlrpc_request *request)
                 OBD_ALLOC(request->rq_repmsg, request->rq_replen);
         if (request->rq_repmsg == NULL) {
                 LBUG();
-                RETURN(-ENOMEM);
+                GOTO(cleanup_bulk, rc = -ENOMEM);
         }
 
         rc = PtlMEAttach(connection->c_peer.peer_ni->pni_ni_h,
@@ -650,24 +451,34 @@ int ptl_send_rpc(struct ptlrpc_request *request)
                 CERROR("PtlMEAttach failed: %d\n", rc);
                 LASSERT (rc == PTL_NOSPACE);
                 LBUG();
-                GOTO(cleanup, rc = -ENOMEM);
+                GOTO(cleanup_repmsg, rc = -ENOMEM);
         }
 
-        request->rq_reply_md.start = request->rq_repmsg;
-        request->rq_reply_md.length = request->rq_replen;
-        request->rq_reply_md.threshold = 1;
-        request->rq_reply_md.options = PTL_MD_OP_PUT;
-        request->rq_reply_md.user_ptr = request;
-        request->rq_reply_md.eventq = 
-                connection->c_peer.peer_ni->pni_reply_in_eq_h;
+        spin_lock_irqsave (&request->rq_lock, flags);
+        /* If the MD attach succeeds, there _will_ be a reply_in callback */
+        request->rq_receiving_reply = 1;
+        /* Clear any flags that may be present from previous sends. */
+        request->rq_replied = 0;
+        request->rq_err = 0;
+        request->rq_timedout = 0;
+        request->rq_resend = 0;
+        request->rq_restart = 0;
+        spin_unlock_irqrestore (&request->rq_lock, flags);
 
-        rc = PtlMDAttach(reply_me_h, request->rq_reply_md,
-                         PTL_UNLINK, &request->rq_reply_md_h);
+        reply_md.start     = request->rq_repmsg;
+        reply_md.length    = request->rq_replen;
+        reply_md.threshold = 1;
+        reply_md.options   = PTL_MD_OP_PUT;
+        reply_md.user_ptr  = &request->rq_reply_cbid;
+        reply_md.eventq    = connection->c_peer.peer_ni->pni_eq_h;
+
+        rc = PtlMDAttach(reply_me_h, reply_md, PTL_UNLINK, 
+                         &request->rq_reply_md_h);
         if (rc != PTL_OK) {
                 CERROR("PtlMDAttach failed: %d\n", rc);
                 LASSERT (rc == PTL_NOSPACE);
                 LBUG();
-                GOTO(cleanup2, rc -ENOMEM);
+                GOTO(cleanup_me, rc -ENOMEM);
         }
 
         CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64
@@ -676,87 +487,102 @@ int ptl_send_rpc(struct ptlrpc_request *request)
                request->rq_reply_portal,
                connection->c_peer.peer_ni->pni_name);
 
-        ptlrpc_request_addref(request);        /* 1 ref for the SENT callback */
-
-        spin_lock_irqsave (&request->rq_lock, flags);
-        request->rq_receiving_reply = 1;
-        /* Clear any flags that may be present from previous sends. */
-        request->rq_replied = 0;
-        request->rq_err = 0;
-        request->rq_timedout = 0;
-        request->rq_resend = 0;
-        request->rq_restart = 0;
-        spin_unlock_irqrestore (&request->rq_lock, flags);
+        ptlrpc_request_addref(request);        /* +1 ref for the SENT callback */
 
         request->rq_sent = LTIME_S(CURRENT_TIME);
         ptlrpc_pinger_sending_on_import(request->rq_import);
-        rc = ptl_send_buf(request, connection, request->rq_request_portal);
+        rc = ptl_send_buf(&request->rq_req_md_h, 
+                          request->rq_reqmsg, request->rq_reqlen,
+                          PTL_NOACK_REQ, &request->rq_req_cbid, 
+                          connection,
+                          request->rq_request_portal,
+                          request->rq_xid);
         if (rc == 0) {
                 ptlrpc_lprocfs_rpc_sent(request);
                 RETURN(rc);
         }
 
-        spin_lock_irqsave (&request->rq_lock, flags);
-        request->rq_receiving_reply = 0;
-        spin_unlock_irqrestore (&request->rq_lock, flags);
         ptlrpc_req_finished (request);          /* drop callback ref */
- cleanup2:
+
+ cleanup_me:
         /* MEUnlink is safe; the PUT didn't even get off the ground, and
          * nobody apart from the PUT's target has the right nid+XID to
          * access the reply buffer. */
         rc2 = PtlMEUnlink(reply_me_h);
         LASSERT (rc2 == PTL_OK);
- cleanup:
+        /* UNLINKED callback called synchronously */
+        LASSERT (!request->rq_receiving_reply);
+
+ cleanup_repmsg:
         OBD_FREE(request->rq_repmsg, request->rq_replen);
         request->rq_repmsg = NULL;
+
+ cleanup_bulk:
+        if (request->rq_bulk != NULL)
+                ptlrpc_unregister_bulk(request);
+
         return rc;
 }
 
-void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd)
+void ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd)
 {
-        struct ptlrpc_srv_ni *srv_ni = rqbd->rqbd_srv_ni;
-        struct ptlrpc_service *service = srv_ni->sni_service;
-        static ptl_process_id_t match_id = {PTL_NID_ANY, PTL_PID_ANY};
-        int rc;
-        ptl_md_t dummy;
-        ptl_handle_md_t md_h;
-
-        LASSERT(atomic_read(&rqbd->rqbd_refcount) == 0);
+        struct ptlrpc_srv_ni    *srv_ni = rqbd->rqbd_srv_ni;
+        struct ptlrpc_service   *service = srv_ni->sni_service;
+        static ptl_process_id_t  match_id = {PTL_NID_ANY, PTL_PID_ANY};
+        int                      rc;
+        ptl_md_t                 md;
+        ptl_handle_me_t          me_h;
+        unsigned long            flags;
 
         CDEBUG(D_NET, "PtlMEAttach: portal %d on %s h %lx."LPX64"\n",
                service->srv_req_portal, srv_ni->sni_ni->pni_name,
                srv_ni->sni_ni->pni_ni_h.nal_idx,
                srv_ni->sni_ni->pni_ni_h.cookie);
 
-        /* Attach the leading ME on which we build the ring */
         rc = PtlMEAttach(srv_ni->sni_ni->pni_ni_h, service->srv_req_portal,
-                         match_id, 0, ~0,
-                         PTL_UNLINK, PTL_INS_AFTER, &rqbd->rqbd_me_h);
+                         match_id, 0, ~0, PTL_UNLINK, PTL_INS_AFTER, &me_h);
         if (rc != PTL_OK) {
                 CERROR("PtlMEAttach failed: %d\n", rc);
-                /* BUG 1191 */
-                LBUG();
+                GOTO (failed, NULL);
         }
 
-        dummy.start      = rqbd->rqbd_buffer;
-        dummy.length     = service->srv_buf_size;
-        dummy.max_size   = service->srv_max_req_size;
-        dummy.threshold  = PTL_MD_THRESH_INF;
-        dummy.options    = PTL_MD_OP_PUT | PTL_MD_MAX_SIZE | PTL_MD_AUTO_UNLINK;
-        dummy.user_ptr   = rqbd;
-        dummy.eventq     = srv_ni->sni_eq_h;
-
-        atomic_inc(&srv_ni->sni_nrqbds_receiving);
-        atomic_set(&rqbd->rqbd_refcount, 1);   /* 1 ref for portals */
-
-        rc = PtlMDAttach(rqbd->rqbd_me_h, dummy, PTL_UNLINK, &md_h);
-        if (rc != PTL_OK) {
-                CERROR("PtlMDAttach failed: %d\n", rc);
-                LASSERT (rc == PTL_NOSPACE);
-                LBUG();
-                /* BUG 1191 */
-                PtlMEUnlink (rqbd->rqbd_me_h);
-                atomic_set(&rqbd->rqbd_refcount, 0);
-                atomic_dec(&srv_ni->sni_nrqbds_receiving);
+        LASSERT(rqbd->rqbd_refcount == 0);
+        rqbd->rqbd_refcount = 1;
+
+        md.start      = rqbd->rqbd_buffer;
+        md.length     = service->srv_buf_size;
+        md.max_size   = service->srv_max_req_size;
+        md.threshold  = PTL_MD_THRESH_INF;
+        md.options    = PTL_MD_OP_PUT | PTL_MD_MAX_SIZE | PTL_MD_AUTO_UNLINK;
+        md.user_ptr   = &rqbd->rqbd_cbid;
+        md.eventq     = srv_ni->sni_ni->pni_eq_h;
+        
+        spin_lock_irqsave (&service->srv_lock, flags);
+        srv_ni->sni_nrqbd_receiving++;
+        spin_unlock_irqrestore (&service->srv_lock, flags);
+
+        rc = PtlMDAttach(me_h, md, PTL_UNLINK, &rqbd->rqbd_md_h);
+        if (rc == PTL_OK)
+                return;
+        
+        CERROR("PtlMDAttach failed: %d\n", rc);
+        LASSERT (rc == PTL_NOSPACE);
+        rc = PtlMEUnlink (me_h);
+        LASSERT (rc == PTL_OK);
+
+        spin_lock_irqsave (&service->srv_lock, flags);
+        srv_ni->sni_nrqbd_receiving--;
+        if (srv_ni->sni_nrqbd_receiving == 0) {
+                /* This service is off-air on this interface because all
+                 * its request buffers are busy.  Portals will have started
+                 * dropping incoming requests until more buffers get
+                 * posted */
+                CERROR("All %s %s request buffers busy\n",
+                       service->srv_name, srv_ni->sni_ni->pni_name);
         }
+        spin_unlock_irqrestore (&service->srv_lock, flags);
+
+ failed:
+        LBUG();                /* BUG 1191 */
+        /* put req on a retry list? */
 }
index d29fe39..16ca32a 100644 (file)
@@ -31,6 +31,7 @@
 #endif
 
 #include <linux/obd_support.h>
+#include <linux/obd_class.h>
 #include <linux/lustre_net.h>
 
 
@@ -42,54 +43,114 @@ int lustre_msg_swabbed(struct lustre_msg *msg)
         return (msg->magic == __swab32(PTLRPC_MSG_MAGIC));
 }
 
-static int lustre_pack_msg(int count, int *lens, char **bufs, int *len,
-                           struct lustre_msg **msg)
+static void
+lustre_init_msg (struct lustre_msg *msg, int count, int *lens, char **bufs)
 {
         char *ptr;
-        struct lustre_msg *m;
-        int size = 0, i;
-
-        size = HDR_SIZE (count);
+        int   i;
+        
+        msg->magic = PTLRPC_MSG_MAGIC;
+        msg->version = PTLRPC_MSG_VERSION;
+        msg->bufcount = count;
         for (i = 0; i < count; i++)
-                size += size_round(lens[i]);
-
-        *len = size;
+                msg->buflens[i] = lens[i];
 
-        OBD_ALLOC(*msg, *len);
-        if (!*msg)
-                RETURN(-ENOMEM);
-
-        m = *msg;
-        m->magic = PTLRPC_MSG_MAGIC;
-        m->version = PTLRPC_MSG_VERSION;
-        m->bufcount = count;
-        for (i = 0; i < count; i++)
-                m->buflens[i] = lens[i];
+        if (bufs == NULL)
+                return;
 
-        ptr = (char *)m + HDR_SIZE(count);
+        ptr = (char *)msg + HDR_SIZE(count);
         for (i = 0; i < count; i++) {
-                char *tmp = NULL;
-                if (bufs)
-                        tmp = bufs[i];
+                char *tmp = bufs[i];
                 LOGL(tmp, lens[i], ptr);
-
         }
+}
+
+int lustre_pack_request (struct ptlrpc_request *req, 
+                         int count, int *lens, char **bufs)
+{
+        ENTRY;
+        
+        req->rq_reqlen = lustre_msg_size (count, lens);
+        OBD_ALLOC(req->rq_reqmsg, req->rq_reqlen);
+        if (req->rq_reqmsg == NULL)
+                RETURN(-ENOMEM);
 
-        return 0;
+        lustre_init_msg (req->rq_reqmsg, count, lens, bufs);
+        RETURN (0);
 }
 
-int lustre_pack_request(struct ptlrpc_request *req, int count, int *lens,
-                        char **bufs)
+#if RS_DEBUG
+LIST_HEAD(ptlrpc_rs_debug_lru);
+spinlock_t ptlrpc_rs_debug_lock = SPIN_LOCK_UNLOCKED;
+
+#define PTLRPC_RS_DEBUG_LRU_ADD(rs)                                     \
+do {                                                                    \
+        unsigned long __flags;                                          \
+                                                                        \
+        spin_lock_irqsave(&ptlrpc_rs_debug_lock, __flags);              \
+        list_add_tail(&(rs)->rs_debug_list, &ptlrpc_rs_debug_lru);      \
+        spin_unlock_irqrestore(&ptlrpc_rs_debug_lock, __flags);         \
+} while (0)
+
+#define PTLRPC_RS_DEBUG_LRU_DEL(rs)                                     \
+do {                                                                    \
+        unsigned long __flags;                                          \
+                                                                        \
+        spin_lock_irqsave(&ptlrpc_rs_debug_lock, __flags);              \
+        list_del(&(rs)->rs_debug_list);                                 \
+        spin_unlock_irqrestore(&ptlrpc_rs_debug_lock, __flags);         \
+} while (0)
+#else
+# define PTLRPC_RS_DEBUG_LRU_ADD(rs) do {} while(0)
+# define PTLRPC_RS_DEBUG_LRU_DEL(rs) do {} while(0)
+#endif
+
+int lustre_pack_reply (struct ptlrpc_request *req,
+                       int count, int *lens, char **bufs)
 {
-        return lustre_pack_msg(count, lens, bufs, &req->rq_reqlen,
-                               &req->rq_reqmsg);
+        struct ptlrpc_reply_state *rs;
+        int                        msg_len;
+        int                        size;
+        ENTRY;
+
+        LASSERT (req->rq_reply_state == NULL);
+
+        msg_len = lustre_msg_size (count, lens);
+        size = offsetof (struct ptlrpc_reply_state, rs_msg) + msg_len;
+        OBD_ALLOC (rs, size);
+        if (rs == NULL)
+                RETURN (-ENOMEM);
+
+        rs->rs_cb_id.cbid_fn = reply_out_callback;
+        rs->rs_cb_id.cbid_arg = rs;
+        rs->rs_srv_ni = req->rq_rqbd->rqbd_srv_ni;
+        rs->rs_size = size;
+        INIT_LIST_HEAD(&rs->rs_exp_list);
+        INIT_LIST_HEAD(&rs->rs_obd_list);
+
+        req->rq_replen = msg_len;
+        req->rq_reply_state = rs;
+        req->rq_repmsg = &rs->rs_msg;
+        lustre_init_msg (&rs->rs_msg, count, lens, bufs);
+
+        PTLRPC_RS_DEBUG_LRU_ADD(rs);
+
+        RETURN (0);
 }
 
-int lustre_pack_reply(struct ptlrpc_request *req, int count, int *lens,
-                      char **bufs)
+void lustre_free_reply_state (struct ptlrpc_reply_state *rs)
 {
-        return lustre_pack_msg(count, lens, bufs, &req->rq_replen,
-                               &req->rq_repmsg);
+        PTLRPC_RS_DEBUG_LRU_DEL(rs);
+
+        LASSERT (!rs->rs_difficult || rs->rs_handled);
+        LASSERT (!rs->rs_on_net);
+        LASSERT (!rs->rs_scheduled);
+        LASSERT (rs->rs_export == NULL);
+        LASSERT (rs->rs_nlocks == 0);
+        LASSERT (list_empty(&rs->rs_exp_list));
+        LASSERT (list_empty(&rs->rs_obd_list));
+
+        OBD_FREE (rs, rs->rs_size);
 }
 
 /* This returns the size of the buffer that is required to hold a lustre_msg
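lustre_pack_reply() above now allocates a ptlrpc_reply_state with the wire message embedded at its tail (offsetof of rs_msg plus the message length), so req->rq_repmsg points into the reply state and lustre_free_reply_state() releases everything in one OBD_FREE. A standalone sketch of that layout with much-simplified structures:

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct msg {
        unsigned int magic;
        unsigned int buflen;
        /* buflen bytes of payload follow immediately after this header */
};

struct reply_state {
        size_t     size;        /* total allocation, recorded for freeing */
        int        on_net;
        struct msg rs_msg;      /* wire message embedded at the tail */
};

static struct reply_state *pack_reply(unsigned int payload_len)
{
        size_t msg_len = sizeof(struct msg) + payload_len;
        size_t size = offsetof(struct reply_state, rs_msg) + msg_len;
        struct reply_state *rs = calloc(1, size);

        if (rs == NULL)
                return NULL;
        rs->size = size;
        rs->rs_msg.magic = 0x0BD00BD0;
        rs->rs_msg.buflen = payload_len;
        return rs;
}

int main(void)
{
        struct reply_state *rs = pack_reply(64);

        if (rs == NULL)
                return 1;
        /* req->rq_repmsg would point at &rs->rs_msg */
        printf("reply state %zu bytes, payload %u bytes\n",
               rs->size, rs->rs_msg.buflen);
        free(rs);               /* analogue of lustre_free_reply_state() */
        return 0;
}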
index 3caf74e..ab85900 100644 (file)
 #include <linux/obd_class.h>
 #include "ptlrpc_internal.h"
 
-#ifdef __KERNEL__
-
-static struct ptlrpc_thread *pinger_thread = NULL;
 static DECLARE_MUTEX(pinger_sem);
 static struct list_head pinger_imports = LIST_HEAD_INIT(pinger_imports);
 
+#ifdef __KERNEL__
+static struct ptlrpc_thread *pinger_thread = NULL;
+
 static int ptlrpc_pinger_main(void *arg)
 {
         struct ptlrpc_svc_data *data = (struct ptlrpc_svc_data *)arg;
@@ -307,30 +307,219 @@ int ptlrpc_pinger_del_import(struct obd_import *imp)
         RETURN(0);
 }
 
-#else /* !__KERNEL__ */
+#else
+/* XXX
+ * the current implementation of pinger in liblustre is not optimized
+ */
+
+static struct pinger_data {
+        int             pd_recursion;
+        unsigned long   pd_this_ping;
+        unsigned long   pd_next_ping;
+        struct ptlrpc_request_set *pd_set;
+} pinger_args;
+
+static int pinger_check_rpcs(void *arg)
+{
+        unsigned long curtime = time(NULL);
+        struct ptlrpc_request *req;
+        struct ptlrpc_request_set *set;
+        struct list_head *iter;
+        struct pinger_data *pd = &pinger_args;
+        int rc;
+
+        /* prevent recursion */
+        if (pd->pd_recursion++) {
+                CDEBUG(D_HA, "pinger: recursion! quit\n");
+                LASSERT(pd->pd_set);
+                pd->pd_recursion--;
+                return 0;
+        }
+
+        /* have we reached ping point? */
+        if (!pd->pd_set && pd->pd_next_ping > curtime) {
+                pd->pd_recursion--;
+                return 0;
+        }
+
+        /* if we have rpc_set already, continue processing it */
+        if (pd->pd_set) {
+                LASSERT(pd->pd_this_ping);
+                set = pd->pd_set;
+                goto do_check_set;
+        }
+
+        pd->pd_this_ping = curtime;
+        pd->pd_set = ptlrpc_prep_set();
+        set = pd->pd_set;
+
+        /* add rpcs into set */
+        down(&pinger_sem);
+        list_for_each(iter, &pinger_imports) {
+                struct obd_import *imp =
+                        list_entry(iter, struct obd_import,
+                                   imp_pinger_chain);
+                int generation, level;
+                unsigned long flags;
+
+                if (imp->imp_next_ping <= pd->pd_this_ping) {
+                        /* Add a ping. */
+                        spin_lock_irqsave(&imp->imp_lock, flags);
+                        generation = imp->imp_generation;
+                        level = imp->imp_state;
+                        spin_unlock_irqrestore(&imp->imp_lock, flags);
+
+                        if (level != LUSTRE_IMP_FULL) {
+                                CDEBUG(D_HA,
+                                       "not pinging %s (in recovery)\n",
+                                       imp->imp_target_uuid.uuid);
+                                continue;
+                        }
+
+                        req = ptlrpc_prep_req(imp, OBD_PING, 0, NULL,
+                                              NULL);
+                        if (!req) {
+                                CERROR("out of memory\n");
+                                break;
+                        }
+                        req->rq_no_resend = 1;
+                        req->rq_replen = lustre_msg_size(0, NULL);
+                        req->rq_send_state = LUSTRE_IMP_FULL;
+                        req->rq_phase = RQ_PHASE_RPC;
+                        req->rq_import_generation = generation;
+                        ptlrpc_set_add_req(set, req);
+                } else {
+                        CDEBUG(D_HA, "don't need to ping %s (%lu > "
+                               "%lu)\n", imp->imp_target_uuid.uuid,
+                               imp->imp_next_ping, pd->pd_this_ping);
+                }
+        }
+        pd->pd_this_ping = curtime;
+        up(&pinger_sem);
+
+        /* Might be empty, that's OK. */
+        if (set->set_remaining == 0)
+                CDEBUG(D_HA, "nothing to ping\n");
+
+        list_for_each(iter, &set->set_requests) {
+                struct ptlrpc_request *req =
+                        list_entry(iter, struct ptlrpc_request,
+                                   rq_set_chain);
+                DEBUG_REQ(D_HA, req, "pinging %s->%s",
+                          req->rq_import->imp_obd->obd_uuid.uuid,
+                          req->rq_import->imp_target_uuid.uuid);
+                (void)ptl_send_rpc(req);
+        }
+
+do_check_set:
+        rc = ptlrpc_check_set(set);
+
+        /* not finished, and we are not expired, simply return */
+        if (!rc && curtime < pd->pd_this_ping + obd_timeout) {
+                CDEBUG(D_HA, "not finished, but also not expired\n");
+                pd->pd_recursion--;
+                return 0;
+        }
+
+        /* Expire all the requests that didn't come back. */
+        down(&pinger_sem);
+        list_for_each(iter, &set->set_requests) {
+                req = list_entry(iter, struct ptlrpc_request,
+                                 rq_set_chain);
+
+                if (req->rq_replied)
+                        continue;
+
+                req->rq_phase = RQ_PHASE_COMPLETE;
+                set->set_remaining--;
+                /* If it was disconnected, don't sweat it. */
+                if (list_empty(&req->rq_import->imp_pinger_chain)) {
+                        ptlrpc_unregister_reply(req);
+                        continue;
+                }
+
+                CDEBUG(D_HA, "pinger initiate expire_one_request\n");
+                ptlrpc_expire_one_request(req);
+        }
+        up(&pinger_sem);
+
+        ptlrpc_set_destroy(set);
+        pd->pd_set = NULL;
+
+        pd->pd_next_ping = pd->pd_this_ping + obd_timeout;
+        pd->pd_this_ping = 0; /* XXX for debug */
+
+        CDEBUG(D_HA, "finished a round ping\n");
+        pd->pd_recursion--;
+        return 0;
+}
+
+static void *pinger_callback = NULL;
 
 int ptlrpc_start_pinger(void)
 {
+        memset(&pinger_args, 0, sizeof(pinger_args));
+#ifdef ENABLE_PINGER
+        pinger_callback =
+                liblustre_register_wait_callback(&pinger_check_rpcs, &pinger_args);
+#endif
+        obd_timeout = 10;
         return 0;
 }
 
 int ptlrpc_stop_pinger(void)
 {
+#ifdef ENABLE_PINGER
+        if (pinger_callback)
+                liblustre_deregister_wait_callback(pinger_callback);
+#endif
         return 0;
 }
 
-int ptlrpc_pinger_add_import(struct obd_import *imp)
+void ptlrpc_pinger_sending_on_import(struct obd_import *imp)
 {
-        return 0;
+        down(&pinger_sem);
+        imp->imp_next_ping = time(NULL) + obd_timeout;
+        if (pinger_args.pd_set == NULL &&
+            pinger_args.pd_next_ping > imp->imp_next_ping) {
+                CDEBUG(D_HA, "set next ping to %ld(cur %ld)\n",
+                        imp->imp_next_ping, time(NULL));
+                pinger_args.pd_next_ping = imp->imp_next_ping;
+        }
+        up(&pinger_sem);
 }
 
-int ptlrpc_pinger_del_import(struct obd_import *imp)
+int ptlrpc_pinger_add_import(struct obd_import *imp)
 {
-        return 0;
+        ENTRY;
+        if (!list_empty(&imp->imp_pinger_chain))
+                RETURN(-EALREADY);
+
+        CDEBUG(D_HA, "adding pingable import %s->%s\n",
+               imp->imp_obd->obd_uuid.uuid, imp->imp_target_uuid.uuid);
+        ptlrpc_pinger_sending_on_import(imp);
+
+        down(&pinger_sem);
+        list_add_tail(&imp->imp_pinger_chain, &pinger_imports);
+        class_import_get(imp);
+        up(&pinger_sem);
+
+        RETURN(0);
 }
 
-void ptlrpc_pinger_sending_on_import(struct obd_import *imp)
+int ptlrpc_pinger_del_import(struct obd_import *imp)
 {
+        ENTRY;
+        if (list_empty(&imp->imp_pinger_chain))
+                RETURN(-ENOENT);
+
+        down(&pinger_sem);
+        list_del_init(&imp->imp_pinger_chain);
+        CDEBUG(D_HA, "removing pingable import %s->%s\n",
+               imp->imp_obd->obd_uuid.uuid, imp->imp_target_uuid.uuid);
+        class_import_put(imp);
+        up(&pinger_sem);
+        RETURN(0);
 }
 
-#endif
+#endif /* !__KERNEL__ */
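The liblustre branch above has no pinger thread: pinger_check_rpcs() is registered as a wait callback and does its work opportunistically, building a set of OBD_PING requests when the interval expires, pushing that set on later calls, and expiring stragglers after obd_timeout. A poll-driven standalone sketch of that state machine (all names are simplified stand-ins):

#include <stdio.h>
#include <time.h>

struct pinger {
        int    in_progress;     /* a round of pings is outstanding */
        time_t this_ping;
        time_t next_ping;
        int    interval;        /* stand-in for obd_timeout */
};

static int check_pings(struct pinger *pd)
{
        time_t now = time(NULL);

        if (!pd->in_progress && now < pd->next_ping)
                return 0;                       /* nothing due yet */

        if (!pd->in_progress) {                 /* start a new round */
                pd->this_ping = now;
                pd->in_progress = 1;
                printf("sending pings\n");
        }

        /* the RPC set would be checked here; give it up to interval secs */
        if (now < pd->this_ping + pd->interval)
                return 0;

        printf("round finished, expiring stragglers\n");
        pd->in_progress = 0;
        pd->next_ping = pd->this_ping + pd->interval;
        return 0;
}

int main(void)
{
        struct pinger pd = { 0, 0, 0, 10 };

        check_pings(&pd);       /* would be invoked from the wait loop */
        return 0;
}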
index 7ec9bbe..d42eb65 100644 (file)
@@ -99,9 +99,9 @@ static inline int opcode_offset(__u32 opc) {
                             (OBD_LAST_OPC - OBD_FIRST_OPC))
 
 enum {
-        PTLRPC_REQWAIT_CNTR     = 0,
-        PTLRPC_SVCIDLETIME_CNTR = 1,
-        //PTLRPC_SVCEQDEPTH_CNTR,
+        PTLRPC_REQWAIT_CNTR = 0,
+        PTLRPC_REQQDEPTH_CNTR,
+        PTLRPC_REQACTIVE_CNTR,
         PTLRPC_LAST_CNTR
 };
 
index bfe525c..519b434 100644 (file)
@@ -83,16 +83,15 @@ EXPORT_SYMBOL(ptlrpc_init_connection);
 EXPORT_SYMBOL(ptlrpc_cleanup_connection);
 
 /* niobuf.c */
-EXPORT_SYMBOL(ptlrpc_bulk_put);
-EXPORT_SYMBOL(ptlrpc_bulk_get);
+EXPORT_SYMBOL(ptlrpc_start_bulk_transfer);
 EXPORT_SYMBOL(ptlrpc_abort_bulk);
 EXPORT_SYMBOL(ptlrpc_register_bulk);
 EXPORT_SYMBOL(ptlrpc_unregister_bulk);
+EXPORT_SYMBOL(ptlrpc_send_reply);
 EXPORT_SYMBOL(ptlrpc_reply);
 EXPORT_SYMBOL(ptlrpc_error);
 EXPORT_SYMBOL(ptlrpc_resend_req);
 EXPORT_SYMBOL(ptl_send_rpc);
-EXPORT_SYMBOL(ptlrpc_link_svc_me);
 
 /* client.c */
 EXPORT_SYMBOL(ptlrpc_init_client);
@@ -111,7 +110,6 @@ EXPORT_SYMBOL(ptlrpc_prep_bulk_imp);
 EXPORT_SYMBOL(ptlrpc_prep_bulk_exp);
 EXPORT_SYMBOL(ptlrpc_free_bulk);
 EXPORT_SYMBOL(ptlrpc_prep_bulk_page);
-EXPORT_SYMBOL(ptlrpc_free_bulk_page);
 EXPORT_SYMBOL(ptlrpc_abort_inflight);
 EXPORT_SYMBOL(ptlrpc_retain_replayable_request);
 EXPORT_SYMBOL(ptlrpc_next_xid);
@@ -128,6 +126,9 @@ EXPORT_SYMBOL(ptlrpc_interrupted_set);
 EXPORT_SYMBOL(ptlrpc_mark_interrupted);
 
 /* service.c */
+EXPORT_SYMBOL(ptlrpc_save_lock);
+EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply);
+EXPORT_SYMBOL(ptlrpc_commit_replies);
 EXPORT_SYMBOL(ptlrpc_init_svc);
 EXPORT_SYMBOL(ptlrpc_stop_all_threads);
 EXPORT_SYMBOL(ptlrpc_start_n_threads);
@@ -138,6 +139,7 @@ EXPORT_SYMBOL(ptlrpc_unregister_service);
 EXPORT_SYMBOL(lustre_msg_swabbed);
 EXPORT_SYMBOL(lustre_pack_request);
 EXPORT_SYMBOL(lustre_pack_reply);
+EXPORT_SYMBOL(lustre_free_reply_state);
 EXPORT_SYMBOL(lustre_msg_size);
 EXPORT_SYMBOL(lustre_unpack_msg);
 EXPORT_SYMBOL(lustre_msg_buf);
index 7b56097..4e688a8 100644 (file)
@@ -162,7 +162,9 @@ static int ptlrpcd(void *arg)
                 if (test_bit(LIOD_STOP, &pc->pc_flags))
                         break;
         }
-        /* XXX should be making sure we don't have anything in flight */
+        /* wait for inflight requests to drain */
+        if (!list_empty(&pc->pc_set->set_requests))
+                ptlrpc_set_wait(pc->pc_set);
         complete(&pc->pc_finishing);
         return 0;
 }
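The ptlrpcd hunk above replaces the old XXX comment with an actual drain: after the stop flag is seen, any requests still in the daemon's set are waited for before completion is signalled. A tiny standalone sketch of that shutdown ordering (illustrative types, not the real ptlrpcd ones):

#include <stdio.h>

struct set { int remaining; };

static void set_wait(struct set *s)
{
        while (s->remaining > 0) {
                printf("waiting, %d request(s) in flight\n", s->remaining);
                s->remaining--;         /* pretend a reply arrived */
        }
}

static void ptlrpcd_exit(struct set *s)
{
        if (s->remaining > 0)           /* request list not empty */
                set_wait(s);
        printf("ptlrpcd finished\n");   /* analogue of complete(&pc->pc_finishing) */
}

int main(void)
{
        struct set s = { 2 };

        ptlrpcd_exit(&s);
        return 0;
}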
index 6b069a5..76469cb 100644 (file)
@@ -113,6 +113,10 @@ void ptlrpc_run_failed_import_upcall(struct obd_import* imp)
                        argv[0], argv[1], argv[2], argv[3], argv[4]);
         }
 #else
+        if (imp->imp_state == LUSTRE_IMP_CLOSED) {
+                EXIT;
+                return;
+        }
         ptlrpc_recover_import(imp, NULL);
 #endif
 }
@@ -215,13 +219,8 @@ void ptlrpc_wake_delayed(struct obd_import *imp)
         list_for_each_safe(tmp, pos, &imp->imp_delayed_list) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
 
-                if (req->rq_set) {
-                        DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set);
-                        wake_up(&req->rq_set->set_waitq);
-                } else {
-                        DEBUG_REQ(D_HA, req, "waking:");
-                        wake_up(&req->rq_reply_waitq);
-                }
+                DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set);
+                ptlrpc_wake_client_req(req);
         }
         spin_unlock_irqrestore(&imp->imp_lock, flags);
 }
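
The two wake-up cases that used to be open-coded here are now behind
ptlrpc_wake_client_req().  Its body is not part of this excerpt; a plausible
reading, consistent with the code it replaces, is simply:

/* Assumed shape of the helper (not shown in this diff): wake whoever is
 * waiting on the request -- the set's waitqueue if the request belongs to
 * a set, otherwise the per-request reply waitqueue. */
void ptlrpc_wake_client_req(struct ptlrpc_request *req)
{
        if (req->rq_set != NULL)
                wake_up(&req->rq_set->set_waitq);
        else
                wake_up(&req->rq_reply_waitq);
}
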
index 979355c..e07cae9 100644 (file)
 #include <portals/types.h>
 #include "ptlrpc_internal.h"
 
-extern int request_in_callback(ptl_event_t *ev);
+static LIST_HEAD (ptlrpc_all_services);
+static spinlock_t ptlrpc_all_services_lock = SPIN_LOCK_UNLOCKED;
 
-static int ptlrpc_check_event(struct ptlrpc_service *svc,
-                              struct ptlrpc_thread *thread, ptl_event_t *event)
+static void
+ptlrpc_free_server_req (struct ptlrpc_request *req)
 {
-        struct ptlrpc_srv_ni *srv_ni;
-        int i, idx, rc;
-        ENTRY;
+        /* The last request to be received into a request buffer uses space
+         * in the request buffer descriptor; otherwise requests are
+         * allocated dynamically in the incoming request event handler */
+        if (req == &req->rq_rqbd->rqbd_req)
+                return;
 
-        spin_lock(&svc->srv_lock);
+        OBD_FREE(req, sizeof(*req));
+}
+        
+static char *
+ptlrpc_alloc_request_buffer (int size)
+{
+        char *ptr;
+        
+        if (size > SVC_BUF_VMALLOC_THRESHOLD)
+                OBD_VMALLOC(ptr, size);
+        else
+                OBD_ALLOC(ptr, size);
+        
+        return (ptr);
+}
 
-        if (thread->t_flags & SVC_STOPPING)
-                GOTO(out, rc = 1);
+static void
+ptlrpc_free_request_buffer (char *ptr, int size)
+{
+        if (size > SVC_BUF_VMALLOC_THRESHOLD)
+                OBD_VFREE(ptr, size);
+        else
+                OBD_FREE(ptr, size);
+}
 
-        LASSERT ((thread->t_flags & SVC_EVENT) == 0);
-        LASSERT (ptlrpc_ninterfaces > 0);
+struct ptlrpc_request_buffer_desc *
+ptlrpc_alloc_rqbd (struct ptlrpc_srv_ni *srv_ni)
+{
+        struct ptlrpc_service             *svc = srv_ni->sni_service;
+        unsigned long                      flags;
+        struct ptlrpc_request_buffer_desc *rqbd;
+
+        OBD_ALLOC(rqbd, sizeof (*rqbd));
+        if (rqbd == NULL)
+                return (NULL);
+
+        rqbd->rqbd_srv_ni = srv_ni;
+        rqbd->rqbd_refcount = 0;
+        rqbd->rqbd_cbid.cbid_fn = request_in_callback;
+        rqbd->rqbd_cbid.cbid_arg = rqbd;
+        rqbd->rqbd_buffer = ptlrpc_alloc_request_buffer(svc->srv_buf_size);
+
+        if (rqbd->rqbd_buffer == NULL) {
+                OBD_FREE(rqbd, sizeof (*rqbd));
+                return (NULL);
+        }
 
-        for (i = 0; i < ptlrpc_ninterfaces; i++) {
-                idx = (svc->srv_interface_rover + i) % ptlrpc_ninterfaces;
-                srv_ni = &svc->srv_interfaces[idx];
+        spin_lock_irqsave (&svc->srv_lock, flags);
+        list_add(&rqbd->rqbd_list, &srv_ni->sni_rqbds);
+        svc->srv_nbufs++;
+        spin_unlock_irqrestore (&svc->srv_lock, flags);
 
-                LASSERT (!PtlHandleEqual (srv_ni->sni_eq_h, PTL_HANDLE_NONE));
+        return (rqbd);
+}
 
-                rc = PtlEQGet(srv_ni->sni_eq_h, event);
-                switch (rc) {
-                case PTL_OK:
-                        /* next time start with the next interface */
-                        svc->srv_interface_rover = (idx+1) % ptlrpc_ninterfaces;
-                        thread->t_flags |= SVC_EVENT;
-                        GOTO(out, rc = 1);
+void
+ptlrpc_free_rqbd (struct ptlrpc_request_buffer_desc *rqbd) 
+{
+        struct ptlrpc_srv_ni  *sni = rqbd->rqbd_srv_ni;
+        struct ptlrpc_service *svc = sni->sni_service;
+        unsigned long          flags;
+        
+        LASSERT (rqbd->rqbd_refcount == 0);
+
+        spin_lock_irqsave(&svc->srv_lock, flags);
+        list_del(&rqbd->rqbd_list);
+        svc->srv_nbufs--;
+        spin_unlock_irqrestore(&svc->srv_lock, flags);
+
+        ptlrpc_free_request_buffer (rqbd->rqbd_buffer, svc->srv_buf_size);
+        OBD_FREE (rqbd, sizeof (*rqbd));
+}
 
-                case PTL_EQ_EMPTY:
-                        continue;
+void
+ptlrpc_save_lock (struct ptlrpc_request *req, 
+                  struct lustre_handle *lock, int mode)
+{
+        struct ptlrpc_reply_state *rs = req->rq_reply_state;
+        int                        idx;
 
-                case PTL_EQ_DROPPED:
-                        CWARN("Event queue overflow (bug 2125): timeouts will "
-                              "follow.\n");
-                        continue;
+        LASSERT (rs != NULL);
+        LASSERT (rs->rs_nlocks < RS_MAX_LOCKS);
+
+        idx = rs->rs_nlocks++;
+        rs->rs_locks[idx] = *lock;
+        rs->rs_modes[idx] = mode;
+        rs->rs_difficult = 1;
+}
+
+void
+ptlrpc_schedule_difficult_reply (struct ptlrpc_reply_state *rs)
+{
+        struct ptlrpc_service *svc = rs->rs_srv_ni->sni_service;
+
+#ifdef CONFIG_SMP
+        LASSERT (spin_is_locked (&svc->srv_lock));
+#endif
+        LASSERT (rs->rs_difficult);
+        rs->rs_scheduled_ever = 1;              /* flag any notification attempt */
+
+        if (rs->rs_scheduled)                   /* being set up or already notified */
+                return;
+
+        rs->rs_scheduled = 1;
+        list_del (&rs->rs_list);
+        list_add (&rs->rs_list, &svc->srv_reply_queue);
+        wake_up (&svc->srv_waitq);
+}
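
ptlrpc_save_lock() is what turns an ordinary reply into a "difficult" one:
the handler parks up to RS_MAX_LOCKS lock references on the reply state, and
they are only released once the reply itself has been dealt with (see
ptlrpc_server_handle_reply() below).  A minimal sketch of a handler using it,
with the handler and lock-taking helper purely illustrative:

/* Hypothetical handler fragment: keep an LDLM lock pinned until the reply
 * has been handled, instead of dropping the reference here. */
static int example_handler(struct ptlrpc_request *req)
{
        struct lustre_handle lockh;
        int rc;

        rc = example_take_lock(&lockh);         /* illustrative helper */
        if (rc != 0)
                return rc;

        /* hand the reference to the reply state; it is decref'd when the
         * difficult reply is finally processed */
        ptlrpc_save_lock(req, &lockh, LCK_EX);
        return 0;
}
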
+
+void 
+ptlrpc_commit_replies (struct obd_device *obd)
+{
+        struct list_head   *tmp;
+        struct list_head   *nxt;
+        unsigned long       flags;
+        
+        /* Find any replies that have been committed and get their service
+         * to attend to completing them. */
+
+        /* CAVEAT EMPTOR: spinlock ordering!!! */
+        spin_lock_irqsave (&obd->obd_uncommitted_replies_lock, flags);
+
+        list_for_each_safe (tmp, nxt, &obd->obd_uncommitted_replies) {
+                struct ptlrpc_reply_state *rs =
+                        list_entry (tmp, struct ptlrpc_reply_state, rs_obd_list);
+
+                LASSERT (rs->rs_difficult);
 
-                default:
-                        CERROR("BUG: PtlEQGet returned %d\n", rc);
-                        LBUG();
+                if (rs->rs_transno <= obd->obd_last_committed) {
+                        struct ptlrpc_service *svc = rs->rs_srv_ni->sni_service;
+
+                        spin_lock (&svc->srv_lock);
+                        list_del_init (&rs->rs_obd_list);
+                        ptlrpc_schedule_difficult_reply (rs);
+                        spin_unlock (&svc->srv_lock);
                 }
         }
-        rc = 0;
-        EXIT;
- out:
-        spin_unlock(&svc->srv_lock);
-        return rc;
+        
+        spin_unlock_irqrestore (&obd->obd_uncommitted_replies_lock, flags);
+}
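
ptlrpc_commit_replies() is expected to run whenever obd_last_committed
advances; all it does is move newly committed difficult replies onto their
service's reply queue and wake a service thread to finish them.  The real
call site is outside this excerpt; an illustrative shape, ignoring the
locking the real code would need around obd_last_committed:

/* Illustrative commit-callback shape only */
static void example_commit_callback(struct obd_device *obd, __u64 transno)
{
        if (transno > obd->obd_last_committed)
                obd->obd_last_committed = transno;

        ptlrpc_commit_replies(obd);     /* release replies now safe to ack */
}
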
+
+static long
+timeval_sub(struct timeval *large, struct timeval *small)
+{
+        return (large->tv_sec - small->tv_sec) * 1000000 +
+                (large->tv_usec - small->tv_usec);
 }
 
-struct ptlrpc_service * ptlrpc_init_svc(__u32 nevents, __u32 nbufs,
-                                        __u32 bufsize, __u32 max_req_size,
-                                        int req_portal, int rep_portal,
-                                        svc_handler_t handler, char *name,
-                                        struct proc_dir_entry *proc_entry)
+struct ptlrpc_service *
+ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size,
+                int req_portal, int rep_portal, 
+                svc_handler_t handler, char *name,
+                struct proc_dir_entry *proc_entry)
 {
-        int i, j, ssize, rc;
-        struct ptlrpc_service *service;
-        struct ptlrpc_srv_ni  *srv_ni;
+        int                                i;
+        int                                j;
+        int                                ssize;
+        struct ptlrpc_service             *service;
+        struct ptlrpc_srv_ni              *srv_ni;
+        struct ptlrpc_request_buffer_desc *rqbd;
         ENTRY;
 
         LASSERT (ptlrpc_ninterfaces > 0);
-
+        LASSERT (nbufs > 0);
+        LASSERT (bufsize >= max_req_size);
+        
         ssize = offsetof (struct ptlrpc_service,
                           srv_interfaces[ptlrpc_ninterfaces]);
         OBD_ALLOC(service, ssize);
@@ -108,11 +222,12 @@ struct ptlrpc_service * ptlrpc_init_svc(__u32 nevents, __u32 nbufs,
 
         service->srv_max_req_size = max_req_size;
         service->srv_buf_size = bufsize;
-
         service->srv_rep_portal = rep_portal;
         service->srv_req_portal = req_portal;
         service->srv_handler = handler;
-        service->srv_interface_rover = 0;
+
+        INIT_LIST_HEAD(&service->srv_request_queue);
+        INIT_LIST_HEAD(&service->srv_reply_queue);
 
         /* First initialise enough for early teardown */
         for (i = 0; i < ptlrpc_ninterfaces; i++) {
@@ -120,56 +235,31 @@ struct ptlrpc_service * ptlrpc_init_svc(__u32 nevents, __u32 nbufs,
 
                 srv_ni->sni_service = service;
                 srv_ni->sni_ni = &ptlrpc_interfaces[i];
-                srv_ni->sni_eq_h = PTL_HANDLE_NONE;
                 INIT_LIST_HEAD(&srv_ni->sni_rqbds);
-                srv_ni->sni_nrqbds = 0;
-                atomic_set(&srv_ni->sni_nrqbds_receiving, 0);
+                INIT_LIST_HEAD(&srv_ni->sni_active_replies);
         }
 
-        /* Now allocate the event queue and request buffers, assuming all
-         * interfaces require the same level of buffering. */
+        spin_lock (&ptlrpc_all_services_lock);
+        list_add (&service->srv_list, &ptlrpc_all_services);
+        spin_unlock (&ptlrpc_all_services_lock);
+        
+        /* Now allocate the request buffers, assuming all interfaces require
+         * the same number. */
         for (i = 0; i < ptlrpc_ninterfaces; i++) {
                 srv_ni = &service->srv_interfaces[i];
                 CDEBUG (D_NET, "%s: initialising interface %s\n", name,
                         srv_ni->sni_ni->pni_name);
 
-                rc = PtlEQAlloc(srv_ni->sni_ni->pni_ni_h, nevents,
-                                request_in_callback, &(srv_ni->sni_eq_h));
-                if (rc != PTL_OK) {
-                        CERROR("%s.%d: PtlEQAlloc on %s failed: %d\n",
-                               name, i, srv_ni->sni_ni->pni_name, rc);
-                        GOTO (failed, NULL);
-                }
-
                 for (j = 0; j < nbufs; j++) {
-                        struct ptlrpc_request_buffer_desc *rqbd;
-
-                        OBD_ALLOC_WAIT(rqbd, sizeof(*rqbd));
+                        rqbd = ptlrpc_alloc_rqbd (srv_ni);
+                        
                         if (rqbd == NULL) {
-                                CERROR ("%s.%d: Can't allocate request "
-                                        "descriptor %d on %s\n",
-                                        name, i, srv_ni->sni_nrqbds,
+                                CERROR ("%s.%d: Can't allocate request %d "
+                                        "on %s\n", name, i, j, 
                                         srv_ni->sni_ni->pni_name);
                                 GOTO(failed, NULL);
                         }
-
-                        rqbd->rqbd_srv_ni = srv_ni;
-                        rqbd->rqbd_me_h = PTL_HANDLE_NONE;
-                        atomic_set(&rqbd->rqbd_refcount, 0);
-
-                        OBD_ALLOC_WAIT(rqbd->rqbd_buffer, service->srv_buf_size);
-                        if (rqbd->rqbd_buffer == NULL) {
-                                CERROR ("%s.%d: Can't allocate request "
-                                        "buffer %d on %s\n",
-                                        name, i, srv_ni->sni_nrqbds,
-                                        srv_ni->sni_ni->pni_name);
-                                OBD_FREE(rqbd, sizeof(*rqbd));
-                                GOTO(failed, NULL);
-                        }
-                        list_add(&rqbd->rqbd_list, &srv_ni->sni_rqbds);
-                        srv_ni->sni_nrqbds++;
-
-                        ptlrpc_link_svc_me(rqbd);
+                        ptlrpc_register_rqbd (rqbd);
                 }
         }
 
@@ -185,30 +275,47 @@ failed:
         return NULL;
 }
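
The constructor above now takes an explicit request-buffer count rather than
an event-queue size.  A hedged usage sketch follows; the constants, portal
arguments and handler are placeholders, not values from this patch:

/* Illustrative only: instantiating a service with the new signature. */
#define EXAMPLE_NBUFS      64
#define EXAMPLE_BUFSIZE    (8 * 1024)
#define EXAMPLE_MAXREQSIZE (1 * 1024)

static int example_handle_request(struct ptlrpc_request *req);
static struct ptlrpc_service *example_svc;

static int example_setup_service(int req_portal, int rep_portal)
{
        example_svc = ptlrpc_init_svc(EXAMPLE_NBUFS, EXAMPLE_BUFSIZE,
                                      EXAMPLE_MAXREQSIZE, req_portal,
                                      rep_portal, example_handle_request,
                                      "example_svc", NULL);
        return (example_svc == NULL) ? -ENOMEM : 0;
}
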
 
-static int handle_incoming_request(struct obd_device *obddev,
-                                   struct ptlrpc_service *svc,
-                                   ptl_event_t *event,
-                                   struct ptlrpc_request *request)
+static int 
+ptlrpc_server_handle_request (struct ptlrpc_service *svc)
 {
-        struct ptlrpc_request_buffer_desc *rqbd = event->mem_desc.user_ptr;
-        int rc;
-
-        /* FIXME: If we move to an event-driven model, we should put the request
-         * on the stack of mds_handle instead. */
+        struct ptlrpc_request *request;
+        unsigned long          flags;
+        struct timeval         work_start;
+        struct timeval         work_end;
+        long                   timediff;
+        int                    refcount;
+        int                    rc;
+        ENTRY;
 
-        LASSERT (atomic_read (&rqbd->rqbd_refcount) > 0);
-        LASSERT ((event->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0);
-        LASSERT (rqbd->rqbd_srv_ni->sni_service == svc);
-        LASSERT (rqbd->rqbd_buffer == event->mem_desc.start);
-        LASSERT (event->offset + event->mlength <= svc->srv_buf_size);
+        spin_lock_irqsave (&svc->srv_lock, flags);
+        if (list_empty (&svc->srv_request_queue) ||
+            (svc->srv_n_difficult_replies != 0 &&
+             svc->srv_n_active_reqs >= (svc->srv_nthreads - 1))) {
+                /* If all the other threads are handling requests, I must
+                 * remain free to handle any 'difficult' reply that might
+                 * block them */
+                spin_unlock_irqrestore (&svc->srv_lock, flags);
+                RETURN(0);
+        }
 
-        memset(request, 0, sizeof(*request));
-        spin_lock_init (&request->rq_lock);
-        INIT_LIST_HEAD(&request->rq_list);
-        request->rq_svc = svc;
-        request->rq_xid = event->match_bits;
-        request->rq_reqmsg = event->mem_desc.start + event->offset;
-        request->rq_reqlen = event->mlength;
+        request = list_entry (svc->srv_request_queue.next,
+                              struct ptlrpc_request, rq_list);
+        list_del_init (&request->rq_list);
+        svc->srv_n_queued_reqs--;
+        svc->srv_n_active_reqs++;
+
+        spin_unlock_irqrestore (&svc->srv_lock, flags);
+
+        do_gettimeofday(&work_start);
+        timediff = timeval_sub(&work_start, &request->rq_arrival_time);
+        if (svc->srv_stats != NULL) {
+                lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR,
+                                    timediff);
+                lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR,
+                                    svc->srv_n_queued_reqs);
+                lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR,
+                                    svc->srv_n_active_reqs);
+        }
 
 #if SWAB_PARANOIA
         /* Clear request swab mask; this is a new request */
@@ -218,26 +325,34 @@ static int handle_incoming_request(struct obd_device *obddev,
         if (rc != 0) {
                 CERROR ("error unpacking request: ptl %d from "LPX64
                         " xid "LPU64"\n", svc->srv_req_portal,
-                       event->initiator.nid, request->rq_xid);
+                       request->rq_peer.peer_nid, request->rq_xid);
                 goto out;
         }
+
         rc = -EINVAL;
         if (request->rq_reqmsg->type != PTL_RPC_MSG_REQUEST) {
-                CERROR("wrong packet type received (type=%u)\n",
-                       request->rq_reqmsg->type);
+                CERROR("wrong packet type received (type=%u) from "
+                       LPX64"\n", request->rq_reqmsg->type,
+                       request->rq_peer.peer_nid);
                 goto out;
         }
 
-        CDEBUG(D_NET, "got req "LPD64" (md: %p + %d)\n", request->rq_xid,
-               event->mem_desc.start, event->offset);
+        CDEBUG(D_NET, "got req "LPD64"\n", request->rq_xid);
 
-        request->rq_peer.peer_nid = event->initiator.nid;
-        request->rq_peer.peer_ni = rqbd->rqbd_srv_ni->sni_ni;
+        /* Discard requests queued for longer than my timeout.  If the
+         * client's timeout is similar to mine, she'll be timing out this
+         * REQ anyway (bug 1502) */
+        if (timediff / 1000000 > (long)obd_timeout) {
+                CERROR("Dropping timed-out request from "LPX64
+                       ": %ld seconds old\n",
+                       request->rq_peer.peer_nid, timediff / 1000000);
+                goto out;
+        }
 
         request->rq_export = class_conn2export(&request->rq_reqmsg->handle);
 
         if (request->rq_export) {
-                if (request->rq_reqmsg->conn_cnt < 
+                if (request->rq_reqmsg->conn_cnt <
                     request->rq_export->exp_conn_cnt) {
                         DEBUG_REQ(D_ERROR, request,
                                   "DROPPING req from old connection %d < %d",
@@ -248,7 +363,7 @@ static int handle_incoming_request(struct obd_device *obddev,
 
                 request->rq_export->exp_last_request_time =
                         LTIME_S(CURRENT_TIME);
-        } 
+        }
 
         CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:ni:nid:opc "
                "%s:%s+%d:%d:"LPU64":%s:"LPX64":%d\n", current->comm,
@@ -257,7 +372,8 @@ static int handle_incoming_request(struct obd_device *obddev,
                (request->rq_export ?
                 atomic_read(&request->rq_export->exp_refcount) : -99),
                request->rq_reqmsg->status, request->rq_xid,
-               rqbd->rqbd_srv_ni->sni_ni->pni_name, event->initiator.nid,
+               request->rq_peer.peer_ni->pni_name,
+               request->rq_peer.peer_nid,
                request->rq_reqmsg->opc);
 
         rc = svc->srv_handler(request);
@@ -268,7 +384,8 @@ static int handle_incoming_request(struct obd_device *obddev,
                (request->rq_export ?
                 atomic_read(&request->rq_export->exp_refcount) : -99),
                request->rq_reqmsg->status, request->rq_xid,
-               rqbd->rqbd_srv_ni->sni_ni->pni_name, event->initiator.nid,
+               request->rq_peer.peer_ni->pni_name,
+               request->rq_peer.peer_nid,
                request->rq_reqmsg->opc);
 
 put_conn:
@@ -276,12 +393,175 @@ put_conn:
                 class_export_put(request->rq_export);
 
  out:
-        if (atomic_dec_and_test (&rqbd->rqbd_refcount)) /* last reference? */
-                ptlrpc_link_svc_me (rqbd);
+        do_gettimeofday(&work_end);
+
+        timediff = timeval_sub(&work_end, &work_start);
+
+        CDEBUG((timediff / 1000000 > (long)obd_timeout) ? D_ERROR : D_HA,
+               "request "LPU64" opc %u from NID "LPX64" processed in %ldus "
+               "(%ldus total)\n", request->rq_xid, request->rq_reqmsg->opc,
+               request->rq_peer.peer_nid,
+               timediff, timeval_sub(&work_end, &request->rq_arrival_time));
+
+        if (svc->srv_stats != NULL) {
+                int opc = opcode_offset(request->rq_reqmsg->opc);
+                if (opc > 0) {
+                        LASSERT(opc < LUSTRE_MAX_OPCODES);
+                        lprocfs_counter_add(svc->srv_stats,
+                                            opc + PTLRPC_LAST_CNTR,
+                                            timediff);
+                }
+        }
+
+        spin_lock_irqsave(&svc->srv_lock, flags);
+        svc->srv_n_active_reqs--;
+        refcount = --(request->rq_rqbd->rqbd_refcount);
+        spin_unlock_irqrestore(&svc->srv_lock, flags);
+
+        if (refcount == 0) {
+                /* rqbd now idle: repost */
+                ptlrpc_register_rqbd(request->rq_rqbd);
+        }
+
+        ptlrpc_free_server_req(request);
+
+        RETURN(1);
+}
+
+static int
+ptlrpc_server_handle_reply (struct ptlrpc_service *svc) 
+{
+        struct ptlrpc_reply_state *rs;
+        unsigned long              flags;
+        struct obd_export         *exp;
+        struct obd_device         *obd;
+        int                        nlocks;
+        int                        been_handled;
+        ENTRY;
+
+        spin_lock_irqsave (&svc->srv_lock, flags);
+        if (list_empty (&svc->srv_reply_queue)) {
+                spin_unlock_irqrestore (&svc->srv_lock, flags);
+                RETURN(0);
+        }
+        
+        rs = list_entry (svc->srv_reply_queue.next,
+                         struct ptlrpc_reply_state, rs_list);
+
+        exp = rs->rs_export;
+        obd = exp->exp_obd;
+
+        LASSERT (rs->rs_difficult);
+        LASSERT (rs->rs_scheduled);
+
+        list_del_init (&rs->rs_list);
+
+        /* Disengage from notifiers carefully (lock ordering!) */
+        spin_unlock(&svc->srv_lock);
+
+        spin_lock (&obd->obd_uncommitted_replies_lock);
+        /* Noop if removed already */
+        list_del_init (&rs->rs_obd_list);
+        spin_unlock (&obd->obd_uncommitted_replies_lock);
+
+        spin_lock (&exp->exp_lock);
+        /* Noop if removed already */
+        list_del_init (&rs->rs_exp_list);
+        spin_unlock (&exp->exp_lock);
+
+        spin_lock(&svc->srv_lock);
+
+        been_handled = rs->rs_handled;
+        rs->rs_handled = 1;
+        
+        nlocks = rs->rs_nlocks;                 /* atomic "steal", but */
+        rs->rs_nlocks = 0;                      /* locks still on rs_locks! */
+
+        if (nlocks == 0 && !been_handled) {
+                /* If we see this, we should already have seen the warning
+                 * in mds_steal_ack_locks()  */
+                CWARN("All locks stolen from rs %p x"LPD64".t"LPD64
+                      " o%d NID"LPX64"\n",
+                      rs, 
+                      rs->rs_xid, rs->rs_transno,
+                      rs->rs_msg.opc, exp->exp_connection->c_peer.peer_nid);
+        }
+
+        if ((!been_handled && rs->rs_on_net) || 
+            nlocks > 0) {
+                spin_unlock_irqrestore(&svc->srv_lock, flags);
+                
+                if (!been_handled && rs->rs_on_net) {
+                        PtlMDUnlink(rs->rs_md_h);
+                        /* Ignore return code; we're racing with
+                         * completion... */
+                }
+
+                while (nlocks-- > 0)
+                        ldlm_lock_decref(&rs->rs_locks[nlocks], 
+                                         rs->rs_modes[nlocks]);
+
+                spin_lock_irqsave(&svc->srv_lock, flags);
+        }
+
+        rs->rs_scheduled = 0;
+
+        if (!rs->rs_on_net) {
+                /* Off the net */
+                svc->srv_n_difficult_replies--;
+                spin_unlock_irqrestore(&svc->srv_lock, flags);
+                
+                class_export_put (exp);
+                rs->rs_export = NULL;
+                lustre_free_reply_state (rs);
+                atomic_dec (&svc->srv_outstanding_replies);
+                RETURN(1);
+        }
+        
+        /* still on the net; callback will schedule */
+        spin_unlock_irqrestore (&svc->srv_lock, flags);
+        RETURN(1);
+}
+
+#ifndef __KERNEL__
+/* FIXME make use of timeout later */
+int
+liblustre_check_services (void *arg) 
+{
+        int  did_something = 0;
+        struct list_head *tmp, *nxt;
+        ENTRY;
+        
+        /* I'm relying on being single threaded, not to have to lock
+         * ptlrpc_all_services etc */
+        list_for_each_safe (tmp, nxt, &ptlrpc_all_services) {
+                struct ptlrpc_service *svc =
+                        list_entry (tmp, struct ptlrpc_service, srv_list);
+                
+                if (svc->srv_nthreads != 0)     /* I've recursed */
+                        continue;
+
+                /* service threads can block for bulk, so this limits us
+                 * (arbitrarily) to recursing 1 stack frame per service.
+                 * Note that the problem with recursion is that we have to
+                 * unwind completely before our caller can resume. */
+                
+                svc->srv_nthreads++;
+                
+                while (ptlrpc_server_handle_reply (svc))
+                        did_something++;
+                        
+                while (ptlrpc_server_handle_request (svc))
+                        did_something++;
+                        
+                svc->srv_nthreads--;
+        }
 
-        return rc;
+        RETURN(did_something);
 }
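
liblustre_check_services() is the userspace stand-in for the kernel service
threads: it is meant to be polled from the single-threaded liblustre event
loop, with srv_nthreads doubling as the recursion guard described in the
comment above.  A hedged sketch of a caller (the loop is illustrative; the
real hook-up lives outside this excerpt):

/* Illustrative polling loop for the single-threaded liblustre case */
static void example_liblustre_poll(void)
{
        /* keep servicing queued requests and replies until nothing is left */
        while (liblustre_check_services(NULL) != 0)
                ;
}
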
 
+#else /* __KERNEL__ */
+
 /* Don't use daemonize, it removes fs struct from new thread (bug 418) */
 void ptlrpc_daemonize(void)
 {
@@ -291,24 +571,12 @@ void ptlrpc_daemonize(void)
         reparent_to_init();
 }
 
-static long timeval_sub(struct timeval *large, struct timeval *small)
-{
-        return (large->tv_sec - small->tv_sec) * 1000000 +
-                (large->tv_usec - small->tv_usec);
-}
-
 static int ptlrpc_main(void *arg)
 {
-        struct ptlrpc_svc_data *data = arg;
-        struct obd_device *obddev = data->dev;
-        struct ptlrpc_service *svc = data->svc;
-        struct ptlrpc_thread *thread = data->thread;
-        struct ptlrpc_request *request;
-        ptl_event_t *event;
-        unsigned long flags;
-        struct timeval start_time, finish_time;
-        long total;
-        int rc = 0;
+        struct ptlrpc_svc_data *data = (struct ptlrpc_svc_data *)arg;
+        struct ptlrpc_service  *svc = data->svc;
+        struct ptlrpc_thread   *thread = data->thread;
+        unsigned long           flags;
         ENTRY;
 
         lock_kernel();
@@ -322,134 +590,91 @@ static int ptlrpc_main(void *arg)
         THREAD_NAME(current->comm, "%s", data->name);
         unlock_kernel();
 
-        OBD_ALLOC(event, sizeof(*event));
-        if (event == NULL)
-                GOTO(out, rc = -ENOMEM);
-        OBD_ALLOC(request, sizeof(*request));
-        if (request == NULL)
-                GOTO(out_event, rc = -ENOMEM);
-
         /* Record that the thread is running */
         thread->t_flags = SVC_RUNNING;
         wake_up(&thread->t_ctl_waitq);
 
+        spin_lock_irqsave(&svc->srv_lock, flags);
+        svc->srv_nthreads++;
+        spin_unlock_irqrestore(&svc->srv_lock, flags);
+        
         /* XXX maintain a list of all managed devices: insert here */
 
-        do_gettimeofday(&finish_time);
-        /* And now, loop forever on requests */
-        while (1) {
+        while ((thread->t_flags & SVC_STOPPING) == 0 ||
+               svc->srv_n_difficult_replies != 0) {
+                /* Don't exit while there are replies to be handled */
                 struct l_wait_info lwi = { 0 };
-                l_wait_event_exclusive(svc->srv_waitq,
-                                       ptlrpc_check_event(svc, thread, event),
-                                       &lwi);
-
-                spin_lock(&svc->srv_lock);
-                if (thread->t_flags & SVC_STOPPING) {
-                        thread->t_flags &= ~SVC_STOPPING;
-                        spin_unlock(&svc->srv_lock);
-
-                        EXIT;
-                        break;
-                }
-
-                if (!(thread->t_flags & SVC_EVENT)) {
-                        CERROR("unknown flag in service");
-                        spin_unlock(&svc->srv_lock);
-                        LBUG();
-                        EXIT;
-                        break;
-                }
-
-                thread->t_flags &= ~SVC_EVENT;
-                spin_unlock(&svc->srv_lock);
-
-                do_gettimeofday(&start_time);
-                total = timeval_sub(&start_time, &event->arrival_time);
-                if (svc->srv_stats != NULL) {
-                        lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR,
-                                            total);
-                        lprocfs_counter_add(svc->srv_stats,
-                                            PTLRPC_SVCIDLETIME_CNTR,
-                                            timeval_sub(&start_time,
-                                                        &finish_time));
-#if 0 /* Wait for b_eq branch */
-                        lprocfs_counter_add(svc->srv_stats,
-                                            PTLRPC_SVCEQDEPTH_CNTR, 0);
-#endif
-                }
 
-                if (total / 1000000 > (long)obd_timeout) {
-                        CERROR("Dropping request from NID "LPX64" because it's "
-                               "%ld seconds old.\n", event->initiator.nid,
-                               total / 1000000); /* bug 1502 */
-                } else {
-                        CDEBUG(D_HA, "request from NID "LPX64" noticed after "
-                               "%ldus\n", event->initiator.nid, total);
-                        rc = handle_incoming_request(obddev, svc, event,
-                                                     request);
-                }
-                do_gettimeofday(&finish_time);
-                total = timeval_sub(&finish_time, &start_time);
-
-                CDEBUG((total / 1000000 > (long)obd_timeout) ? D_ERROR : D_HA,
-                       "request "LPU64" from NID "LPX64" processed in %ldus "
-                       "(%ldus total)\n", request->rq_xid, event->initiator.nid,
-                       total, timeval_sub(&finish_time, &event->arrival_time));
-
-                if (svc->srv_stats != NULL) {
-                        int opc = opcode_offset(request->rq_reqmsg->opc);
-                        if (opc > 0) {
-                                LASSERT(opc < LUSTRE_MAX_OPCODES);
-                                lprocfs_counter_add(svc->srv_stats,
-                                                    opc + PTLRPC_LAST_CNTR,
-                                                    total);
-                        }
-                }
+                l_wait_event_exclusive (svc->srv_waitq,
+                              (thread->t_flags & SVC_STOPPING) != 0 ||
+                              !list_empty (&svc->srv_reply_queue) ||
+                              (!list_empty (&svc->srv_request_queue) &&
+                               (svc->srv_n_difficult_replies == 0 ||
+                                svc->srv_n_active_reqs < 
+                                (svc->srv_nthreads - 1))),
+                              &lwi);
+
+                if (!list_empty (&svc->srv_reply_queue))
+                        ptlrpc_server_handle_reply (svc);
+
+                /* only handle requests if there are no difficult replies
+                 * outstanding, or I'm not the last thread handling
+                 * requests */
+                if (!list_empty (&svc->srv_request_queue) &&
+                    (svc->srv_n_difficult_replies == 0 ||
+                     svc->srv_n_active_reqs < (svc->srv_nthreads - 1)))
+                        ptlrpc_server_handle_request (svc);
         }
 
-        /* NB should wait for all SENT callbacks to complete before exiting
-         * here.  Unfortunately at this time there is no way to track this
-         * state. */
-        OBD_FREE(request, sizeof(*request));
-out_event:
-        OBD_FREE(event, sizeof(*event));
-out:
+        spin_lock_irqsave(&svc->srv_lock, flags);
+
+        svc->srv_nthreads--;                    /* must know immediately */
         thread->t_flags = SVC_STOPPED;
         wake_up(&thread->t_ctl_waitq);
 
-        CDEBUG(D_NET, "service thread exiting, process %d: rc = %d\n",
-               current->pid, rc);
-        return rc;
+        spin_unlock_irqrestore(&svc->srv_lock, flags);
+
+        CDEBUG(D_NET, "service thread exiting, process %d\n", current->pid);
+        return 0;
 }
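
The wake-up and exit conditions woven through the loop above can be read as a
single predicate.  The restatement below is purely for clarity; the patch
keeps the expression inline in l_wait_event_exclusive():

/* Illustrative restatement of the conditions used in ptlrpc_main() */
static int example_thread_has_work(struct ptlrpc_service *svc,
                                   struct ptlrpc_thread *thread)
{
        return (thread->t_flags & SVC_STOPPING) != 0 ||
               !list_empty(&svc->srv_reply_queue) ||
               (!list_empty(&svc->srv_request_queue) &&
                (svc->srv_n_difficult_replies == 0 ||
                 svc->srv_n_active_reqs < (svc->srv_nthreads - 1)));
}
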
 
 static void ptlrpc_stop_thread(struct ptlrpc_service *svc,
                                struct ptlrpc_thread *thread)
 {
         struct l_wait_info lwi = { 0 };
+        unsigned long      flags;
 
-        spin_lock(&svc->srv_lock);
+        spin_lock_irqsave(&svc->srv_lock, flags);
         thread->t_flags = SVC_STOPPING;
-        spin_unlock(&svc->srv_lock);
+        spin_unlock_irqrestore(&svc->srv_lock, flags);
 
         wake_up_all(&svc->srv_waitq);
         l_wait_event(thread->t_ctl_waitq, (thread->t_flags & SVC_STOPPED),
                      &lwi);
+
+        spin_lock_irqsave(&svc->srv_lock, flags);
+        list_del(&thread->t_link);
+        spin_unlock_irqrestore(&svc->srv_lock, flags);
+        
+        OBD_FREE(thread, sizeof(*thread));
 }
 
 void ptlrpc_stop_all_threads(struct ptlrpc_service *svc)
 {
-        spin_lock(&svc->srv_lock);
+        unsigned long flags;
+        struct ptlrpc_thread *thread;
+
+        spin_lock_irqsave(&svc->srv_lock, flags);
         while (!list_empty(&svc->srv_threads)) {
-                struct ptlrpc_thread *thread;
-                thread = list_entry(svc->srv_threads.next, struct ptlrpc_thread,
-                                    t_link);
-                spin_unlock(&svc->srv_lock);
+                thread = list_entry(svc->srv_threads.next, 
+                                    struct ptlrpc_thread, t_link);
+
+                spin_unlock_irqrestore(&svc->srv_lock, flags);
                 ptlrpc_stop_thread(svc, thread);
-                spin_lock(&svc->srv_lock);
-                list_del(&thread->t_link);
-                OBD_FREE(thread, sizeof(*thread));
+                spin_lock_irqsave(&svc->srv_lock, flags);
         }
-        spin_unlock(&svc->srv_lock);
+
+        spin_unlock_irqrestore(&svc->srv_lock, flags);
 }
 
 int ptlrpc_start_n_threads(struct obd_device *dev, struct ptlrpc_service *svc,
@@ -477,6 +702,7 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
         struct l_wait_info lwi = { 0 };
         struct ptlrpc_svc_data d;
         struct ptlrpc_thread *thread;
+        unsigned long flags;
         int rc;
         ENTRY;
 
@@ -484,15 +710,15 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
         if (thread == NULL)
                 RETURN(-ENOMEM);
         init_waitqueue_head(&thread->t_ctl_waitq);
-
+        
         d.dev = dev;
         d.svc = svc;
         d.name = name;
         d.thread = thread;
 
-        spin_lock(&svc->srv_lock);
+        spin_lock_irqsave(&svc->srv_lock, flags);
         list_add(&thread->t_link, &svc->srv_threads);
-        spin_unlock(&svc->srv_lock);
+        spin_unlock_irqrestore(&svc->srv_lock, flags);
 
         /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we
          * just drop the VM and FILES in ptlrpc_daemonize() right away.
@@ -507,65 +733,126 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
 
         RETURN(0);
 }
+#endif
 
 int ptlrpc_unregister_service(struct ptlrpc_service *service)
 {
-        int i, rc;
+        int                   i;
+        int                   rc;
+        unsigned long         flags;
         struct ptlrpc_srv_ni *srv_ni;
+        struct l_wait_info    lwi;
+        struct list_head     *tmp;
 
-        LASSERT (list_empty (&service->srv_threads));
+        LASSERT(list_empty(&service->srv_threads));
 
-        /* XXX We could reply (with failure) to all buffered requests
-         * _after_ unlinking _all_ the request buffers, but _before_
-         * freeing them.
-         */
+        spin_lock (&ptlrpc_all_services_lock);
+        list_del_init (&service->srv_list);
+        spin_unlock (&ptlrpc_all_services_lock);
+
+        for (i = 0; i < ptlrpc_ninterfaces; i++) {
+                srv_ni = &service->srv_interfaces[i];
+                CDEBUG(D_NET, "%s: tearing down interface %s\n",
+                       service->srv_name, srv_ni->sni_ni->pni_name);
+
+                /* Unlink all the request buffers.  This forces a 'final'
+                 * event with its 'unlink' flag set for each rqbd */
+                list_for_each(tmp, &srv_ni->sni_rqbds) {
+                        struct ptlrpc_request_buffer_desc *rqbd =
+                                list_entry(tmp, struct ptlrpc_request_buffer_desc, 
+                                           rqbd_list);
+
+                        rc = PtlMDUnlink(rqbd->rqbd_md_h);
+                        LASSERT (rc == PTL_OK || rc == PTL_INV_MD);
+                }
+
+                /* Wait for the network to release any buffers it's
+                 * currently filling */
+                for (;;) {
+                        spin_lock_irqsave(&service->srv_lock, flags);
+                        rc = srv_ni->sni_nrqbd_receiving;
+                        spin_unlock_irqrestore(&service->srv_lock, flags);
+
+                        if (rc == 0)
+                                break;
+                        
+                        /* Network access will complete in finite time but
+                         * the HUGE timeout lets us CWARN for visibility of
+                         * sluggish NALs */
+                        lwi = LWI_TIMEOUT(300 * HZ, NULL, NULL);
+                        rc = l_wait_event(service->srv_waitq,
+                                          srv_ni->sni_nrqbd_receiving == 0,
+                                          &lwi);
+                        if (rc == -ETIMEDOUT)
+                                CWARN("Waiting for request buffers on "
+                                      "service %s on interface %s ",
+                                      service->srv_name, srv_ni->sni_ni->pni_name);
+                }
+
+                /* schedule all outstanding replies to terminate them */
+                spin_lock_irqsave(&service->srv_lock, flags);
+                while (!list_empty(&srv_ni->sni_active_replies)) {
+                        struct ptlrpc_reply_state *rs =
+                                list_entry(srv_ni->sni_active_replies.next,
+                                           struct ptlrpc_reply_state,
+                                           rs_list);
+                        ptlrpc_schedule_difficult_reply(rs);
+                }
+                spin_unlock_irqrestore(&service->srv_lock, flags);
+        }
+
+        /* purge the request queue.  NB No new replies (rqbds all unlinked)
+         * and no service threads, so I'm the only thread noodling the
+         * request queue now */
+        while (!list_empty(&service->srv_request_queue)) {
+                struct ptlrpc_request *req =
+                        list_entry(service->srv_request_queue.next,
+                                   struct ptlrpc_request,
+                                   rq_list);
+                
+                list_del(&req->rq_list);
+                service->srv_n_queued_reqs--;
+                req->rq_rqbd->rqbd_refcount--;
+                
+                ptlrpc_free_server_req(req);
+        }
+        LASSERT(service->srv_n_queued_reqs == 0);
 
+        /* Now free all the request buffers since nothing references them
+         * any more... */
         for (i = 0; i < ptlrpc_ninterfaces; i++) {
                 srv_ni = &service->srv_interfaces[i];
-                CDEBUG (D_NET, "%s: tearing down interface %s\n",
-                        service->srv_name, srv_ni->sni_ni->pni_name);
 
-                while (!list_empty (&srv_ni->sni_rqbds)) {
+                while (!list_empty(&srv_ni->sni_rqbds)) {
                         struct ptlrpc_request_buffer_desc *rqbd =
-                                list_entry (srv_ni->sni_rqbds.next,
-                                            struct ptlrpc_request_buffer_desc,
-                                            rqbd_list);
-
-                        list_del (&rqbd->rqbd_list);
-
-                        LASSERT (atomic_read (&rqbd->rqbd_refcount) > 0);
-                        /* refcount could be anything; it's possible for
-                         * the buffers to continued to get filled after all
-                         * the server threads exited.  But we know they
-                         * _have_ exited.
-                         */
-
-                        (void) PtlMEUnlink(rqbd->rqbd_me_h);
-                        /* The callback handler could have unlinked this ME
-                         * already (we're racing with her) but it's safe to
-                         * ensure it _has_ been unlinked.
-                         */
-
-                        OBD_FREE (rqbd->rqbd_buffer, service->srv_buf_size);
-                        OBD_FREE (rqbd, sizeof (*rqbd));
-                        srv_ni->sni_nrqbds--;
+                                list_entry(srv_ni->sni_rqbds.next,
+                                           struct ptlrpc_request_buffer_desc, 
+                                           rqbd_list);
+
+                        ptlrpc_free_rqbd(rqbd);
                 }
+        }
 
-                LASSERT (srv_ni->sni_nrqbds == 0);
+        /* wait for all outstanding replies to complete (they were
+         * scheduled having been flagged to abort above) */
+        while (atomic_read(&service->srv_outstanding_replies) != 0) {
+                struct l_wait_info lwi = LWI_TIMEOUT(10 * HZ, NULL, NULL);
 
-                if (!PtlHandleEqual (srv_ni->sni_eq_h, PTL_HANDLE_NONE)) {
-                        rc = PtlEQFree(srv_ni->sni_eq_h);
-                        if (rc)
-                                CERROR("%s.%d: PtlEQFree failed on %s: %d\n",
-                                       service->srv_name, i,
-                                       srv_ni->sni_ni->pni_name, rc);
+                rc = l_wait_event(service->srv_waitq,
+                                  !list_empty(&service->srv_reply_queue), &lwi);
+                LASSERT(rc == 0 || rc == -ETIMEDOUT);
+
+                if (rc == 0) {
+                        ptlrpc_server_handle_reply(service);
+                        continue;
                 }
+                CWARN("Unexpectedly long timeout %p\n", service);
         }
 
         ptlrpc_lprocfs_unregister_service(service);
 
         OBD_FREE(service,
-                 offsetof (struct ptlrpc_service,
-                           srv_interfaces[ptlrpc_ninterfaces]));
+                 offsetof(struct ptlrpc_service,
+                          srv_interfaces[ptlrpc_ninterfaces]));
         return 0;
 }
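
Teardown above follows a strict order: the service is unhooked from
ptlrpc_all_services, every request buffer is unlinked and allowed to drain,
outstanding difficult replies are scheduled and reaped, and only then are the
buffers and the service itself freed.  A sketch of the caller's side, pairing
with the example_setup_service() sketch earlier (illustrative only; the
LASSERT above requires all threads to be stopped first):

/* Illustrative shutdown order for a service created in the earlier sketch */
static void example_cleanup_service(void)
{
        if (example_svc == NULL)
                return;

        ptlrpc_stop_all_threads(example_svc);     /* srv_threads must be empty */
        ptlrpc_unregister_service(example_svc);
        example_svc = NULL;
}
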
index b7f2b83..9d02148 100644 (file)
@@ -1,5 +1,5 @@
 # lustre.spec
-%define version HEAD
+%define version b_eq
 %define kversion @LINUXRELEASE@
 %define linuxdir @LINUX@
 %define enable_doc @ENABLE_DOC@
@@ -143,9 +143,7 @@ mkdir -p $RPM_BUILD_ROOT/var/lib/ldap/lustre
 %attr(-, root, root) /usr/lib/lustre/examples/llechocleanup.sh
 
 %attr(-, root, root) /etc/init.d/lustre
-%attr(-, root, root) /lib/libportals.a
 %attr(-, root, root) /lib/libptlctl.a
-%attr(-, root, root) /lib/libtcpnal.a
 %attr(-, root, root) /lib/liblustreapi.a
 %attr(-, root, root) /usr/include/lustre/*.h
 
index 34ba231..9380a2d 100644 (file)
@@ -183,9 +183,10 @@ test_5() {
        # if all the modules have unloaded.
        umount $MOUNT &
        UMOUNT_PID=$!
-       sleep $TIMEOUT
+       sleep 2
        echo "killing umount"
        kill -TERM $UMOUNT_PID
+       echo "waiting for umount to finish"
        wait $UMOUNT_PID 
 
        # cleanup client modules
@@ -200,6 +201,48 @@ test_5() {
 }
 run_test 5 "force cleanup mds, then cleanup"
 
+test_5b() {
+       start_ost
+       start_mds
+       stop_mds
+
+       [ -d $MOUNT ] || mkdir -p $MOUNT
+       $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null 
+       llmount $mds_HOST://mds_svc/client_facet $MOUNT  && exit 1
+
+       # cleanup client modules
+       $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null 
+       
+       # stop_mds is a no-op here, and should not fail
+       stop_mds || return 2
+       stop_ost || return 3
+
+       lsmod | grep -q portals && return 3
+       return 0
+
+}
+run_test 5b "mds down, cleanup after failed mount (bug 2712)"
+
+test_5c() {
+       start_ost
+       start_mds
+
+       [ -d $MOUNT ] || mkdir -p $MOUNT
+       $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null 
+       llmount $mds_HOST://wrong_mds_svc/client_facet $MOUNT  && exit 1
+
+       # cleanup client modules
+       $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null 
+       
+       stop_mds || return 2
+       stop_ost || return 3
+
+       lsmod | grep -q portals && return 3
+       return 0
+
+}
+run_test 5c "cleanup after failed mount (bug 2712)"
+
 test_6() {
        setup
        manual_umount_client
index 1aabb7d..0861045 100755 (executable)
@@ -80,7 +80,7 @@ test_2() {
     done 
     fail ost
     for i in `seq 10`; do
-      grep -q "tag-$i" $DIR/$tfile-$i || error "f1c-$i"
+      grep -q "tag-$i" $DIR/$tfile-$i || error "f2-$i"
     done 
 }
 run_test 2 "|x| 10 open(O_CREAT)s"
index 6e3aad9..0a3f785 100644 (file)
@@ -3,8 +3,19 @@ DEFS=
 SUBDIRS = Lustre
 
 CFLAGS:=-g -O2 -I$(top_srcdir)/utils -I$(top_srcdir)/portals/include  -I$(srcdir)/../include -Wall -L../portals/utils
-KFLAGS:=
 CPPFLAGS = $(HAVE_LIBREADLINE)
+
+if LIBLUSTRE
+
+bin_SCRIPTS = lrun
+
+EXTRA_DIST = $(sbin_SCRIPTS)
+
+include $(top_srcdir)/Rules
+
+else
+
+KFLAGS:=
 lctl_LDADD := $(LIBREADLINE) -lptlctl
 lfs_LDADD := $(LIBREADLINE) parser.o liblustreapi.a -lptlctl obd.o
 lload_LDADD := -lptlctl
@@ -37,3 +48,5 @@ newwiretest: wirehdr.c wirecheck
 
 mount.lustre$(EXEEXT): llmount
        cp llmount mount.lustre
+
+endif
index 193a062..56d3d04 100755 (executable)
@@ -1,7 +1,7 @@
 #!/bin/sh
 
 LIBLUSTRE_MOUNT_POINT=${LIBLUSTRE_MOUNT_POINT:-"/mnt/lustre"}
-LIBLUSTRE_MOUNT_TARGET=${LIBLUSTRE_MOUNT_TARGET:-""}
+LIBLUSTRE_MOUNT_TARGET=${LIBLUSTRE_MOUNT_TARGET:-"TARGET_NOT_SET"}
 LIBLUSTRE_DUMPFILE=${LIBLUSTRE_DUMPFILE:-"/tmp/DUMP_FILE"}
 LD_PRELOAD=${LD_PRELOAD:-"/usr/lib/liblustre.so"}