Whamcloud - gitweb
* Landed b1_2_singleportals
author eeb <eeb>
Thu, 28 Oct 2004 12:05:35 +0000 (12:05 +0000)
committer eeb <eeb>
Thu, 28 Oct 2004 12:05:35 +0000 (12:05 +0000)
136 files changed:
lustre/autoMakefile.am
lustre/configure.in
lustre/include/liblustre.h
lustre/include/linux/lustre_net.h
lustre/include/linux/obd_class.h
lustre/ldlm/ldlm_lib.c
lustre/ldlm/ldlm_lock.c
lustre/ldlm/ldlm_lockd.c
lustre/ldlm/ldlm_request.c
lustre/llite/llite_lib.c
lustre/mds/mds_reint.c
lustre/obdclass/class_obd.c
lustre/obdclass/lustre_peer.c
lustre/obdclass/obd_config.c
lustre/ost/ost_handler.c
lustre/portals/archdep.m4
lustre/portals/build.m4
lustre/portals/include/linux/kp30.h
lustre/portals/include/linux/kpr.h
lustre/portals/include/linux/libcfs.h
lustre/portals/include/linux/portals_compat25.h
lustre/portals/include/linux/portals_lib.h
lustre/portals/include/portals/Makefile.am
lustre/portals/include/portals/api-support.h
lustre/portals/include/portals/api.h
lustre/portals/include/portals/arg-blocks.h [deleted file]
lustre/portals/include/portals/build_check.h
lustre/portals/include/portals/defines.h [deleted file]
lustre/portals/include/portals/errno.h
lustre/portals/include/portals/lib-dispatch.h [deleted file]
lustre/portals/include/portals/lib-nal.h [deleted file]
lustre/portals/include/portals/lib-p30.h
lustre/portals/include/portals/lib-types.h
lustre/portals/include/portals/nal.h
lustre/portals/include/portals/nalids.h
lustre/portals/include/portals/p30.h
lustre/portals/include/portals/ppid.h [deleted file]
lustre/portals/include/portals/ptlctl.h
lustre/portals/include/portals/types.h
lustre/portals/knals/Makefile.in
lustre/portals/knals/autoMakefile.am
lustre/portals/knals/gmnal/gmnal.h
lustre/portals/knals/gmnal/gmnal_api.c
lustre/portals/knals/gmnal/gmnal_cb.c
lustre/portals/knals/gmnal/gmnal_comm.c
lustre/portals/knals/gmnal/gmnal_module.c
lustre/portals/knals/ibnal/Makefile.in [deleted file]
lustre/portals/knals/ibnal/autoMakefile.am [deleted file]
lustre/portals/knals/ibnal/ibnal.c [deleted file]
lustre/portals/knals/ibnal/ibnal.h [deleted file]
lustre/portals/knals/ibnal/ibnal_cb.c [deleted file]
lustre/portals/knals/ibnal/ibnal_send_recv_self_testing.c [deleted file]
lustre/portals/knals/ibnal/uagent.c [deleted file]
lustre/portals/knals/iibnal/.cvsignore [moved from lustre/portals/knals/ibnal/.cvsignore with 100% similarity]
lustre/portals/knals/iibnal/Makefile.in [new file with mode: 0644]
lustre/portals/knals/iibnal/Makefile.mk [new file with mode: 0644]
lustre/portals/knals/iibnal/autoMakefile.am [new file with mode: 0644]
lustre/portals/knals/iibnal/iibnal.c [new file with mode: 0644]
lustre/portals/knals/iibnal/iibnal.h [new file with mode: 0644]
lustre/portals/knals/iibnal/iibnal_cb.c [new file with mode: 0644]
lustre/portals/knals/openibnal/.cvsignore [moved from lustre/portals/knals/scimacnal/.cvsignore with 100% similarity]
lustre/portals/knals/openibnal/openibnal.c
lustre/portals/knals/openibnal/openibnal.h
lustre/portals/knals/openibnal/openibnal_cb.c
lustre/portals/knals/qswnal/Makefile.in
lustre/portals/knals/qswnal/qswnal.c
lustre/portals/knals/qswnal/qswnal.h
lustre/portals/knals/qswnal/qswnal_cb.c
lustre/portals/knals/socknal/socknal.c
lustre/portals/knals/socknal/socknal.h
lustre/portals/knals/socknal/socknal_cb.c
lustre/portals/libcfs/.cvsignore
lustre/portals/libcfs/Makefile.in
lustre/portals/libcfs/autoMakefile.am
lustre/portals/libcfs/debug.c
lustre/portals/libcfs/lwt.c
lustre/portals/libcfs/module.c
lustre/portals/libcfs/proc.c
lustre/portals/portals/Makefile.in
lustre/portals/portals/Makefile.mk
lustre/portals/portals/api-eq.c [deleted file]
lustre/portals/portals/api-errno.c
lustre/portals/portals/api-init.c [deleted file]
lustre/portals/portals/api-me.c [deleted file]
lustre/portals/portals/api-ni.c
lustre/portals/portals/api-wrap.c
lustre/portals/portals/autoMakefile.am
lustre/portals/portals/lib-dispatch.c [deleted file]
lustre/portals/portals/lib-eq.c
lustre/portals/portals/lib-init.c
lustre/portals/portals/lib-md.c
lustre/portals/portals/lib-me.c
lustre/portals/portals/lib-move.c
lustre/portals/portals/lib-msg.c
lustre/portals/portals/lib-ni.c
lustre/portals/portals/lib-pid.c
lustre/portals/portals/module.c
lustre/portals/router/proc.c
lustre/portals/router/router.c
lustre/portals/router/router.h
lustre/portals/tests/ping_cli.c
lustre/portals/tests/ping_srv.c
lustre/portals/tests/sping_cli.c
lustre/portals/tests/sping_srv.c
lustre/portals/unals/Makefile.am
lustre/portals/unals/address.c
lustre/portals/unals/bridge.h
lustre/portals/unals/connection.c
lustre/portals/unals/dispatch.h
lustre/portals/unals/procapi.c
lustre/portals/unals/procbridge.h
lustre/portals/unals/proclib.c
lustre/portals/unals/select.c
lustre/portals/unals/tcpnal.c
lustre/portals/utils/Makefile.am
lustre/portals/utils/acceptor.c
lustre/portals/utils/debug.c
lustre/portals/utils/l_ioctl.c
lustre/portals/utils/parser.c
lustre/portals/utils/parser.h
lustre/portals/utils/portals.c
lustre/portals/utils/ptlctl.c
lustre/portals/utils/wirecheck.c
lustre/ptlrpc/Makefile.in
lustre/ptlrpc/client.c
lustre/ptlrpc/connection.c
lustre/ptlrpc/events.c
lustre/ptlrpc/niobuf.c
lustre/ptlrpc/pers.c
lustre/ptlrpc/ptlrpc_internal.h
lustre/ptlrpc/service.c
lustre/utils/Lustre/lustredb.py
lustre/utils/lconf
lustre/utils/lctl.c
lustre/utils/lmc
lustre/utils/wirecheck.c

index 7cfb68e..cc851a9 100644 (file)
@@ -40,7 +40,7 @@ if !LINUX25
 DEP = dep
 dep: .depend
 
-.depend: $(LDISKFS) lvfs-sources libcfs-sources
+.depend: $(LDISKFS) lvfs-sources
        $(MAKE) $(ARCH_UM) -C $(LINUX) -f $(PWD)/kernel-tests/Makefile LUSTRE_LINUX_CONFIG=$(LINUX_CONFIG) -o scripts -o include/config/MARKER _sfdep_$(PWD) _FASTDEP_ALL_SUB_DIRS="$(PWD)"
 
 CLEANFILES = .depend
@@ -55,10 +55,7 @@ endif
 lvfs-sources:
        $(MAKE) sources -C lvfs
 
-libcfs-sources:
-       $(MAKE) sources -C portals/libcfs
-
-modules: lustre_build_version $(DEP) $(LDISKFS) lvfs-sources libcfs-sources
+modules: lustre_build_version $(DEP) $(LDISKFS) lvfs-sources
        $(MAKE) $(ARCH_UM) -C $(LINUX) -f $(PWD)/kernel-tests/Makefile LUSTRE_LINUX_CONFIG=$(LINUX_CONFIG) $(MODULE_TARGET)=$(PWD) -o tmp_include_depends -o scripts -o include/config/MARKER $@
 
 endif # MODULES
index bcf65df..ffe07b0 100644 (file)
@@ -231,8 +231,10 @@ portals/knals/Makefile
 portals/knals/autoMakefile
 portals/knals/gmnal/Makefile
 portals/knals/gmnal/autoMakefile
-portals/knals/ibnal/Makefile
-portals/knals/ibnal/autoMakefile
+portals/knals/iibnal/Makefile
+portals/knals/iibnal/autoMakefile
+portals/knals/openibnal/Makefile
+portals/knals/openibnal/autoMakefile
 portals/knals/qswnal/Makefile
 portals/knals/qswnal/autoMakefile
 portals/knals/socknal/Makefile
index af80f44..c9e1274 100644 (file)
 #define LIBLUSTRE_H__
 
 #include <sys/mman.h>
-#include <asm/byteorder.h>
-#ifndef  __CYGWIN__
-#include <stdint.h>
-#include <asm/page.h>
-#else
-#include <sys/types.h>
-#include "ioctl.h"
+#ifdef HAVE_STDINT_H
+# include <stdint.h>
+#endif
+#ifdef HAVE_ASM_PAGE_H
+# include <asm/page.h>
 #endif
+#ifdef HAVE_SYS_USER_H
+# include <sys/user.h>
+#endif
+
+#include "ioctl.h"
+
 #include <stdio.h>
 #include <sys/ioctl.h>
 #include <stdlib.h>
@@ -116,9 +120,6 @@ static inline void *kmalloc(int size, int prot)
 #define PTR_ERR(a) ((long)(a))
 #define ERR_PTR(a) ((void*)((long)(a)))
 
-#define capable(foo) 1
-#define CAP_SYS_ADMIN 1
-
 typedef struct {
         void *cwd;
 }mm_segment_t;
@@ -130,19 +131,12 @@ struct file; /* forward ref */
 typedef int (write_proc_t)(struct file *file, const char *buffer,
                            unsigned long count, void *data);
 
-# define le16_to_cpu(x) __le16_to_cpu(x)
-# define cpu_to_le16(x) __cpu_to_le16(x)
-# define le32_to_cpu(x) __le32_to_cpu(x)
-# define cpu_to_le32(x) __cpu_to_le32(x)
-# define le64_to_cpu(x) __le64_to_cpu(x)
-# define cpu_to_le64(x) __cpu_to_le64(x)
-
 #define NIPQUAD(addr) \
         ((unsigned char *)&addr)[0], \
         ((unsigned char *)&addr)[1], \
         ((unsigned char *)&addr)[2], \
         ((unsigned char *)&addr)[3]
-                                                                                                                        
+
 #if defined(__LITTLE_ENDIAN)
 #define HIPQUAD(addr) \
         ((unsigned char *)&addr)[3], \
@@ -362,9 +356,9 @@ static inline int kmem_cache_destroy(kmem_cache_t *a)
 #define kmem_cache_alloc(cache, prio) malloc(cache->size)
 #define kmem_cache_free(cache, obj) free(obj)
 
-#define PAGE_CACHE_SIZE PAGE_SIZE
-#define PAGE_CACHE_SHIFT 12
-#define PAGE_CACHE_MASK PAGE_MASK
+#define PAGE_CACHE_SIZE  PAGE_SIZE
+#define PAGE_CACHE_SHIFT PAGE_SHIFT
+#define PAGE_CACHE_MASK  PAGE_MASK
 
 /* XXX
  * for this moment, liblusre will not rely OST for non-page-aligned write
@@ -644,7 +638,7 @@ static inline int schedule_timeout(signed long t)
                 _ret = tv.tv_sec;               \
         _ret;                                   \
 })
-#define time_after(a, b) ((long)(b) - (long)(a) > 0)
+#define time_after(a, b) ((long)(b) - (long)(a) < 0)
 #define time_before(a, b) time_after(b,a)
 
 struct timer_list {
index 7e612eb..87064fb 100644 (file)
 #include <linux/kp30.h>
 // #include <linux/obd.h>
 #include <portals/p30.h>
-#include <portals/lib-types.h>                  /* FIXME (for PTL_MD_MAX_IOV) */
 #include <linux/lustre_idl.h>
 #include <linux/lustre_ha.h>
 #include <linux/lustre_import.h>
 #include <linux/lprocfs_status.h>
 
-/* Define some large-ish defaults for MTU and MAX_IOV if portals ones
- * aren't defined (i.e. no limits) or too large */
-#if (defined(PTL_MTU) && (PTL_MTU <= (1 << 20)))
-# define PTLRPC_MTU  PTL_MTU
+/* MD flags we _always_ use */
+#define PTLRPC_MD_OPTIONS  (PTL_MD_EVENT_START_DISABLE | \
+                            PTL_MD_LUSTRE_COMPLETION_SEMANTICS)
+
+/* Define some large-ish maxima for bulk I/O 
+ * CAVEAT EMPTOR, with multinet (i.e. gateways forwarding between networks)
+ * these limits are system wide and not interface-local. */
+#define PTLRPC_MAX_BRW_SIZE     (1 << 20)
+#define PTLRPC_MAX_BRW_PAGES    512
+
+/* ...reduce to fit... */
+
+#if CRAY_PORTALS
+/* include a cray header here if relevant
+ * NB liblustre SIZE/PAGES is affected too, but it merges contiguous
+ * chunks, so FTTB, it always used contiguous MDs */
 #else
-# define PTLRPC_MTU  (1 << 20)
+# include <portals/lib-types.h>
 #endif
-#if (defined(PTL_MAX_IOV) && (PTL_MAX_IOV <= 512))
-# define PTLRPC_MAX_IOV PTL_MAX_IOV
-#else
-# define PTLRPC_MAX_IOV 512
+
+#if (defined(PTL_MTU) && (PTL_MTU < PTLRPC_MAX_BRW_SIZE))
+# undef  PTLRPC_MAX_BRW_SIZE
+# define PTLRPC_MAX_BRW_SIZE  PTL_MTU
+#endif
+#if (defined(PTL_MD_MAX_IOV) && (PTL_MD_MAX_IOV < PTLRPC_MAX_BRW_PAGES ))
+# undef  PTLRPC_MAX_BRW_PAGES
+# define PTLRPC_MAX_BRW_PAGES PTL_MD_MAX_IOV
 #endif
 
-/* Define consistent max bulk size/pages */
-#if (PTLRPC_MTU > PTLRPC_MAX_IOV * PAGE_SIZE)
-# define PTLRPC_MAX_BRW_PAGES   PTLRPC_MAX_IOV
-# define PTLRPC_MAX_BRW_SIZE   (PTLRPC_MAX_IOV * PAGE_SIZE)
+/* ...and make consistent... */
+
+#if (PTLRPC_MAX_BRW_SIZE > PTLRPC_MAX_BRW_PAGES * PAGE_SIZE)
+# undef  PTLRPC_MAX_BRW_SIZE
+# define PTLRPC_MAX_BRW_SIZE   (PTLRPC_MAX_BRW_PAGES * PAGE_SIZE)
 #else
-# define PTLRPC_MAX_BRW_PAGES  (PTLRPC_MTU / PAGE_SIZE)
-# define PTLRPC_MAX_BRW_SIZE    PTLRPC_MTU
+# undef  PTLRPC_MAX_BRW_PAGES
+# define PTLRPC_MAX_BRW_PAGES  (PTLRPC_MAX_BRW_SIZE / PAGE_SIZE)
+#endif
+
+#if ((PTLRPC_MAX_BRW_PAGES & (PTLRPC_MAX_BRW_PAGES - 1)) != 0)
+#error "PTLRPC_MAX_BRW_PAGES isn't a power of two"
 #endif
 
 /* Size over which to OBD_VMALLOC() rather than OBD_ALLOC() service request
 #define PTLBD_MAXREQSIZE 1024
 
 struct ptlrpc_peer {
-        ptl_nid_t         peer_nid;
+        ptl_process_id_t  peer_id;
         struct ptlrpc_ni *peer_ni;
 };
 
@@ -304,6 +324,7 @@ struct ptlrpc_request {
         struct ptlrpc_cb_id  rq_reply_cbid;
         
         struct ptlrpc_peer rq_peer; /* XXX see service.c can this be factored away? */
+        char               rq_peerstr[PTL_NALFMT_SIZE];
         struct obd_export *rq_export;
         struct obd_import *rq_import;
         
@@ -390,8 +411,8 @@ struct ptlrpc_bulk_desc {
         __u32 bd_portal;
         struct ptlrpc_request *bd_req;          /* associated request */
         wait_queue_head_t      bd_waitq;        /* server side only WQ */
-        int                    bd_page_count;   /* # pages (== entries in bd_iov) */
-        int                    bd_max_pages;    /* allocated size of bd_iov */
+        int                    bd_iov_count;    /* # entries in bd_iov */
+        int                    bd_max_iov;      /* allocated size of bd_iov */
         int                    bd_nob;          /* # bytes covered */
         int                    bd_nob_transferred; /* # bytes GOT/PUT */
 
@@ -400,10 +421,10 @@ struct ptlrpc_bulk_desc {
         struct ptlrpc_cb_id    bd_cbid;         /* network callback info */
         ptl_handle_md_t        bd_md_h;         /* associated MD */
         
-#ifdef __KERNEL__
-        ptl_kiov_t bd_iov[PTL_MD_MAX_IOV];
+#if (!CRAY_PORTALS && defined(__KERNEL__))
+        ptl_kiov_t             bd_iov[0];
 #else
-        struct iovec bd_iov[PTL_MD_MAX_IOV];
+        ptl_md_iovec_t         bd_iov[0];
 #endif
 };
 
@@ -484,6 +505,18 @@ struct ptlrpc_service {
         struct ptlrpc_srv_ni srv_interfaces[0];
 };
 
+static inline char *ptlrpc_peernid2str(struct ptlrpc_peer *p, char *str)
+{
+        LASSERT(p->peer_ni != NULL);
+        return (portals_nid2str(p->peer_ni->pni_number, p->peer_id.nid, str));
+}
+
+static inline char *ptlrpc_id2str(struct ptlrpc_peer *p, char *str)
+{
+        LASSERT(p->peer_ni != NULL);
+        return (portals_id2str(p->peer_ni->pni_number, p->peer_id, str));
+}
+
 /* ptlrpc/events.c */
 extern struct ptlrpc_ni ptlrpc_interfaces[];
 extern int              ptlrpc_ninterfaces;
@@ -494,6 +527,7 @@ extern void client_bulk_callback (ptl_event_t *ev);
 extern void request_in_callback(ptl_event_t *ev);
 extern void reply_out_callback(ptl_event_t *ev);
 extern void server_bulk_callback (ptl_event_t *ev);
+extern int ptlrpc_default_nal(void);
 
 /* ptlrpc/connection.c */
 void ptlrpc_dump_connections(void);
@@ -504,6 +538,7 @@ int ptlrpc_put_connection(struct ptlrpc_connection *c);
 struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *);
 void ptlrpc_init_connection(void);
 void ptlrpc_cleanup_connection(void);
+extern ptl_pid_t ptl_get_pid(void);
 
 /* ptlrpc/niobuf.c */
 int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc);
index 6a3786e..e858012 100644 (file)
@@ -996,7 +996,7 @@ typedef __u8 class_uuid_t[16];
 void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out);
 
 /* lustre_peer.c    */
-int lustre_uuid_to_peer(char *uuid, struct lustre_peer *peer);
+int lustre_uuid_to_peer(char *uuid, __u32 *peer_nal, ptl_nid_t *peer_nid);
 int class_add_uuid(char *uuid, __u64 nid, __u32 nal);
 int class_del_uuid (char *uuid);
 void class_init_uuidlist(void);
index 836bd34..747f0c7 100644 (file)
@@ -410,8 +410,19 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
         obd_str2uuid (&cluuid, str);
 
         /* XXX extract a nettype and format accordingly */
-        snprintf(remote_uuid.uuid, sizeof remote_uuid,
-                 "NET_"LPX64"_UUID", req->rq_peer.peer_nid);
+        switch (sizeof(ptl_nid_t)) {
+                /* NB the casts only avoid compiler warnings */
+        case 8:
+                snprintf(remote_uuid.uuid, sizeof remote_uuid,
+                         "NET_"LPX64"_UUID", (__u64)req->rq_peer.peer_id.nid);
+                break;
+        case 4:
+                snprintf(remote_uuid.uuid, sizeof remote_uuid,
+                         "NET_%x_UUID", (__u32)req->rq_peer.peer_id.nid);
+                break;
+        default:
+                LBUG();
+        }
 
         spin_lock_bh(&target->obd_processing_task_lock);
         abort_recovery = target->obd_abort_recovery;
index 80f5bab..86550e9 100644 (file)
@@ -1146,20 +1146,16 @@ void ldlm_lock_dump(int level, struct ldlm_lock *lock, int pos)
         if (lock->l_conn_export != NULL)
                 obd = lock->l_conn_export->exp_obd;
         if (lock->l_export && lock->l_export->exp_connection) {
-                CDEBUG(level, "  Node: NID "LPX64" (%s) on %s (rhandle: "LPX64")\n",
-                       lock->l_export->exp_connection->c_peer.peer_nid,
-                       portals_nid2str(lock->l_export->exp_connection->c_peer.peer_ni->pni_number,
-                                       lock->l_export->exp_connection->c_peer.peer_nid, str),
+                CDEBUG(level, "  Node: NID %s on %s (rhandle: "LPX64")\n",
+                       ptlrpc_peernid2str(&lock->l_export->exp_connection->c_peer, str),
                        lock->l_export->exp_connection->c_peer.peer_ni->pni_name,
                        lock->l_remote_handle.cookie);
         } else if (obd == NULL) {
                 CDEBUG(level, "  Node: local\n");
         } else {
                 struct obd_import *imp = obd->u.cli.cl_import;
-                CDEBUG(level, "  Node: NID "LPX64" (%s) on %s (rhandle: "LPX64")\n",
-                       imp->imp_connection->c_peer.peer_nid,
-                       portals_nid2str(imp->imp_connection->c_peer.peer_ni->pni_number,
-                                       imp->imp_connection->c_peer.peer_nid, str),
+                CDEBUG(level, "  Node: NID %s on %s (rhandle: "LPX64")\n",
+                       ptlrpc_peernid2str(&imp->imp_connection->c_peer, str),
                        imp->imp_connection->c_peer.peer_ni->pni_name,
                        lock->l_remote_handle.cookie);
         }
index 3edfe7a..9446bfa 100644 (file)
@@ -182,13 +182,10 @@ static void waiting_locks_callback(unsigned long unused)
                         break;
 
                 LDLM_ERROR(lock, "lock callback timer expired: evicting client "
-                           "%s@%s nid "LPX64" (%s) ",
+                           "%s@%s nid %s ",
                            lock->l_export->exp_client_uuid.uuid,
                            lock->l_export->exp_connection->c_remote_uuid.uuid,
-                           lock->l_export->exp_connection->c_peer.peer_nid,
-                           portals_nid2str(lock->l_export->exp_connection->c_peer.peer_ni->pni_number,
-                                           lock->l_export->exp_connection->c_peer.peer_nid,
-                                           str));
+                           ptlrpc_peernid2str(&lock->l_export->exp_connection->c_peer, str));
 
                 spin_lock_bh(&expired_lock_thread.elt_lock);
                 list_del(&lock->l_pending_chain);
@@ -307,14 +304,14 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock)
 
 static void ldlm_failed_ast(struct ldlm_lock *lock, int rc,const char *ast_type)
 {
-        const struct ptlrpc_connection *conn = lock->l_export->exp_connection;
+        struct ptlrpc_connection *conn = lock->l_export->exp_connection;
         char str[PTL_NALFMT_SIZE];
 
         LDLM_ERROR(lock, "%s AST failed (%d): evicting client %s@%s NID "LPX64
                    " (%s)", ast_type, rc, lock->l_export->exp_client_uuid.uuid,
-                   conn->c_remote_uuid.uuid, conn->c_peer.peer_nid,
-                   portals_nid2str(conn->c_peer.peer_ni->pni_number,
-                                   conn->c_peer.peer_nid, str));
+                   conn->c_remote_uuid.uuid, conn->c_peer.peer_id.nid,
+                   ptlrpc_peernid2str(&conn->c_peer, str));
+
         ptlrpc_fail_export(lock->l_export);
 }
 
@@ -322,12 +319,15 @@ static int ldlm_handle_ast_error(struct ldlm_lock *lock,
                                  struct ptlrpc_request *req, int rc,
                                  const char *ast_type)
 {
+        struct ptlrpc_peer *peer = &req->rq_import->imp_connection->c_peer;
+        char str[PTL_NALFMT_SIZE];
+
         if (rc == -ETIMEDOUT || rc == -EINTR || rc == -ENOTCONN) {
                 LASSERT(lock->l_export);
                 if (lock->l_export->exp_libclient) {
-                        LDLM_DEBUG(lock, "%s AST to liblustre client (nid "
-                                   LPU64") timeout, just cancelling lock",
-                                   ast_type, req->rq_peer.peer_nid);
+                        LDLM_DEBUG(lock, "%s AST to liblustre client (nid %s)"
+                                   " timeout, just cancelling lock", ast_type,
+                                   ptlrpc_peernid2str(peer, str));
                         ldlm_lock_cancel(lock);
                         rc = -ERESTART;
                 } else {
@@ -336,13 +336,13 @@ static int ldlm_handle_ast_error(struct ldlm_lock *lock,
                 }
         } else if (rc) {
                 if (rc == -EINVAL)
-                        LDLM_DEBUG(lock, "client (nid "LPU64") returned %d"
+                        LDLM_DEBUG(lock, "client (nid %s) returned %d"
                                    " from %s AST - normal race",
-                                   req->rq_peer.peer_nid,
+                                   ptlrpc_peernid2str(peer, str),
                                    req->rq_repmsg->status, ast_type);
                 else
-                        LDLM_ERROR(lock, "client (nid "LPU64") returned %d "
-                                   "from %s AST", req->rq_peer.peer_nid,
+                        LDLM_ERROR(lock, "client (nid %s) returned %d "
+                                   "from %s AST", ptlrpc_peernid2str(peer, str),
                                    (req->rq_repmsg != NULL) ?
                                    req->rq_repmsg->status : 0, ast_type);
                 ldlm_lock_cancel(lock);
@@ -771,7 +771,6 @@ int ldlm_handle_cancel(struct ptlrpc_request *req)
         struct ldlm_request *dlm_req;
         struct ldlm_lock *lock;
         struct ldlm_resource *res;
-        char str[PTL_NALFMT_SIZE];
         int rc;
         ENTRY;
 
@@ -791,12 +790,10 @@ int ldlm_handle_cancel(struct ptlrpc_request *req)
         lock = ldlm_handle2lock(&dlm_req->lock_handle1);
         if (!lock) {
                 CERROR("received cancel for unknown lock cookie "LPX64
-                       " from client %s nid "LPX64" (%s)\n",
+                       " from client %s id %s\n",
                        dlm_req->lock_handle1.cookie,
                        req->rq_export->exp_client_uuid.uuid,
-                       req->rq_peer.peer_nid,
-                       portals_nid2str(req->rq_peer.peer_ni->pni_number,
-                                       req->rq_peer.peer_nid, str));
+                       req->rq_peerstr);
                 LDLM_DEBUG_NOLOCK("server-side cancel handler stale lock "
                                   "(cookie "LPU64")",
                                   dlm_req->lock_handle1.cookie);
@@ -1009,7 +1006,6 @@ static int ldlm_callback_handler(struct ptlrpc_request *req)
         struct ldlm_namespace *ns;
         struct ldlm_request *dlm_req;
         struct ldlm_lock *lock;
-        char str[PTL_NALFMT_SIZE];
         int rc;
         ENTRY;
 
@@ -1021,14 +1017,12 @@ static int ldlm_callback_handler(struct ptlrpc_request *req)
         if (req->rq_export == NULL) {
                 struct ldlm_request *dlm_req;
 
-                CDEBUG(D_RPCTRACE, "operation %d from nid "LPX64" (%s) with bad "
-                       "export cookie "LPX64" (ptl req %d/rep %d); this is "
+                CDEBUG(D_RPCTRACE, "operation %d from %s with bad "
+                       "export cookie "LPX64"; this is "
                        "normal if this node rebooted with a lock held\n",
-                       req->rq_reqmsg->opc, req->rq_peer.peer_nid,
-                       portals_nid2str(req->rq_peer.peer_ni->pni_number,
-                                       req->rq_peer.peer_nid, str),
-                       req->rq_reqmsg->handle.cookie,
-                       req->rq_request_portal, req->rq_reply_portal);
+                       req->rq_reqmsg->opc,
+                       req->rq_peerstr,
+                       req->rq_reqmsg->handle.cookie);
 
                 dlm_req = lustre_swab_reqbuf(req, 0, sizeof (*dlm_req),
                                              lustre_swab_ldlm_request);
@@ -1150,9 +1144,9 @@ static int ldlm_cancel_handler(struct ptlrpc_request *req)
 
         if (req->rq_export == NULL) {
                 struct ldlm_request *dlm_req;
-                CERROR("operation %d with bad export (ptl req %d/rep %d)\n",
-                       req->rq_reqmsg->opc, req->rq_request_portal,
-                       req->rq_reply_portal);
+                CERROR("operation %d with bad export from %s\n",
+                       req->rq_reqmsg->opc,
+                       req->rq_peerstr);
                 CERROR("--> export cookie: "LPX64"\n",
                        req->rq_reqmsg->handle.cookie);
                 dlm_req = lustre_swab_reqbuf(req, 0, sizeof (*dlm_req),
index 428338d..9b241f2 100644 (file)
@@ -554,9 +554,11 @@ int ldlm_cli_cancel(struct lustre_handle *lockh)
                 rc = ptlrpc_queue_wait(req);
 
                 if (rc == ESTALE) {
-                        CERROR("client/server (nid "LPU64") out of sync--not "
-                               "fatal\n",
-                               req->rq_import->imp_connection->c_peer.peer_nid);
+                        char str[PTL_NALFMT_SIZE];
+                        CERROR("client/server (nid %s) out of sync"
+                               " -- not fatal\n",
+                               ptlrpc_peernid2str(&req->rq_import->
+                                                  imp_connection->c_peer, str));
                 } else if (rc == -ETIMEDOUT) {
                         ptlrpc_req_finished(req);
                         GOTO(restart, rc);
index 57a5a2d..c6e54b4 100644 (file)
@@ -392,20 +392,20 @@ int lustre_process_log(struct lustre_mount_data *lmd, char * profile,
                 PCFG_INIT(pcfg, NAL_CMD_REGISTER_MYNID);
                 pcfg.pcfg_nal = lmd->lmd_nal;
                 pcfg.pcfg_nid = lmd->lmd_local_nid;
-                err = kportal_nal_cmd(&pcfg);
+                err = libcfs_nal_cmd(&pcfg);
                 if (err <0)
                         GOTO(out, err);
         }
 
-        if (lmd->lmd_nal == SOCKNAL) {
-                PCFG_INIT(pcfg, NAL_CMD_ADD_AUTOCONN);
+        if (lmd->lmd_nal == SOCKNAL ||
+            lmd->lmd_nal == OPENIBNAL ||
+            lmd->lmd_nal == IIBNAL) {
+                PCFG_INIT(pcfg, NAL_CMD_ADD_PEER);
                 pcfg.pcfg_nal     = lmd->lmd_nal;
                 pcfg.pcfg_nid     = lmd->lmd_server_nid;
                 pcfg.pcfg_id      = lmd->lmd_server_ipaddr;
                 pcfg.pcfg_misc    = lmd->lmd_port;
-                pcfg.pcfg_size    = 8388608;
-                pcfg.pcfg_flags   = 0x4; /*share*/
-                err = kportal_nal_cmd(&pcfg);
+                err = libcfs_nal_cmd(&pcfg);
                 if (err <0)
                         GOTO(out, err);
         }
@@ -490,13 +490,14 @@ out_del_uuid:
         err = class_process_config(&lcfg);
 
 out_del_conn:
-        if (lmd->lmd_nal == SOCKNAL) {
-                PCFG_INIT(pcfg, NAL_CMD_DEL_AUTOCONN);
+        if (lmd->lmd_nal == SOCKNAL ||
+            lmd->lmd_nal == OPENIBNAL ||
+            lmd->lmd_nal == IIBNAL) {
+                PCFG_INIT(pcfg, NAL_CMD_DEL_PEER);
                 pcfg.pcfg_nal     = lmd->lmd_nal;
                 pcfg.pcfg_nid     = lmd->lmd_server_nid;
-                pcfg.pcfg_id      = lmd->lmd_server_ipaddr;
-                pcfg.pcfg_flags   = 1; /*share*/
-                err = kportal_nal_cmd(&pcfg);
+                pcfg.pcfg_flags   = 1;          /* single_share */
+                err = libcfs_nal_cmd(&pcfg);
                 if (err <0)
                         GOTO(out, err);
         }
index 7ab83fc..4730c58 100644 (file)
@@ -278,6 +278,7 @@ void mds_steal_ack_locks(struct ptlrpc_request *req)
         struct ptlrpc_reply_state *oldrep;
         struct ptlrpc_service     *svc;
         unsigned long              flags;
+        char                       str[PTL_NALFMT_SIZE];
         int                        i;
 
         /* CAVEAT EMPTOR: spinlock order */
@@ -299,10 +300,10 @@ void mds_steal_ack_locks(struct ptlrpc_request *req)
                 list_del_init (&oldrep->rs_exp_list);
 
                 CWARN("Stealing %d locks from rs %p x"LPD64".t"LPD64
-                      " o%d NID"LPX64"\n",
+                      " o%d NID %s\n",
                       oldrep->rs_nlocks, oldrep,
                       oldrep->rs_xid, oldrep->rs_transno, oldrep->rs_msg.opc,
-                      exp->exp_connection->c_peer.peer_nid);
+                      ptlrpc_peernid2str(&exp->exp_connection->c_peer, str));
 
                 for (i = 0; i < oldrep->rs_nlocks; i++)
                         ptlrpc_save_lock(req,
index 9992c02..e61023c 100644 (file)
@@ -285,10 +285,11 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg)
 
 
         case OBD_IOC_CLOSE_UUID: {
-                struct lustre_peer peer;
+                ptl_nid_t       peer_nid;
+                __u32           peer_nal;
                 CDEBUG(D_IOCTL, "closing all connections to uuid %s\n",
                        data->ioc_inlbuf1);
-                lustre_uuid_to_peer(data->ioc_inlbuf1, &peer);
+                lustre_uuid_to_peer(data->ioc_inlbuf1, &peer_nal, &peer_nid);
                 GOTO(out, err = 0);
         }
 
@@ -438,7 +439,8 @@ int obd_proc_read_version(char *page, char **start, off_t off, int count,
         return snprintf(page, count, "%s\n", BUILD_VERSION);
 }
 
-int obd_proc_read_kernel_version(char *page, char **start, off_t off, int count,                                 int *eof, void *data)
+int obd_proc_read_kernel_version(char *page, char **start, off_t off, int count,
+                                 int *eof, void *data)
 {
         *eof = 1;
         return snprintf(page, count, "%u\n", LUSTRE_KERNEL_VERSION);
index 3f172fe..4489671 100644 (file)
@@ -42,7 +42,6 @@ struct uuid_nid_data {
         ptl_nid_t nid;
         char *uuid;
         __u32 nal;
-        ptl_handle_ni_t ni;
 };
 
 /* FIXME: This should probably become more elegant than a global linked list */
@@ -61,7 +60,7 @@ void class_exit_uuidlist(void)
         class_del_uuid(NULL);
 }
 
-int lustre_uuid_to_peer(char *uuid, struct lustre_peer *peer)
+int lustre_uuid_to_peer(char *uuid, __u32 *peer_nal, ptl_nid_t *peer_nid)
 {
         struct list_head *tmp;
 
@@ -72,8 +71,8 @@ int lustre_uuid_to_peer(char *uuid, struct lustre_peer *peer)
                         list_entry(tmp, struct uuid_nid_data, head);
 
                 if (strcmp(data->uuid, uuid) == 0) {
-                        peer->peer_nid = data->nid;
-                        peer->peer_ni = data->ni;
+                        *peer_nid = data->nid;
+                        *peer_nal = data->nal;
 
                         spin_unlock (&g_uuid_lock);
                         return 0;
@@ -86,7 +85,6 @@ int lustre_uuid_to_peer(char *uuid, struct lustre_peer *peer)
 
 int class_add_uuid(char *uuid, __u64 nid, __u32 nal)
 {
-        const ptl_handle_ni_t *nip;
         struct uuid_nid_data *data;
         int rc;
         int nob = strnlen (uuid, PAGE_SIZE) + 1;
@@ -94,26 +92,21 @@ int class_add_uuid(char *uuid, __u64 nid, __u32 nal)
         if (nob > PAGE_SIZE)
                 return -EINVAL;
 
-        nip = kportal_get_ni (nal);
-        if (nip == NULL) {
-                CERROR("get_ni failed: is the NAL module loaded?\n");
-                return -EIO;
-        }
-
         rc = -ENOMEM;
         OBD_ALLOC(data, sizeof(*data));
         if (data == NULL)
-                goto fail_0;
+                return -ENOMEM;
 
         OBD_ALLOC(data->uuid, nob);
-        if (data == NULL)
-                goto fail_1;
+        if (data == NULL) {
+                OBD_FREE(data, sizeof(*data));
+                return -ENOMEM;
+        }
 
         CDEBUG(D_INFO, "add uuid %s "LPX64" %u\n", uuid, nid, nal);
         memcpy(data->uuid, uuid, nob);
         data->nid = nid;
         data->nal = nal;
-        data->ni  = *nip;
 
         spin_lock (&g_uuid_lock);
 
@@ -122,12 +115,6 @@ int class_add_uuid(char *uuid, __u64 nid, __u32 nal)
         spin_unlock (&g_uuid_lock);
 
         return 0;
-
- fail_1:
-        OBD_FREE (data, sizeof (*data));
- fail_0:
-        kportal_put_ni (nal);
-        return (rc);
 }
 
 /* delete only one entry if uuid is specified, otherwise delete all */
@@ -166,7 +153,6 @@ int class_del_uuid (char *uuid)
 
                 list_del (&data->head);
 
-                kportal_put_ni (data->nal);
                 OBD_FREE(data->uuid, strlen(data->uuid) + 1);
                 OBD_FREE(data, sizeof(*data));
         } while (!list_empty (&deathrow));
index b36cac8..afed455 100644 (file)
@@ -587,7 +587,7 @@ static int class_config_llog_handler(struct llog_handle * handle,
                         pcfg->pcfg_nid = cfg->cfg_local_nid;
                 }
 
-                rc = kportal_nal_cmd(pcfg);
+                rc = libcfs_nal_cmd(pcfg);
         }
 out:
         RETURN(rc);
index 22a7d63..69449fe 100644 (file)
@@ -345,7 +345,7 @@ obd_count ost_checksum_bulk(struct ptlrpc_bulk_desc *desc)
         obd_count cksum = 0;
         int i;
 
-        for (i = 0; i < desc->bd_page_count; i++) {
+        for (i = 0; i < desc->bd_iov_count; i++) {
                 struct page *page = desc->bd_iov[i].kiov_page;
                 char *ptr = kmap(page);
                 int psum, off = desc->bd_iov[i].kiov_offset & ~PAGE_MASK;
@@ -377,7 +377,6 @@ static int ost_brw_read(struct ptlrpc_request *req)
         struct ost_body         *body, *repbody;
         struct l_wait_info       lwi;
         struct obd_trans_info    oti = { 0 };
-        char                     str[PTL_NALFMT_SIZE];
         int                      size[1] = { sizeof(*body) };
         int                      comms_error = 0;
         int                      niocount;
@@ -529,23 +528,17 @@ static int ost_brw_read(struct ptlrpc_request *req)
         } else {
                 if (req->rq_reqmsg->conn_cnt == req->rq_export->exp_conn_cnt) {
                         CERROR("bulk IO comms error: "
-                               "evicting %s@%s nid "LPX64" (%s)\n",
+                               "evicting %s@%s id %s\n",
                                req->rq_export->exp_client_uuid.uuid,
                                req->rq_export->exp_connection->c_remote_uuid.uuid,
-                               req->rq_peer.peer_nid,
-                               portals_nid2str(req->rq_peer.peer_ni->pni_number,
-                                               req->rq_peer.peer_nid,
-                                               str));
+                               req->rq_peerstr);
                         ptlrpc_fail_export(req->rq_export);
                 } else {
                         CERROR("ignoring bulk IO comms error: "
-                               "client reconnected %s@%s nid "LPX64" (%s)\n",  
+                               "client reconnected %s@%s id %s\n",  
                                req->rq_export->exp_client_uuid.uuid,
                                req->rq_export->exp_connection->c_remote_uuid.uuid,
-                               req->rq_peer.peer_nid,
-                               portals_nid2str(req->rq_peer.peer_ni->pni_number,
-                                               req->rq_peer.peer_nid,
-                                               str));
+                               req->rq_peerstr);
                 }
         }
 
@@ -566,7 +559,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
         int                      objcount, niocount, npages;
         int                      comms_error = 0;
         int                      rc, swab, i, j;
-        char                     str[PTL_NALFMT_SIZE];
+        struct timeval           start;
         ENTRY;
 
         if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK))
@@ -678,20 +671,19 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                 obd_count client_cksum = body->oa.o_cksum;
                 obd_count cksum = ost_checksum_bulk(desc);
 
-                portals_nid2str(req->rq_peer.peer_ni->pni_number,
-                                req->rq_peer.peer_nid, str);
                 if (client_cksum != cksum) {
-                        CERROR("Bad checksum: client %x, server %x, client NID "
-                               LPX64" (%s)\n", client_cksum, cksum,
-                               req->rq_peer.peer_nid, str);
+                        CERROR("Bad checksum: client %x, server %x id %s\n",
+                               client_cksum, cksum,
+                               req->rq_peerstr);
                         cksum_counter = 1;
                         repbody->oa.o_cksum = cksum;
                 } else {
                         cksum_counter++;
                         if ((cksum_counter & (-cksum_counter)) == cksum_counter)
-                                CWARN("Checksum %u from "LPX64" (%s): %x OK\n",
-                                      cksum_counter, req->rq_peer.peer_nid,
-                                      str, cksum);
+                                CWARN("Checksum %u from %s: %x OK\n",
+                                      cksum_counter,
+                                      req->rq_peerstr,
+                                      cksum);
                 }
         }
 #endif
@@ -733,25 +725,19 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                 ptlrpc_error(req);
         } else {
                 if (req->rq_reqmsg->conn_cnt == req->rq_export->exp_conn_cnt) {
-                        CERROR("bulk IO comms error: "
-                               "evicting %s@%s nid "LPX64" (%s)\n",
+                        CERROR("%s: bulk IO comm error evicting %s@%s id %s\n",
+                               req->rq_export->exp_obd->obd_name,
                                req->rq_export->exp_client_uuid.uuid,
                                req->rq_export->exp_connection->c_remote_uuid.uuid,
-                               req->rq_peer.peer_nid,
-                               portals_nid2str(req->rq_peer.peer_ni->pni_number,
-                                               req->rq_peer.peer_nid,
-                                               str));
+                               req->rq_peerstr);
                         ptlrpc_fail_export(req->rq_export);
                 } else {
                         CERROR("ignoring bulk IO comms error: "
-                               "client reconnected %s@%s nid "LPX64" (%s)\n",
+                               "client reconnected %s@%s id %s\n",
                                req->rq_export->exp_client_uuid.uuid,
                                req->rq_export->exp_connection->c_remote_uuid.uuid,
-                               req->rq_peer.peer_nid,
-                               portals_nid2str(req->rq_peer.peer_ni->pni_number,
-                                               req->rq_peer.peer_nid,
-                                               str));
-                }        
+                               req->rq_peerstr);
+                }
         }
         RETURN(rc);
 }
index 27704bd..021fa68 100644 (file)
@@ -14,16 +14,108 @@ AC_MSG_RESULT([$enable_inkernel])
 AM_CONDITIONAL(INKERNEL, test x$enable_inkernel = xyes)
 
 # -------- are we building against an external portals? -------
-AC_MSG_CHECKING([if Cray portals should be used])
+AC_MSG_CHECKING([for Cray portals])
 AC_ARG_WITH([cray-portals],
        AC_HELP_STRING([--with-cray-portals=path],
                       [path to cray portals]),
        [
-               CRAY_PORTALS_INCLUDE="-I$with_cray_portals"
-               AC_DEFINE(CRAY_PORTALS, 1, [Building with Cray Portals])
+               if test "$with_cray_portals" != no; then
+                       CRAY_PORTALS_PATH=$with_cray_portals
+                       CRAY_PORTALS_INCLUDES="$with_cray_portals/include"
+                       CRAY_PORTALS_LIBS="$with_cray_portals"
+                fi
        ],[with_cray_portals=no])
+AC_SUBST(CRAY_PORTALS_PATH)
+AC_MSG_RESULT([$CRAY_PORTALS_PATH])
+
+AC_MSG_CHECKING([for Cray portals includes])
+AC_ARG_WITH([cray-portals-includes],
+       AC_HELP_STRING([--with-cray-portals-includes=path],
+                      [path to cray portals includes]),
+       [
+               if test "$with_cray_portals_includes" != no; then
+                       CRAY_PORTALS_INCLUDES="$with_cray_portals_includes"
+                fi
+       ])
+AC_SUBST(CRAY_PORTALS_INCLUDES)
+AC_MSG_RESULT([$CRAY_PORTALS_INCLUDES])
+
+AC_MSG_CHECKING([for Cray portals libs])
+AC_ARG_WITH([cray-portals-libs],
+       AC_HELP_STRING([--with-cray-portals-libs=path],
+                      [path to cray portals libs]),
+       [
+               if test "$with_cray_portals_libs" != no; then
+                       CRAY_PORTALS_LIBS="$with_cray_portals_libs"
+                fi
+       ])
+AC_SUBST(CRAY_PORTALS_LIBS)
+AC_MSG_RESULT([$CRAY_PORTALS_LIBS])
+
+if test x$CRAY_PORTALS_INCLUDES != x ; then
+       if test ! -r $CRAY_PORTALS_INCLUDES/portals/api.h ; then
+               AC_MSG_ERROR([Cray portals headers were not found in $CRAY_PORTALS_INCLUDES.  Please check the paths passed to --with-cray-portals or --with-cray-portals-includes.])
+       fi
+fi
+if test x$CRAY_PORTALS_LIBS != x ; then
+       if test ! -r $CRAY_PORTALS_LIBS/libportals.a ; then
+               AC_MSG_ERROR([Cray portals libraries were not found in $CRAY_PORTALS_LIBS.  Please check the paths passed to --with-cray-portals or --with-cray-portals-libs.])
+       fi
+fi
+
+AC_MSG_CHECKING([whether to use Cray portals])
+if test x$CRAY_PORTALS_INCLUDES != x -a x$CRAY_PORTALS_LIBS != x ; then
+       with_cray_portals=yes
+       AC_DEFINE(CRAY_PORTALS, 1, [Building with Cray Portals])
+       CRAY_PORTALS_INCLUDES="-I$CRAY_PORTALS_INCLUDES"
+else
+       with_cray_portals=no
+fi
 AC_MSG_RESULT([$with_cray_portals])
 AM_CONDITIONAL(CRAY_PORTALS, test x$with_cray_portals != xno)
+
+# ----------------------------------------
+# some tests for catamount-like systems
+# ----------------------------------------
+AC_ARG_ENABLE([sysio_init],
+       AC_HELP_STRING([--disable-sysio-init],
+               [call sysio init functions when initializing liblustre]),
+       [],[enable_sysio_init=yes])
+AC_MSG_CHECKING([whether to initialize libsysio])
+AC_MSG_RESULT([$enable_sysio_init])
+if test x$enable_sysio_init != xno ; then
+       AC_DEFINE([INIT_SYSIO], 1, [call sysio init functions])
+fi
+
+AC_ARG_ENABLE([urandom],
+       AC_HELP_STRING([--disable-urandom],
+               [disable use of /dev/urandom for liblustre]),
+       [],[enable_urandom=yes])
+AC_MSG_CHECKING([whether to use /dev/urandom for liblustre])
+AC_MSG_RESULT([$enable_urandom])
+if test x$enable_urandom != xno ; then
+       AC_DEFINE([LIBLUSTRE_USE_URANDOM], 1, [use /dev/urandom for random data])
+fi
+
+# -------- check for -lcap and -lpthread ----
+if test x$enable_liblustre = xyes ; then
+       AC_CHECK_LIB([cap], [cap_get_proc],
+               [
+                       CAP_LIBS="-lcap"
+                       AC_DEFINE([HAVE_LIBCAP], 1, [use libcap])
+               ],
+               [CAP_LIBS=""])
+       AC_SUBST(CAP_LIBS)
+       AC_CHECK_LIB([pthread], [pthread_create],
+               [
+                       PTHREAD_LIBS="-lpthread"
+                       AC_DEFINE([HAVE_LIBPTHREAD], 1, [use libpthread])
+               ],
+               [PTHREAD_LIBS=""])
+       AC_SUBST(PTHREAD_LIBS)
+fi
+
+# -------- enable tests and utils? -------
 if test x$enable_tests = xno ; then
        AC_MSG_NOTICE([disabling tests])
        enable_tests=no
@@ -117,7 +209,7 @@ AM_CONDITIONAL(USE_QUILT, test x$QUILT != xno)
 
 # -------  Makeflags ------------------
 
-CPPFLAGS="$CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include"
+CPPFLAGS="$CPPFLAGS $CRAY_PORTALS_INCLUDES -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include"
 
 # liblustre are all the same
 LLCPPFLAGS="-D__arch_lib__ -D_LARGEFILE64_SOURCE=1"
@@ -135,7 +227,7 @@ if test x$enable_ldiskfs = xyes ; then
        AC_DEFINE(CONFIG_LDISKFS_FS_SECURITY, 1, [enable fs security])
 fi
 
-EXTRA_KCFLAGS="-g $CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I$PWD/portals/include -I$PWD/include"
+EXTRA_KCFLAGS="-g $CRAY_PORTALS_INCLUDES -I$PWD/portals/include -I$PWD/include"
 
 # these are like AC_TRY_COMPILE, but try to build modules against the
 # kernel, inside the kernel-tests directory
@@ -330,7 +422,11 @@ if test x$enable_modules != xno ; then
                        QSWCPPFLAGS="-DMULTIRAIL_EKC=1"
                else
                        AC_MSG_RESULT([not supported])
-                       QSWCPPFLAGS="-I$LINUX/drivers/net/qsnet/include"
+                       if test -d $LINUX/drivers/net/qsnet/include; then
+                               QSWCPPFLAGS="-I$LINUX/drivers/net/qsnet/include"
+                       else
+                               QSWCPPFLAGS="-I$LINUX/include/linux"
+                       fi
                fi
        else
                AC_MSG_RESULT([no])
@@ -370,39 +466,57 @@ if test x$enable_modules != xno ; then
        AC_SUBST(GMCPPFLAGS)
        AC_SUBST(GMNAL)
 
-       #fixme: where are the default IB includes?
-       default_ib_include_dir=/usr/local/ib/include
-       an_ib_include_file=vapi.h
-
-       AC_MSG_CHECKING([if ib nal support was requested])
-       AC_ARG_WITH([ib],
-               AC_HELP_STRING([--with-ib=yes/no/path],
-                              [Path to IB includes]),
+       #### OpenIB 
+       AC_MSG_CHECKING([if OpenIB kernel headers are present])
+       OPENIBCPPFLAGS="-I$LINUX/drivers/infiniband/include -DIN_TREE_BUILD"
+       EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
+       EXTRA_KCFLAGS="$EXTRA_KCFLAGS $OPENIBCPPFLAGS"
+       LUSTRE_MODULE_TRY_COMPILE(
                [
-                       case $with_ib in
-                               yes)
-                                       AC_MSG_RESULT([yes])
-                                       IBCPPFLAGS="-I/usr/local/ib/include"
-                                       IBNAL="ibnal"
-                                       ;;
-                               no)
-                                       AC_MSG_RESULT([no])
-                                       IBCPPFLAGS=""
-                                       IBNAL=""
-                                       ;;
-                               *)
-                                       AC_MSG_RESULT([yes])
-                                       IBCPPFLAGS="-I$with_ib"
-                                       IBNAL=""
-                                       ;;
-                       esac
+                       #include <ts_ib_core.h>
+               ],[
+                       struct ib_device_properties props;
+                       return 0;
+               ],[
+                       AC_MSG_RESULT([yes])
+                       OPENIBNAL="openibnal"
                ],[
                        AC_MSG_RESULT([no])
-                       IBFLAGS=""
-                       IBNAL=""
+                       OPENIBNAL=""
+                       OPENIBCPPFLAGS=""
                ])
-       AC_SUBST(IBNAL)
-       AC_SUBST(IBCPPFLAGS)
+       EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
+       AC_SUBST(OPENIBCPPFLAGS)
+       AC_SUBST(OPENIBNAL)
+
+       #### Infinicon IB
+       AC_MSG_CHECKING([if Infinicon IB kernel headers are present])
+       # for how the only infinicon ib build has headers in /usr/include/iba
+       IIBCPPFLAGS="-I/usr/include -DIN_TREE_BUILD"
+       EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
+       EXTRA_KCFLAGS="$EXTRA_KCFLAGS $IIBCPPFLAGS"
+       LUSTRE_MODULE_TRY_COMPILE(
+               [
+                       #include <linux/iba/ibt.h>
+               ],[
+                       IBT_INTERFACE_UNION interfaces;
+                       FSTATUS             rc;
+
+                       rc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2,
+                                                     &interfaces);
+
+                       return rc == FSUCCESS ? 0 : 1;
+               ],[
+                       AC_MSG_RESULT([yes])
+                       IIBNAL="iibnal"
+               ],[
+                       AC_MSG_RESULT([no])
+                       IIBNAL=""
+                       IIBCPPFLAGS=""
+               ])
+       EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
+       AC_SUBST(IIBCPPFLAGS)
+       AC_SUBST(IIBNAL)
 
        # ---------- Red Hat 2.4.18 has iobuf->dovary --------------
        # But other kernels don't
@@ -419,7 +533,7 @@ if test x$enable_modules != xno ; then
                        AC_DEFINE(HAVE_KIOBUF_DOVARY, 1, [struct kiobuf has a dovary field])
                ],[
                        AC_MSG_RESULT([no])
-               ])
+               ])      
 
        # ----------- 2.6.4 no longer has page->list ---------------
        AC_MSG_CHECKING([if struct page has a list field])
@@ -468,6 +582,16 @@ if test x$enable_modules != xno ; then
                        AC_MSG_RESULT([no])
                ])
 
+       # --------- zap_page_range(vma) --------------------------------
+       AC_MSG_CHECKING([if zap_pag_range with vma parameter])
+       ZAP_PAGE_RANGE_VMA="`grep -c 'zap_page_range.*struct vm_area_struct' $LINUX/include/linux/mm.h`"
+       if test "$ZAP_PAGE_RANGE_VMA" != 0 ; then
+               AC_DEFINE(ZAP_PAGE_RANGE_VMA, 1, [zap_page_range with vma parameter])
+               AC_MSG_RESULT([yes])
+       else
+               AC_MSG_RESULT([no])
+       fi
+
        # ---------- Red Hat 2.4.21 backports some more 2.5 bits --------
 
        AC_MSG_CHECKING([if kernel defines PDE])
@@ -500,7 +624,6 @@ if test x$enable_modules != xno ; then
                ],[
                        AC_MSG_RESULT([no])
                ])
-
        AC_MSG_CHECKING([if kernel defines cpumask_t])
        LUSTRE_MODULE_TRY_COMPILE(
                [
@@ -549,6 +672,7 @@ if test x$enable_modules != xno ; then
                        AC_MSG_RESULT([no])
                ])
 
+
        # ---------- modules? ------------------------
        AC_MSG_CHECKING([for module support])
        LUSTRE_MODULE_TRY_COMPILE(
@@ -650,18 +774,37 @@ if test x$enable_modules != xno ; then
        esac # $BACKINGFS
 fi
 
-AM_CONDITIONAL(BUILD_IBNAL, test x$IBNAL = "xibnal")
-AM_CONDITIONAL(BUILD_GMNAL, test x$GMNAL = "xgmnal")
 AM_CONDITIONAL(BUILD_QSWNAL, test x$QSWNAL = "xqswnal")
+AM_CONDITIONAL(BUILD_GMNAL, test x$GMNAL = "xgmnal")
+AM_CONDITIONAL(BUILD_OPENIBNAL, test x$OPENIBNAL = "xopenibnal")
+AM_CONDITIONAL(BUILD_IIBNAL, test x$IIBNAL = "xiibnal")
+
+# portals/utils/portals.c
+AC_CHECK_HEADERS([netdb.h netinet/tcp.h asm/types.h])
+AC_CHECK_FUNCS([gethostbyname socket connect])
+
+# portals/utils/debug.c
+AC_CHECK_HEADERS([linux/version.h])
+
+# include/liblustre.h
+AC_CHECK_HEADERS([asm/page.h sys/user.h stdint.h])
+
+# liblustre/llite_lib.h
+AC_CHECK_HEADERS([xtio.h file.h])
+
+# liblustre/dir.c
+AC_CHECK_HEADERS([linux/types.h sys/types.h linux/unistd.h unistd.h])
+
+# liblustre/lutil.c
+AC_CHECK_HEADERS([netinet/in.h arpa/inet.h catamount/data.h])
+AC_CHECK_FUNCS([inet_ntoa])
 
 CPPFLAGS="-include \$(top_builddir)/include/config.h $CPPFLAGS"
 EXTRA_KCFLAGS="-include $PWD/include/config.h $EXTRA_KCFLAGS"
 AC_SUBST(EXTRA_KCFLAGS)
 
-#echo "KCPPFLAGS: $KCPPFLAGS"
-#echo "KCFLAGS: $KCFLAGS"
-#echo "LLCPPFLAGS: $LLCPPFLAGS"
-#echo "LLCFLAGS: $LLCFLAGS"
-#echo "MOD_LINK: $MOD_LINK"
-#echo "CFLAGS: $CFLAGS"
-#echo "CPPFLAGS: $CPPFLAGS"
+echo "CPPFLAGS: $CPPFLAGS"
+echo "LLCPPFLAGS: $LLCPPFLAGS"
+echo "CFLAGS: $CFLAGS"
+echo "EXTRA_KCFLAGS: $EXTRA_KCFLAGS"
+echo "LLCFLAGS: $LLCFLAGS"
index e8a540a..f158396 100644 (file)
@@ -20,12 +20,14 @@ ac_default_prefix=/usr
 # mount.lustre
 rootsbindir='/sbin'
 AC_SUBST(rootsbindir)
+sysconfdir='/etc'
+AC_SUBST(sysconfdir)
 # Directories for documentation and demos.
 docdir='${datadir}/doc/$(PACKAGE)'
 AC_SUBST(docdir)
 demodir='$(docdir)/demo'
 AC_SUBST(demodir)
-pkgexampledir='${pkglibdir}/examples'
+pkgexampledir='${pkgdatadir}/examples'
 AC_SUBST(pkgexampledir)
 pymoddir='${pkglibdir}/python/Lustre'
 AC_SUBST(pymoddir)
@@ -59,6 +61,13 @@ case "$CC_VERSION" in
        "gcc version 2.96 20000731 (Mandrake Linux 8.1 2.96-0.62mdk)")
                bad_cc
                ;;
+       # unpatched 'gcc' on rh9.  miscompiles a
+       #        struct = (type) { .member = value, };
+       # asignment in the iibnal where the struct is a mix
+       # of u64 and u32 bit-fields.
+       "gcc version 3.2.2 20030222 (Red Hat Linux 3.2.2-5)")
+               bad_cc
+               ;;
        *)
                AC_MSG_RESULT([no known problems])
                ;;
@@ -114,3 +123,5 @@ else
        LIBWRAP=""
 fi
 AC_SUBST(LIBWRAP)
+
+AC_SUBST(LIBS)
index b5f1041..4e24c71 100644 (file)
@@ -7,12 +7,6 @@
 #include <linux/libcfs.h>
 #define PORTAL_DEBUG
 
-#ifndef offsetof
-# define offsetof(typ,memb)     ((int)((char *)&(((typ *)0)->memb)))
-#endif
-
-#define LOWEST_BIT_SET(x)       ((x) & ~((x) - 1))
-
 #ifdef __KERNEL__
 # include <linux/vmalloc.h>
 # include <linux/time.h>
@@ -300,7 +294,6 @@ extern void kportal_blockallsigs (void);
 # include <unistd.h>
 # include <time.h>
 # include <limits.h>
-# include <asm/types.h>
 # ifndef DEBUG_SUBSYSTEM
 #  define DEBUG_SUBSYSTEM S_UNDEFINED
 # endif
@@ -308,7 +301,12 @@ extern void kportal_blockallsigs (void);
 #  undef NDEBUG
 #  include <assert.h>
 #  define LASSERT(e)     assert(e)
-#  define LASSERTF(cond, args...)     assert(cond)
+#  define LASSERTF(cond, args...)                                              \
+do {                                                                           \
+          if (!(cond))                                                         \
+                CERROR(args);                                                  \
+          assert(cond);                                                        \
+} while (0)
 # else
 #  define LASSERT(e)
 #  define LASSERTF(cond, args...) do { } while (0)
@@ -330,6 +328,7 @@ void portals_debug_dumplog(void);
 
 /* support decl needed both by kernel and liblustre */
 char *portals_nid2str(int nal, ptl_nid_t nid, char *str);
+char *portals_id2str(int nal, ptl_process_id_t nid, char *str);
 
 #ifndef CURRENT_TIME
 # define CURRENT_TIME time(0)
@@ -340,25 +339,37 @@ char *portals_nid2str(int nal, ptl_nid_t nid, char *str);
  * Support for temporary event tracing with minimal Heisenberg effect. */
 #define LWT_SUPPORT  0
 
-#define LWT_MEMORY   (64<<20)
-#define LWT_MAX_CPUS 4
+#define LWT_MEMORY   (16<<20)
+
+#if !KLWT_SUPPORT
+# if defined(__KERNEL__)
+#  if !defined(BITS_PER_LONG)
+#   error "BITS_PER_LONG not defined"
+#  endif
+# elif !defined(__WORDSIZE)
+#  error "__WORDSIZE not defined"
+# else
+#  define BITS_PER_LONG __WORDSIZE
+# endif
 
+/* kernel hasn't defined this? */
 typedef struct {
-        cycles_t    lwte_when;
+        long long   lwte_when;
         char       *lwte_where;
         void       *lwte_task;
         long        lwte_p1;
         long        lwte_p2;
         long        lwte_p3;
         long        lwte_p4;
-#if BITS_PER_LONG > 32
+# if BITS_PER_LONG > 32
         long        lwte_pad;
-#endif
+# endif
 } lwt_event_t;
+#endif /* !KLWT_SUPPORT */
 
 #if LWT_SUPPORT
-#ifdef __KERNEL__
-#define LWT_EVENTS_PER_PAGE (PAGE_SIZE / sizeof (lwt_event_t))
+# ifdef __KERNEL__
+#  if !KLWT_SUPPORT
 
 typedef struct _lwt_page {
         struct list_head     lwtp_list;
@@ -374,20 +385,13 @@ typedef struct {
 extern int       lwt_enabled;
 extern lwt_cpu_t lwt_cpus[];
 
-extern int  lwt_init (void);
-extern void lwt_fini (void);
-extern int  lwt_lookup_string (int *size, char *knlptr,
-                               char *usrptr, int usrsize);
-extern int  lwt_control (int enable, int clear);
-extern int  lwt_snapshot (cycles_t *now, int *ncpu, int *total_size,
-                          void *user_ptr, int user_size);
-
 /* Note that we _don't_ define LWT_EVENT at all if LWT_SUPPORT isn't set.
  * This stuff is meant for finding specific problems; it never stays in
  * production code... */
 
 #define LWTSTR(n)       #n
 #define LWTWHERE(f,l)   f ":" LWTSTR(l)
+#define LWT_EVENTS_PER_PAGE (PAGE_SIZE / sizeof (lwt_event_t))
 
 #define LWT_EVENT(p1, p2, p3, p4)                                       \
 do {                                                                    \
@@ -396,9 +400,9 @@ do {                                                                    \
         lwt_page_t      *p;                                             \
         lwt_event_t     *e;                                             \
                                                                         \
-        local_irq_save (flags);                                         \
-                                                                        \
         if (lwt_enabled) {                                              \
+                local_irq_save (flags);                                 \
+                                                                        \
                 cpu = &lwt_cpus[smp_processor_id()];                    \
                 p = cpu->lwtc_current_page;                             \
                 e = &p->lwtp_events[cpu->lwtc_current_index++];         \
@@ -417,13 +421,23 @@ do {                                                                    \
                 e->lwte_p2    = (long)(p2);                             \
                 e->lwte_p3    = (long)(p3);                             \
                 e->lwte_p4    = (long)(p4);                             \
-        }                                                               \
                                                                         \
-        local_irq_restore (flags);                                      \
+                local_irq_restore (flags);                              \
+        }                                                               \
 } while (0)
-#else  /* __KERNEL__ */
-#define LWT_EVENT(p1,p2,p3,p4)     /* no userland implementation yet */
-#endif /* __KERNEL__ */
+
+#endif /* !KLWT_SUPPORT */
+
+extern int  lwt_init (void);
+extern void lwt_fini (void);
+extern int  lwt_lookup_string (int *size, char *knlptr,
+                               char *usrptr, int usrsize);
+extern int  lwt_control (int enable, int clear);
+extern int  lwt_snapshot (cycles_t *now, int *ncpu, int *total_size,
+                          void *user_ptr, int user_size);
+# else  /* __KERNEL__ */
+#  define LWT_EVENT(p1,p2,p3,p4)     /* no userland implementation yet */
+# endif /* __KERNEL__ */
 #endif /* LWT_SUPPORT */
 
 struct portals_device_userstate
@@ -572,49 +586,42 @@ static inline int portal_ioctl_getdata(char *buf, char *end, void *arg)
         data = (struct portal_ioctl_data *)buf;
 
         err = copy_from_user(buf, (void *)arg, sizeof(*hdr));
-        if ( err ) {
-                EXIT;
-                return err;
-        }
+        if (err)
+                RETURN(err);
 
         if (hdr->ioc_version != PORTAL_IOCTL_VERSION) {
-                CERROR ("PORTALS: version mismatch kernel vs application\n");
-                return -EINVAL;
+                CERROR("PORTALS: version mismatch kernel vs application\n");
+                RETURN(-EINVAL);
         }
 
         if (hdr->ioc_len + buf >= end) {
-                CERROR ("PORTALS: user buffer exceeds kernel buffer\n");
-                return -EINVAL;
+                CERROR("PORTALS: user buffer exceeds kernel buffer\n");
+                RETURN(-EINVAL);
         }
 
 
         if (hdr->ioc_len < sizeof(struct portal_ioctl_data)) {
-                CERROR ("PORTALS: user buffer too small for ioctl\n");
-                return -EINVAL;
+                CERROR("PORTALS: user buffer too small for ioctl\n");
+                RETURN(-EINVAL);
         }
 
         err = copy_from_user(buf, (void *)arg, hdr->ioc_len);
-        if ( err ) {
-                EXIT;
-                return err;
-        }
+        if (err)
+                RETURN(err);
 
         if (portal_ioctl_is_invalid(data)) {
-                CERROR ("PORTALS: ioctl not correctly formatted\n");
-                return -EINVAL;
+                CERROR("PORTALS: ioctl not correctly formatted\n");
+                RETURN(-EINVAL);
         }
 
-        if (data->ioc_inllen1) {
+        if (data->ioc_inllen1)
                 data->ioc_inlbuf1 = &data->ioc_bulk[0];
-        }
 
-        if (data->ioc_inllen2) {
+        if (data->ioc_inllen2)
                 data->ioc_inlbuf2 = &data->ioc_bulk[0] +
                         size_round(data->ioc_inllen1);
-        }
 
-        EXIT;
-        return 0;
+        RETURN(0);
 }
 #endif
 
@@ -643,21 +650,13 @@ enum {
         GMNAL     = 3,
         /*          4 unused */
         TCPNAL    = 5,
-        SCIMACNAL = 6,
-        ROUTER    = 7,
-        IBNAL     = 8,
+        ROUTER    = 6,
+        OPENIBNAL = 7,
+        IIBNAL    = 8,
         NAL_ENUM_END_MARKER
 };
 
-#ifdef __KERNEL__
-extern ptl_handle_ni_t  kqswnal_ni;
-extern ptl_handle_ni_t  ksocknal_ni;
-extern ptl_handle_ni_t  kgmnal_ni;
-extern ptl_handle_ni_t  kibnal_ni;
-extern ptl_handle_ni_t  kscimacnal_ni;
-#endif
-
-#define PTL_NALFMT_SIZE               26 /* %u:%u.%u.%u.%u (10+4+4+4+3+1) */
+#define PTL_NALFMT_SIZE             32 /* %u:%u.%u.%u.%u,%u (10+4+4+4+3+5+1) */
 
 #define NAL_MAX_NR (NAL_ENUM_END_MARKER - 1)
 
@@ -666,14 +665,18 @@ extern ptl_handle_ni_t  kscimacnal_ni;
 #define NAL_CMD_REGISTER_MYNID       102
 #define NAL_CMD_PUSH_CONNECTION      103
 #define NAL_CMD_GET_CONN             104
-#define NAL_CMD_DEL_AUTOCONN         105
-#define NAL_CMD_ADD_AUTOCONN         106
-#define NAL_CMD_GET_AUTOCONN         107
+#define NAL_CMD_DEL_PEER             105
+#define NAL_CMD_ADD_PEER             106
+#define NAL_CMD_GET_PEER             107
 #define NAL_CMD_GET_TXDESC           108
 #define NAL_CMD_ADD_ROUTE            109
 #define NAL_CMD_DEL_ROUTE            110
 #define NAL_CMD_GET_ROUTE            111
 #define NAL_CMD_NOTIFY_ROUTER        112
+#define NAL_CMD_ADD_INTERFACE        113
+#define NAL_CMD_DEL_INTERFACE        114
+#define NAL_CMD_GET_INTERFACE        115
+
 
 enum {
         DEBUG_DAEMON_START       =  1,
@@ -682,16 +685,6 @@ enum {
         DEBUG_DAEMON_CONTINUE    =  4,
 };
 
-/* XXX remove to lustre ASAP */
-struct lustre_peer {
-        ptl_nid_t       peer_nid;
-        ptl_handle_ni_t peer_ni;
-};
-
-/* module.c */
-typedef int (*nal_cmd_handler_t)(struct portals_cfg *, void * private);
-int kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private);
-int kportal_nal_unregister(int nal);
 
 enum cfg_record_type {
         PORTALS_CFG_TYPE = 1,
@@ -699,10 +692,6 @@ enum cfg_record_type {
 };
 
 typedef int (*cfg_record_cb_t)(enum cfg_record_type, int len, void *data);
-int kportal_nal_cmd(struct portals_cfg *);
-
-ptl_handle_ni_t *kportal_get_ni (int nal);
-void kportal_put_ni (int nal);
 
 #ifdef __CYGWIN__
 # ifndef BITS_PER_LONG
index ee50b59..1127698 100644 (file)
@@ -1,10 +1,10 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- *  * vim:expandtab:shiftwidth=8:tabstop=8:
- *   */
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
 #ifndef _KPR_H
 #define _KPR_H
 
-# include <portals/lib-nal.h> /* for ptl_hdr_t */
+# include <portals/lib-types.h> /* for ptl_hdr_t */
 
 /******************************************************************************/
 /* Kernel Portals Router interface */
@@ -81,21 +81,6 @@ typedef struct {
         void                    *kpr_arg;
 } kpr_router_t;
 
-/* Router's control interface (Kernel Portals Routing Control Interface) */
-typedef const struct {
-        int     (*kprci_add_route)(int gateway_nal, ptl_nid_t gateway_nid,
-                                   ptl_nid_t lo_nid, ptl_nid_t hi_nid);
-        int     (*kprci_del_route)(int gateway_nal, ptl_nid_t gateway_nid,
-                                   ptl_nid_t lo_nid, ptl_nid_t hi_nid);
-        int     (*kprci_get_route)(int index, int *gateway_nal,
-                                   ptl_nid_t *gateway,
-                                   ptl_nid_t *lo_nid, ptl_nid_t *hi_nid,
-                                   int *alive);
-        int     (*kprci_notify)(int gateway_nal, ptl_nid_t gateway_nid,
-                                int alive, time_t when);
-} kpr_control_interface_t;
-
-extern kpr_control_interface_t  kpr_control_interface;
 extern kpr_router_interface_t   kpr_router_interface;
 
 static inline int
index 66ee471..e3d58dd 100644 (file)
@@ -4,14 +4,60 @@
 #ifndef _LIBCFS_H
 #define _LIBCFS_H
 
+#ifdef HAVE_ASM_TYPES_H
+#include <asm/types.h>
+#else
+#include "types.h"
+#endif
+
+#ifdef __KERNEL__
+# include <linux/time.h>
+# include <asm/timex.h>
+#else
+# include <sys/time.h>
+# define do_gettimeofday(tv) gettimeofday(tv, NULL);
+typedef unsigned long long cycles_t;
+#endif
+
 #define PORTAL_DEBUG
 
 #ifndef offsetof
-# define offsetof(typ,memb)     ((int)((char *)&(((typ *)0)->memb)))
+# define offsetof(typ,memb)     ((unsigned long)((char *)&(((typ *)0)->memb)))
 #endif
 
 #define LOWEST_BIT_SET(x)       ((x) & ~((x) - 1))
 
+#ifndef __KERNEL__
+/* Userpace byte flipping */
+# include <endian.h>
+# include <byteswap.h>
+# define __swab16(x) bswap_16(x)
+# define __swab32(x) bswap_32(x)
+# define __swab64(x) bswap_64(x)
+# define __swab16s(x) do {*(x) = bswap_16(*(x));} while (0)
+# define __swab32s(x) do {*(x) = bswap_32(*(x));} while (0)
+# define __swab64s(x) do {*(x) = bswap_64(*(x));} while (0)
+# if __BYTE_ORDER == __LITTLE_ENDIAN
+#  define le16_to_cpu(x) (x)
+#  define cpu_to_le16(x) (x)
+#  define le32_to_cpu(x) (x)
+#  define cpu_to_le32(x) (x)
+#  define le64_to_cpu(x) (x)
+#  define cpu_to_le64(x) (x)
+# else
+#  if __BYTE_ORDER == __BIG_ENDIAN
+#   define le16_to_cpu(x) bswap_16(x)
+#   define cpu_to_le16(x) bswap_16(x)
+#   define le32_to_cpu(x) bswap_32(x)
+#   define cpu_to_le32(x) bswap_32(x)
+#   define le64_to_cpu(x) bswap_64(x)
+#   define cpu_to_le64(x) bswap_64(x)
+#  else
+#   error "Unknown byte order"
+#  endif /* __BIG_ENDIAN */
+# endif /* __LITTLE_ENDIAN */
+#endif /* ! __KERNEL__ */
+
 /*
  *  Debugging
  */
@@ -20,7 +66,6 @@ extern unsigned int portal_stack;
 extern unsigned int portal_debug;
 extern unsigned int portal_printk;
 
-#include <asm/types.h>
 struct ptldebug_header {
         __u32 ph_len;
         __u32 ph_flags;
@@ -60,7 +105,10 @@ struct ptldebug_header {
 #define S_GMNAL       0x00080000
 #define S_PTLROUTER   0x00100000
 #define S_COBD        0x00200000
-#define S_IBNAL       0x00400000
+#define S_IBNAL       0x00400000 /* All IB NALs */
+#define S_SM          0x00800000
+#define S_ASOBD       0x01000000
+#define S_CONFOBD     0x02000000
 
 /* If you change these values, please keep portals/utils/debug.c
  * up to date! */
@@ -109,7 +157,7 @@ struct ptldebug_header {
 #  define CDEBUG_STACK (THREAD_SIZE -                                      \
                         ((unsigned long)__builtin_frame_address(0) &       \
                          (THREAD_SIZE - 1)))
-# endif
+# endif /* __ia64__ */
 
 #define CHECK_STACK(stack)                                                    \
         do {                                                                  \
@@ -121,7 +169,7 @@ struct ptldebug_header {
                       /*panic("LBUG");*/                                      \
                 }                                                             \
         } while (0)
-#else /* __KERNEL__ */
+#else /* !__KERNEL__ */
 #define CHECK_STACK(stack) do { } while(0)
 #define CDEBUG_STACK (0L)
 #endif /* __KERNEL__ */
@@ -152,14 +200,14 @@ do {                                                                          \
                 if (cdebug_count) {                                           \
                         portals_debug_msg(DEBUG_SUBSYSTEM, cdebug_mask,       \
                                           __FILE__, __FUNCTION__, __LINE__,   \
-                                          0, cdebug_format, ## a);            \
+                                          CDEBUG_STACK, cdebug_format, ## a); \
                         cdebug_count = 0;                                     \
                 }                                                             \
                 if (time_after(jiffies, cdebug_next+(CDEBUG_MAX_LIMIT+10)*HZ))\
                         cdebug_delay = cdebug_delay > 8 ? cdebug_delay/8 : 1; \
                 else                                                          \
-                        cdebug_delay = cdebug_delay*2 >= CDEBUG_MAX_LIMIT*HZ\
-                                CDEBUG_MAX_LIMIT * HZ : cdebug_delay*2;       \
+                        cdebug_delay = cdebug_delay*2 >= CDEBUG_MAX_LIMIT*HZ ?\
+                                        CDEBUG_MAX_LIMIT*HZ : cdebug_delay*2; \
                 cdebug_next = jiffies + cdebug_delay;                         \
         } else {                                                              \
                 portals_debug_msg(DEBUG_SUBSYSTEM,                            \
@@ -202,15 +250,33 @@ do {                                                                    \
 } while(0)
 #else
 #define CDEBUG(mask, format, a...)      do { } while (0)
-#define CWARN(format, a...)             do { } while (0)
-#define CERROR(format, a...)            printk("<3>" format, ## a)
-#define CEMERG(format, a...)            printk("<0>" format, ## a)
+#define CWARN(format, a...)             printk(KERN_WARNING format, ## a)
+#define CERROR(format, a...)            printk(KERN_ERR format, ## a)
+#define CEMERG(format, a...)            printk(KERN_EMERG format, ## a)
 #define GOTO(label, rc)                 do { (void)(rc); goto label; } while (0)
 #define RETURN(rc)                      return (rc)
 #define ENTRY                           do { } while (0)
 #define EXIT                            do { } while (0)
 #endif
 
+/* initial pid  */
+# if CRAY_PORTALS
+/* 
+ * 1) ptl_pid_t in cray portals is only 16 bits, not 32 bits, therefore this
+ *    is too big.
+ *
+ * 2) the implementation of ernal in cray portals further restricts the pid
+ *    space that may be used to 0 <= pid <= 255 (an 8 bit value).  Returns
+ *    an error at nal init time for any pid outside this range.  Other nals
+ *    in cray portals don't have this restriction.
+ * */
+#define LUSTRE_PTL_PID          9
+# else
+#define LUSTRE_PTL_PID          12345
+# endif
+
+#define LUSTRE_SRV_PTL_PID      LUSTRE_PTL_PID    
+
 #define PORTALS_CFG_VERSION 0x00010001;
 
 struct portals_cfg {
@@ -245,6 +311,11 @@ do {                                                    \
                                                         \
 } while (0)
 
+typedef int (nal_cmd_handler_fn)(struct portals_cfg *, void *);
+int libcfs_nal_cmd_register(int nal, nal_cmd_handler_fn *handler, void *arg);
+int libcfs_nal_cmd(struct portals_cfg *pcfg);
+void libcfs_nal_cmd_unregister(int nal);
+
 struct portal_ioctl_data {
         __u32 ioc_len;
         __u32 ioc_version;
@@ -277,6 +348,7 @@ struct portal_ioctl_data {
         char ioc_bulk[0];
 };
 
+
 #ifdef __KERNEL__
 
 #include <linux/list.h>
index 3d0aff0..5a43a45 100644 (file)
@@ -30,6 +30,7 @@
 # define CURRENT_SECONDS           get_seconds()
 # define smp_num_cpus              NR_CPUS
 
+
 #elif defined(CONFIG_RH_2_4_20) /* RH 2.4.x */
 
 # define SIGNAL_MASK_LOCK(task, flags)                                  \
index 609290d..8778a52 100644 (file)
@@ -64,114 +64,6 @@ static inline size_t round_strlen(char *fset)
         return size_round(strlen(fset) + 1);
 }
 
-#ifdef __KERNEL__
-static inline char *strdup(const char *str)
-{
-        int len = strlen(str) + 1;
-        char *tmp = kmalloc(len, GFP_KERNEL);
-        if (tmp)
-                memcpy(tmp, str, len);
-
-        return tmp;
-}
-#endif
-
-#ifdef __KERNEL__
-# define NTOH__u32(var) le32_to_cpu(var)
-# define NTOH__u64(var) le64_to_cpu(var)
-# define HTON__u32(var) cpu_to_le32(var)
-# define HTON__u64(var) cpu_to_le64(var)
-#else
-# define expansion_u64(var) \
-    ({  __u64 ret; \
-       switch (sizeof(var)) {   \
-       case 8: (ret) = (var); break; \
-       case 4: (ret) = (__u32)(var); break; \
-       case 2: (ret) = (__u16)(var); break; \
-       case 1: (ret) = (__u8)(var); break; \
-       };       \
-       (ret);     \
-    })
-# define NTOH__u32(var) (var)
-# define NTOH__u64(var) (expansion_u64(var))
-# define HTON__u32(var) (var)
-# define HTON__u64(var) (expansion_u64(var))
-#endif
-
-/* 
- * copy sizeof(type) bytes from pointer to var and move ptr forward.
- * return EFAULT if pointer goes beyond end
- */
-#define UNLOGV(var,type,ptr,end)                \
-do {                                            \
-        var = *(type *)ptr;                     \
-        ptr += sizeof(type);                    \
-        if (ptr > end )                         \
-                return -EFAULT;                 \
-} while (0)
-
-/* the following two macros convert to little endian */
-/* type MUST be __u32 or __u64 */
-#define LUNLOGV(var,type,ptr,end)               \
-do {                                            \
-        var = NTOH##type(*(type *)ptr);         \
-        ptr += sizeof(type);                    \
-        if (ptr > end )                         \
-                return -EFAULT;                 \
-} while (0)
-
-/* now log values */
-#define LOGV(var,type,ptr)                      \
-do {                                            \
-        *((type *)ptr) = var;                   \
-        ptr += sizeof(type);                    \
-} while (0)
-
-/* and in network order */
-#define LLOGV(var,type,ptr)                     \
-do {                                            \
-        *((type *)ptr) = HTON##type(var);       \
-        ptr += sizeof(type);                    \
-} while (0)
-
-
-/* 
- * set var to point at (type *)ptr, move ptr forward with sizeof(type)
- * return from function with EFAULT if ptr goes beyond end
- */
-#define UNLOGP(var,type,ptr,end)                \
-do {                                            \
-        var = (type *)ptr;                      \
-        ptr += sizeof(type);                    \
-        if (ptr > end )                         \
-                return -EFAULT;                 \
-} while (0)
-
-#define LOGP(var,type,ptr)                      \
-do {                                            \
-        memcpy(ptr, var, sizeof(type));         \
-        ptr += sizeof(type);                    \
-} while (0)
-
-/* 
- * set var to point at (char *)ptr, move ptr forward by size_round(len);
- * return from function with EFAULT if ptr goes beyond end
- */
-#define UNLOGL(var,type,len,ptr,end)            \
-do {                                            \
-        var = (type *)ptr;                      \
-        ptr += size_round(len * sizeof(type));  \
-        if (ptr > end )                         \
-                return -EFAULT;                 \
-} while (0)
-
-#define UNLOGL0(var,type,len,ptr,end)                                   \
-do {                                                                    \
-        UNLOGL(var,type,len,ptr,end);                                   \
-        if ( *((char *)ptr - size_round(len) + len - 1) != '\0')        \
-                return -EFAULT;                                         \
-} while (0)
-
 #define LOGL(var,len,ptr)                                       \
 do {                                                            \
         if (var)                                                \
index 5ed6090..4043f66 100644 (file)
@@ -4,7 +4,7 @@ if UTILS
 portals_HEADERS = list.h
 endif
 
-EXTRA_DIST = api.h api-support.h arg-blocks.h defines.h errno.h                \
-       internal.h lib-dispatch.h lib-nal.h lib-p30.h lib-types.h       \
-       list.h lltrace.h myrnal.h nal.h nalids.h p30.h ppid.h ptlctl.h  \
+EXTRA_DIST = api.h api-support.h build_check.h errno.h \
+       internal.h lib-p30.h lib-types.h list.h                  \
+       lltrace.h myrnal.h nal.h nalids.h p30.h ptlctl.h         \
        socknal.h stringtab.h types.h
index af4a2dc..c5994c6 100644 (file)
@@ -1,5 +1,5 @@
-# define DEBUG_SUBSYSTEM S_PORTALS
-# define PORTAL_DEBUG
+
+#include "build_check.h"
 
 #ifndef __KERNEL__
 # include <stdio.h>
@@ -19,9 +19,4 @@
 
 #include <portals/internal.h>
 #include <portals/nal.h>
-#include <portals/arg-blocks.h>
 
-/* Hack for 2.4.18 macro name collision */
-#ifdef yield
-#undef yield
-#endif
index a83749b..56b7b99 100644 (file)
@@ -1,56 +1,39 @@
 #ifndef P30_API_H
 #define P30_API_H
 
+#include "build_check.h"
+
 #include <portals/types.h>
 
-#ifndef PTL_NO_WRAP
-int PtlInit(void);
-int PtlInitialized(void);
+int PtlInit(int *);
 void PtlFini(void);
 
-int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size_in,
-              ptl_ac_index_t acl_size_in, ptl_pid_t requested_pid,
-              ptl_handle_ni_t * interface_out);
+int PtlNIInit(ptl_interface_t interface, ptl_pid_t requested_pid,
+             ptl_ni_limits_t *desired_limits, ptl_ni_limits_t *actual_limits,
+              ptl_handle_ni_t *interface_out);
 
 int PtlNIInitialized(ptl_interface_t);
 
 int PtlNIFini(ptl_handle_ni_t interface_in);
 
-#endif
-
 int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id);
 
+int PtlGetUid(ptl_handle_ni_t ni_handle, ptl_uid_t *uid);
+
 
 /*
  * Network interfaces
  */
 
-#ifndef PTL_NO_WRAP
-int PtlNIBarrier(ptl_handle_ni_t interface_in);
-#endif
-
 int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in,
                 ptl_sr_value_t * status_out);
 
 int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in,
               unsigned long *distance_out);
 
-#ifndef PTL_NO_WRAP
 int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * interface_out);
-#endif
 
 
-/*
- * PtlNIDebug: 
- *
- * This is not an official Portals 3 API call.  It is provided
- * by the reference implementation to allow the maintainers an
- * easy way to turn on and off debugging information in the
- * library.  Do not use it in code that is not intended for use
- * with any version other than the portable reference library.
- */
-unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in);
-
 /* 
  * PtlNIFailNid
  *
@@ -62,6 +45,13 @@ unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in);
  */
 int PtlFailNid (ptl_handle_ni_t ni, ptl_nid_t nid, unsigned int threshold);
 
+/*
+ * PtlSnprintHandle: 
+ *
+ * This is not an official Portals 3 API call.  It is provided
+ * so that an application can print an opaque handle.
+ */
+void PtlSnprintHandle (char *str, int str_len, ptl_handle_any_t handle);
 
 /*
  * Match entries
@@ -81,28 +71,23 @@ int PtlMEUnlink(ptl_handle_me_t current_in);
 
 int PtlMEUnlinkList(ptl_handle_me_t current_in);
 
-int PtlTblDump(ptl_handle_ni_t ni, int index_in);
-int PtlMEDump(ptl_handle_me_t current_in);
-
 
 
 /*
  * Memory descriptors
  */
 
-#ifndef PTL_NO_WRAP
 int PtlMDAttach(ptl_handle_me_t current_in, ptl_md_t md_in,
                 ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out);
 
 int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in,
-              ptl_handle_md_t * handle_out);
+             ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out);
 
 int PtlMDUnlink(ptl_handle_md_t md_in);
 
 int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t * old_inout,
                 ptl_md_t * new_inout, ptl_handle_eq_t testq_in);
 
-#endif
 
 /* These should not be called by users */
 int PtlMDUpdate_internal(ptl_handle_md_t md_in, ptl_md_t * old_inout,
@@ -115,24 +100,18 @@ int PtlMDUpdate_internal(ptl_handle_md_t md_in, ptl_md_t * old_inout,
 /*
  * Event queues
  */
-#ifndef PTL_NO_WRAP
-
-/* These should be called by users */
 int PtlEQAlloc(ptl_handle_ni_t ni_in, ptl_size_t count_in,
-               int (*callback) (ptl_event_t * event),
-               ptl_handle_eq_t * handle_out);
+               ptl_eq_handler_t handler,
+               ptl_handle_eq_t *handle_out);
 int PtlEQFree(ptl_handle_eq_t eventq_in);
 
-int PtlEQCount(ptl_handle_eq_t eventq_in, ptl_size_t * count_out);
-
 int PtlEQGet(ptl_handle_eq_t eventq_in, ptl_event_t * event_out);
 
 
 int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t * event_out);
 
-int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out,
-                      int timeout);
-#endif
+int PtlEQPoll(ptl_handle_eq_t *eventqs_in, int neq_in, int timeout,
+             ptl_event_t *event_out, int *which_out);
 
 /*
  * Access Control Table
diff --git a/lustre/portals/include/portals/arg-blocks.h b/lustre/portals/include/portals/arg-blocks.h
deleted file mode 100644 (file)
index 3c3b154..0000000
+++ /dev/null
@@ -1,265 +0,0 @@
-#ifndef PTL_BLOCKS_H
-#define PTL_BLOCKS_H
-
-/*
- * blocks.h
- *
- * Argument block types for the Portals 3.0 library
- * Generated by idl
- *
- */
-
-#include <portals/types.h>
-
-/* put LIB_MAX_DISPATCH last here  -- these must match the
-   assignements to the dispatch table in lib-p30/dispatch.c */
-#define PTL_GETID     1
-#define PTL_NISTATUS  2
-#define PTL_NIDIST    3
-#define PTL_NIDEBUG   4
-#define PTL_MEATTACH  5
-#define PTL_MEINSERT  6
-// #define PTL_MEPREPEND 7
-#define PTL_MEUNLINK  8
-#define PTL_TBLDUMP   9 
-#define PTL_MEDUMP   10
-#define PTL_MDATTACH 11
-// #define PTL_MDINSERT 12
-#define PTL_MDBIND   13
-#define PTL_MDUPDATE 14
-#define PTL_MDUNLINK 15
-#define PTL_EQALLOC  16
-#define PTL_EQFREE   17
-#define PTL_ACENTRY  18
-#define PTL_PUT      19 
-#define PTL_GET      20
-#define PTL_FAILNID  21
-#define LIB_MAX_DISPATCH 21
-
-typedef struct PtlFailNid_in {
-       ptl_handle_ni_t interface;
-       ptl_nid_t       nid;
-       unsigned int    threshold;
-} PtlFailNid_in;
-
-typedef struct PtlFailNid_out {
-       int             rc;
-} PtlFailNid_out;
-
-typedef struct PtlGetId_in {
-        ptl_handle_ni_t handle_in;
-} PtlGetId_in;
-
-typedef struct PtlGetId_out {
-        int rc;
-        ptl_process_id_t id_out;
-} PtlGetId_out;
-
-typedef struct PtlNIStatus_in {
-        ptl_handle_ni_t interface_in;
-        ptl_sr_index_t register_in;
-} PtlNIStatus_in;
-
-typedef struct PtlNIStatus_out {
-        int rc;
-        ptl_sr_value_t status_out;
-} PtlNIStatus_out;
-
-
-typedef struct PtlNIDist_in {
-        ptl_handle_ni_t interface_in;
-        ptl_process_id_t process_in;
-} PtlNIDist_in;
-
-typedef struct PtlNIDist_out {
-        int rc;
-        unsigned long distance_out;
-} PtlNIDist_out;
-
-
-typedef struct PtlNIDebug_in {
-        unsigned int mask_in;
-} PtlNIDebug_in;
-
-typedef struct PtlNIDebug_out {
-        unsigned int rc;
-} PtlNIDebug_out;
-
-
-typedef struct PtlMEAttach_in {
-        ptl_handle_ni_t interface_in;
-        ptl_pt_index_t index_in;
-        ptl_ins_pos_t position_in;
-        ptl_process_id_t match_id_in;
-        ptl_match_bits_t match_bits_in;
-        ptl_match_bits_t ignore_bits_in;
-        ptl_unlink_t unlink_in;
-} PtlMEAttach_in;
-
-typedef struct PtlMEAttach_out {
-        int rc;
-        ptl_handle_me_t handle_out;
-} PtlMEAttach_out;
-
-
-typedef struct PtlMEInsert_in {
-        ptl_handle_me_t current_in;
-        ptl_process_id_t match_id_in;
-        ptl_match_bits_t match_bits_in;
-        ptl_match_bits_t ignore_bits_in;
-        ptl_unlink_t unlink_in;
-        ptl_ins_pos_t position_in;
-} PtlMEInsert_in;
-
-typedef struct PtlMEInsert_out {
-        int rc;
-        ptl_handle_me_t handle_out;
-} PtlMEInsert_out;
-
-typedef struct PtlMEUnlink_in {
-        ptl_handle_me_t current_in;
-        ptl_unlink_t unlink_in;
-} PtlMEUnlink_in;
-
-typedef struct PtlMEUnlink_out {
-        int rc;
-} PtlMEUnlink_out;
-
-
-typedef struct PtlTblDump_in {
-        int index_in;
-} PtlTblDump_in;
-
-typedef struct PtlTblDump_out {
-        int rc;
-} PtlTblDump_out;
-
-
-typedef struct PtlMEDump_in {
-        ptl_handle_me_t current_in;
-} PtlMEDump_in;
-
-typedef struct PtlMEDump_out {
-        int rc;
-} PtlMEDump_out;
-
-
-typedef struct PtlMDAttach_in {
-        ptl_handle_me_t me_in;
-        ptl_handle_eq_t eq_in;
-        ptl_md_t md_in;
-        ptl_unlink_t unlink_in;
-} PtlMDAttach_in;
-
-typedef struct PtlMDAttach_out {
-        int rc;
-        ptl_handle_md_t handle_out;
-} PtlMDAttach_out;
-
-
-typedef struct PtlMDBind_in {
-        ptl_handle_ni_t ni_in;
-        ptl_handle_eq_t eq_in;
-        ptl_md_t md_in;
-} PtlMDBind_in;
-
-typedef struct PtlMDBind_out {
-        int rc;
-        ptl_handle_md_t handle_out;
-} PtlMDBind_out;
-
-
-typedef struct PtlMDUpdate_internal_in {
-        ptl_handle_md_t md_in;
-        ptl_handle_eq_t testq_in;
-        ptl_seq_t sequence_in;
-
-        ptl_md_t old_inout;
-        int old_inout_valid;
-        ptl_md_t new_inout;
-        int new_inout_valid;
-} PtlMDUpdate_internal_in;
-
-typedef struct PtlMDUpdate_internal_out {
-        int rc;
-        ptl_md_t old_inout;
-        ptl_md_t new_inout;
-} PtlMDUpdate_internal_out;
-
-
-typedef struct PtlMDUnlink_in {
-        ptl_handle_md_t md_in;
-} PtlMDUnlink_in;
-
-typedef struct PtlMDUnlink_out {
-        int rc;
-        ptl_md_t status_out;
-} PtlMDUnlink_out;
-
-
-typedef struct PtlEQAlloc_in {
-        ptl_handle_ni_t ni_in;
-        ptl_size_t count_in;
-        void *base_in;
-        int len_in;
-        int (*callback_in) (ptl_event_t * event);
-} PtlEQAlloc_in;
-
-typedef struct PtlEQAlloc_out {
-        int rc;
-        ptl_handle_eq_t handle_out;
-} PtlEQAlloc_out;
-
-
-typedef struct PtlEQFree_in {
-        ptl_handle_eq_t eventq_in;
-} PtlEQFree_in;
-
-typedef struct PtlEQFree_out {
-        int rc;
-} PtlEQFree_out;
-
-
-typedef struct PtlACEntry_in {
-        ptl_handle_ni_t ni_in;
-        ptl_ac_index_t index_in;
-        ptl_process_id_t match_id_in;
-        ptl_pt_index_t portal_in;
-} PtlACEntry_in;
-
-typedef struct PtlACEntry_out {
-        int rc;
-} PtlACEntry_out;
-
-
-typedef struct PtlPut_in {
-        ptl_handle_md_t md_in;
-        ptl_ack_req_t ack_req_in;
-        ptl_process_id_t target_in;
-        ptl_pt_index_t portal_in;
-        ptl_ac_index_t cookie_in;
-        ptl_match_bits_t match_bits_in;
-        ptl_size_t offset_in;
-        ptl_hdr_data_t hdr_data_in;
-} PtlPut_in;
-
-typedef struct PtlPut_out {
-        int rc;
-} PtlPut_out;
-
-
-typedef struct PtlGet_in {
-        ptl_handle_md_t md_in;
-        ptl_process_id_t target_in;
-        ptl_pt_index_t portal_in;
-        ptl_ac_index_t cookie_in;
-        ptl_match_bits_t match_bits_in;
-        ptl_size_t offset_in;
-} PtlGet_in;
-
-typedef struct PtlGet_out {
-        int rc;
-} PtlGet_out;
-
-
-#endif
index 5db1352..c219d2a 100644 (file)
@@ -1,7 +1,7 @@
 #ifndef _BUILD_CHECK_H
 #define _BUILD_CHECK_H
 
-#ifdef CRAY_PORTALS
+#if CRAY_PORTALS
 #error "an application got to me instead of cray's includes"
 #endif
 
diff --git a/lustre/portals/include/portals/defines.h b/lustre/portals/include/portals/defines.h
deleted file mode 100644 (file)
index 785ce73..0000000
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
-**
-** This files contains definitions that are used throughout the cplant code.
-*/
-
-#ifndef CPLANT_H
-#define CPLANT_H
-
-#define TITLE(fname,zmig)
-
-
-/*
-** TRUE and FALSE
-*/
-#undef TRUE
-#define TRUE           (1)
-#undef FALSE
-#define FALSE          (0)
-
-
-/*
-** Return codes from functions
-*/
-#undef OK
-#define OK             (0)
-#undef ERROR
-#define ERROR          (-1)
-
-
-
-/*
-** The GCC macro for a safe max() that works on all types arithmetic types.
-*/
-#ifndef MAX
-#define MAX(a, b)      (a) > (b) ? (a) : (b)
-#endif /* MAX */
-
-#ifndef MIN
-#define MIN(a, b)      (a) < (b) ? (a) : (b)
-#endif /* MIN */
-
-/*
-** The rest is from the old qkdefs.h
-*/
-
-#ifndef __linux__
-#define __inline__
-#endif
-
-#ifndef NULL
-#define NULL ((void *)0)
-#endif
-
-#ifndef __osf__
-#define PRIVATE static
-#define PUBLIC
-#endif
-
-#ifndef __osf__
-typedef unsigned char           uchar;
-#endif
-
-typedef char                    CHAR;
-typedef unsigned char           UCHAR;
-typedef char                    INT8;
-typedef unsigned char           UINT8;
-typedef short int               INT16;
-typedef unsigned short int      UINT16;
-typedef int                     INT32;
-typedef unsigned int            UINT32;
-typedef long                    LONG32;
-typedef unsigned long           ULONG32;
-
-/* long may be 32 or 64, so we can't really append the size to the definition */
-typedef long                    LONG;
-typedef unsigned long           ULONG;
-
-#ifdef __alpha__
-typedef long int_t;
-#ifndef __osf__
-typedef unsigned long uint_t;
-#endif
-#endif
-
-#ifdef __i386__
-typedef int int_t;
-typedef unsigned int uint_t;
-#endif
-
-typedef float                   FLOAT32;
-typedef double                  FLOAT64;
-typedef void                    VOID;
-typedef INT32                   BOOLEAN;
-typedef void (*FCN_PTR)(void);
-
-#ifndef off64_t
-
-#if defined (__alpha__) || defined (__ia64__)
-typedef long                     off64_t;
-#else
-typedef long long                off64_t;
-#endif
-
-#endif
-
-/*
-** Process related typedefs
-*/
-typedef UINT16 PID_TYPE;  /* Type of Local process ID */
-typedef UINT16 NID_TYPE;  /* Type of Physical node ID */
-typedef UINT16 GID_TYPE;  /* Type of Group ID */
-typedef UINT16 RANK_TYPE; /* Type of Logical rank/process within a group */
-
-
-
-#endif /* CPLANT_H */
index 08f084a..42f2626 100644 (file)
@@ -1,6 +1,7 @@
 #ifndef _P30_ERRNO_H_
 #define _P30_ERRNO_H_
 
+#include "build_check.h"
 /*
  * include/portals/errno.h
  *
 
 /* If you change these, you must update the string table in api-errno.c */
 typedef enum {
-        PTL_OK              = 0,
-        PTL_SEGV            = 1,
-
-        PTL_NOSPACE         = 2,
-        PTL_INUSE           = 3,
-        PTL_VAL_FAILED      = 4,
-
-        PTL_NAL_FAILED      = 5,
-        PTL_NOINIT          = 6,
-        PTL_INIT_DUP        = 7,
-        PTL_INIT_INV        = 8,
-        PTL_AC_INV_INDEX    = 9,
-
-        PTL_INV_ASIZE       = 10,
-        PTL_INV_HANDLE      = 11,
-        PTL_INV_MD          = 12,
-        PTL_INV_ME          = 13,
-        PTL_INV_NI          = 14,
+        PTL_OK                 = 0,
+        PTL_SEGV               = 1,
+
+        PTL_NO_SPACE           = 2,
+        PTL_ME_IN_USE          = 3,
+        PTL_VAL_FAILED         = 4,
+
+        PTL_NAL_FAILED         = 5,
+        PTL_NO_INIT            = 6,
+        PTL_IFACE_DUP          = 7,
+        PTL_IFACE_INVALID      = 8,
+
+        PTL_HANDLE_INVALID     = 9,
+        PTL_MD_INVALID         = 10,
+        PTL_ME_INVALID         = 11,
 /* If you change these, you must update the string table in api-errno.c */
-        PTL_ILL_MD          = 15,
-        PTL_INV_PROC        = 16,
-        PTL_INV_PSIZE       = 17,
-        PTL_INV_PTINDEX     = 18,
-        PTL_INV_REG         = 19,
-
-        PTL_INV_SR_INDX     = 20,
-        PTL_ML_TOOLONG      = 21,
-        PTL_ADDR_UNKNOWN    = 22,
-        PTL_INV_EQ          = 23,
-        PTL_EQ_DROPPED      = 24,
-
-        PTL_EQ_EMPTY        = 25,
-        PTL_NOUPDATE        = 26,
-        PTL_FAIL            = 27,
-        PTL_NOT_IMPLEMENTED = 28,
-        PTL_NO_ACK          = 29,
-
-        PTL_IOV_TOO_MANY    = 30,
-        PTL_IOV_TOO_SMALL   = 31,
-
-       PTL_EQ_INUSE        = 32,
-
-        PTL_MAX_ERRNO       = 32
+        PTL_PROCESS_INVALID    = 12,
+        PTL_PT_INDEX_INVALID   = 13,
+
+        PTL_SR_INDEX_INVALID   = 14,
+        PTL_EQ_INVALID         = 15,
+        PTL_EQ_DROPPED         = 16,
+
+        PTL_EQ_EMPTY           = 17,
+        PTL_MD_NO_UPDATE       = 18,
+        PTL_FAIL               = 19,
+
+        PTL_IOV_INVALID        = 20,
+
+       PTL_EQ_IN_USE           = 21,
+
+       PTL_NI_INVALID          = 22,
+       PTL_MD_ILLEGAL          = 23,
+       
+        PTL_MAX_ERRNO          = 24
 } ptl_err_t;
 /* If you change these, you must update the string table in api-errno.c */
 
diff --git a/lustre/portals/include/portals/lib-dispatch.h b/lustre/portals/include/portals/lib-dispatch.h
deleted file mode 100644 (file)
index f87ff83..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-#ifndef PTL_DISPATCH_H
-#define PTL_DISPATCH_H
-
-/*
- * include/dispatch.h
- *
- * Dispatch table header and externs for remote side
- * operations
- *
- * Generated by idl
- *
- */
-
-#include <portals/lib-p30.h>
-#include <portals/arg-blocks.h>
-
-extern int do_PtlGetId(nal_cb_t * nal, void *private, void *args, void *ret);
-extern int do_PtlNIStatus(nal_cb_t * nal, void *private, void *args, void *ret);
-extern int do_PtlNIDist(nal_cb_t * nal, void *private, void *args, void *ret);
-extern int do_PtlNIDebug(nal_cb_t * nal, void *private, void *args, void *ret);
-extern int do_PtlMEAttach(nal_cb_t * nal, void *private, void *args, void *ret);
-extern int do_PtlMEInsert(nal_cb_t * nal, void *private, void *args, void *ret);
-extern int do_PtlMEPrepend(nal_cb_t * nal, void *private, void *args,
-                           void *ret);
-extern int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *args, void *ret);
-extern int do_PtlTblDump(nal_cb_t * nal, void *private, void *args, void *ret);
-extern int do_PtlMEDump(nal_cb_t * nal, void *private, void *args, void *ret);
-extern int do_PtlMDAttach(nal_cb_t * nal, void *private, void *args,
-                                   void *ret);
-extern int do_PtlMDBind(nal_cb_t * nal, void *private, void *args,
-                                 void *ret);
-extern int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *args,
-                                   void *ret);
-extern int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *args,
-                                   void *ret);
-extern int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *args,
-                                  void *ret);
-extern int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *args,
-                                 void *ret);
-extern int do_PtlPut(nal_cb_t * nal, void *private, void *args, void *ret);
-extern int do_PtlGet(nal_cb_t * nal, void *private, void *args, void *ret);
-extern int do_PtlFailNid (nal_cb_t *nal, void *private, void *args, void *ret);
-
-extern char *dispatch_name(int index);
-#endif
diff --git a/lustre/portals/include/portals/lib-nal.h b/lustre/portals/include/portals/lib-nal.h
deleted file mode 100644 (file)
index 0bf557e..0000000
+++ /dev/null
@@ -1,115 +0,0 @@
-#ifndef _LIB_NAL_H_
-#define _LIB_NAL_H_
-
-/*
- * nal.h
- *
- * Library side headers that define the abstraction layer's
- * responsibilities and interfaces
- */
-
-#include <portals/lib-types.h>
-
-struct nal_cb_t {
-       /*
-        * Per interface portal table, access control table
-        * and NAL private data field;
-        */
-       lib_ni_t ni;
-       void *nal_data;
-       /*
-        * send: Sends a preformatted header and payload data to a
-        * specified remote process. The payload is scattered over 'niov'
-        * fragments described by iov, starting at 'offset' for 'mlen'
-        * bytes.  
-        * NB the NAL may NOT overwrite iov.  
-        * PTL_OK on success => NAL has committed to send and will call
-        * lib_finalize on completion
-        */
-       ptl_err_t (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie, 
-                             ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
-                             unsigned int niov, struct iovec *iov, 
-                             size_t offset, size_t mlen);
-
-       /* as send, but with a set of page fragments (NULL if not supported) */
-       ptl_err_t (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, 
-                                   ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
-                                   unsigned int niov, ptl_kiov_t *iov, 
-                                   size_t offset, size_t mlen);
-       /*
-        * recv: Receives an incoming message from a remote process.  The
-        * payload is to be received into the scattered buffer of 'niov'
-        * fragments described by iov, starting at 'offset' for 'mlen'
-        * bytes.  Payload bytes after 'mlen' up to 'rlen' are to be
-        * discarded.  
-        * NB the NAL may NOT overwrite iov.
-        * PTL_OK on success => NAL has committed to receive and will call
-        * lib_finalize on completion
-        */
-       ptl_err_t (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
-                             unsigned int niov, struct iovec *iov, 
-                             size_t offset, size_t mlen, size_t rlen);
-
-       /* as recv, but with a set of page fragments (NULL if not supported) */
-       ptl_err_t (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
-                                   unsigned int niov, ptl_kiov_t *iov, 
-                                   size_t offset, size_t mlen, size_t rlen);
-       /*
-        * read: Reads a block of data from a specified user address
-        */
-       ptl_err_t (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr,
-                             user_ptr src_addr, size_t len);
-
-       /*
-        * write: Writes a block of data into a specified user address
-        */
-       ptl_err_t (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr,
-                              void *src_addr, size_t len);
-
-       /*
-        * callback: Calls an event callback
-        * NULL => lib calls eq's callback (if any) directly.
-        */
-       void (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq,
-                            ptl_event_t *ev);
-
-       /*
-        *  malloc: Acquire a block of memory in a system independent
-        * fashion.
-        */
-       void *(*cb_malloc) (nal_cb_t * nal, size_t len);
-
-       void (*cb_free) (nal_cb_t * nal, void *buf, size_t len);
-
-       /*
-        * (un)map: Tell the NAL about some memory it will access.
-        * *addrkey passed to cb_unmap() is what cb_map() set it to.
-        * type of *iov depends on options.
-        * Set to NULL if not required.
-        */
-       ptl_err_t (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, 
-                            void **addrkey);
-       void (*cb_unmap) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, 
-                         void **addrkey);
-
-       /* as (un)map, but with a set of page fragments */
-       ptl_err_t (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, 
-                                  void **addrkey);
-       void (*cb_unmap_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, 
-                         void **addrkey);
-
-       void (*cb_printf) (nal_cb_t * nal, const char *fmt, ...);
-
-       /* Turn interrupts off (begin of protected area) */
-       void (*cb_cli) (nal_cb_t * nal, unsigned long *flags);
-
-       /* Turn interrupts on (end of protected area) */
-       void (*cb_sti) (nal_cb_t * nal, unsigned long *flags);
-
-       /*
-        * Calculate a network "distance" to given node
-        */
-       int (*cb_dist) (nal_cb_t * nal, ptl_nid_t nid, unsigned long *dist);
-};
-
-#endif
index b1a6e04..4daf219 100644 (file)
@@ -9,19 +9,21 @@
 #ifndef _LIB_P30_H_
 #define _LIB_P30_H_
 
+#include "build_check.h"
+
 #ifdef __KERNEL__
 # include <asm/page.h>
 # include <linux/string.h>
 #else
 # include <portals/list.h>
 # include <string.h>
+# include <pthread.h>
 #endif
 #include <portals/types.h>
 #include <linux/kp30.h>
 #include <portals/p30.h>
+#include <portals/nal.h>
 #include <portals/lib-types.h>
-#include <portals/lib-nal.h>
-#include <portals/lib-dispatch.h>
 
 static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh)
 {
@@ -29,17 +31,18 @@ static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh)
                 wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie);
 }
 
-#define state_lock(nal,flagsp)                          \
-do {                                                    \
-        CDEBUG(D_PORTALS, "taking state lock\n");       \
-        nal->cb_cli(nal, flagsp);                       \
-} while (0)
+#ifdef __KERNEL__
+#define LIB_LOCK(nal,flags)                                     \
+        spin_lock_irqsave(&(nal)->libnal_ni.ni_lock, flags)
+#define LIB_UNLOCK(nal,flags)                                   \
+        spin_unlock_irqrestore(&(nal)->libnal_ni.ni_lock, flags)
+#else
+#define LIB_LOCK(nal,flags)                                             \
+        (pthread_mutex_lock(&(nal)->libnal_ni.ni_mutex), (flags) = 0)
+#define LIB_UNLOCK(nal,flags)                                   \
+        pthread_mutex_unlock(&(nal)->libnal_ni.ni_mutex)
+#endif
 
-#define state_unlock(nal,flagsp)                        \
-{                                                       \
-        CDEBUG(D_PORTALS, "releasing state lock\n");    \
-        nal->cb_sti(nal, flagsp);                       \
-}
 
 #ifdef PTL_USE_LIB_FREELIST
 
@@ -48,13 +51,13 @@ do {                                                    \
 #define MAX_MSGS        2048    /* Outstanding messages */
 #define MAX_EQS         512
 
-extern int lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int nobj, int objsize);
-extern void lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl);
+extern int lib_freelist_init (lib_nal_t *nal, lib_freelist_t *fl, int nobj, int objsize);
+extern void lib_freelist_fini (lib_nal_t *nal, lib_freelist_t *fl);
 
 static inline void *
 lib_freelist_alloc (lib_freelist_t *fl)
 {
-        /* ALWAYS called with statelock held */
+        /* ALWAYS called with liblock held */
         lib_freeobj_t *o;
 
         if (list_empty (&fl->fl_list))
@@ -68,7 +71,7 @@ lib_freelist_alloc (lib_freelist_t *fl)
 static inline void
 lib_freelist_free (lib_freelist_t *fl, void *obj)
 {
-        /* ALWAYS called with statelock held */
+        /* ALWAYS called with liblock held */
         lib_freeobj_t *o = list_entry (obj, lib_freeobj_t, fo_contents);
         
         list_add (&o->fo_list, &fl->fl_list);
@@ -76,78 +79,78 @@ lib_freelist_free (lib_freelist_t *fl, void *obj)
 
 
 static inline lib_eq_t *
-lib_eq_alloc (nal_cb_t *nal)
+lib_eq_alloc (lib_nal_t *nal)
 {
-        /* NEVER called with statelock held */
+        /* NEVER called with liblock held */
         unsigned long  flags;
         lib_eq_t      *eq;
         
-        state_lock (nal, &flags);
-        eq = (lib_eq_t *)lib_freelist_alloc (&nal->ni.ni_free_eqs);
-        state_unlock (nal, &flags);
+        LIB_LOCK (nal, flags);
+        eq = (lib_eq_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_eqs);
+        LIB_UNLOCK (nal, flags);
 
         return (eq);
 }
 
 static inline void
-lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
+lib_eq_free (lib_nal_t *nal, lib_eq_t *eq)
 {
-        /* ALWAYS called with statelock held */
-        lib_freelist_free (&nal->ni.ni_free_eqs, eq);
+        /* ALWAYS called with liblock held */
+        lib_freelist_free (&nal->libnal_ni.ni_free_eqs, eq);
 }
 
 static inline lib_md_t *
-lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd)
+lib_md_alloc (lib_nal_t *nal, ptl_md_t *umd)
 {
-        /* NEVER called with statelock held */
+        /* NEVER called with liblock held */
         unsigned long  flags;
         lib_md_t      *md;
         
-        state_lock (nal, &flags);
-        md = (lib_md_t *)lib_freelist_alloc (&nal->ni.ni_free_mds);
-        state_unlock (nal, &flags);
+        LIB_LOCK (nal, flags);
+        md = (lib_md_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_mds);
+        LIB_UNLOCK (nal, flags);
 
         return (md);
 }
 
 static inline void
-lib_md_free (nal_cb_t *nal, lib_md_t *md)
+lib_md_free (lib_nal_t *nal, lib_md_t *md)
 {
-        /* ALWAYS called with statelock held */
-        lib_freelist_free (&nal->ni.ni_free_mds, md);
+        /* ALWAYS called with liblock held */
+        lib_freelist_free (&nal->libnal_ni.ni_free_mds, md);
 }
 
 static inline lib_me_t *
-lib_me_alloc (nal_cb_t *nal)
+lib_me_alloc (lib_nal_t *nal)
 {
-        /* NEVER called with statelock held */
+        /* NEVER called with liblock held */
         unsigned long  flags;
         lib_me_t      *me;
         
-        state_lock (nal, &flags);
-        me = (lib_me_t *)lib_freelist_alloc (&nal->ni.ni_free_mes);
-        state_unlock (nal, &flags);
+        LIB_LOCK (nal, flags);
+        me = (lib_me_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_mes);
+        LIB_UNLOCK (nal, flags);
         
         return (me);
 }
 
 static inline void
-lib_me_free (nal_cb_t *nal, lib_me_t *me)
+lib_me_free (lib_nal_t *nal, lib_me_t *me)
 {
-        /* ALWAYS called with statelock held */
-        lib_freelist_free (&nal->ni.ni_free_mes, me);
+        /* ALWAYS called with liblock held */
+        lib_freelist_free (&nal->libnal_ni.ni_free_mes, me);
 }
 
 static inline lib_msg_t *
-lib_msg_alloc (nal_cb_t *nal)
+lib_msg_alloc (lib_nal_t *nal)
 {
-        /* NEVER called with statelock held */
+        /* NEVER called with liblock held */
         unsigned long  flags;
         lib_msg_t     *msg;
         
-        state_lock (nal, &flags);
-        msg = (lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs);
-        state_unlock (nal, &flags);
+        LIB_LOCK (nal, flags);
+        msg = (lib_msg_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_msgs);
+        LIB_UNLOCK (nal, flags);
 
         if (msg != NULL) {
                 /* NULL pointers, clear flags etc */
@@ -158,18 +161,18 @@ lib_msg_alloc (nal_cb_t *nal)
 }
 
 static inline void
-lib_msg_free (nal_cb_t *nal, lib_msg_t *msg)
+lib_msg_free (lib_nal_t *nal, lib_msg_t *msg)
 {
-        /* ALWAYS called with statelock held */
-        lib_freelist_free (&nal->ni.ni_free_msgs, msg);
+        /* ALWAYS called with liblock held */
+        lib_freelist_free (&nal->libnal_ni.ni_free_msgs, msg);
 }
 
 #else
 
 static inline lib_eq_t *
-lib_eq_alloc (nal_cb_t *nal)
+lib_eq_alloc (lib_nal_t *nal)
 {
-        /* NEVER called with statelock held */
+        /* NEVER called with liblock held */
         lib_eq_t *eq;
 
         PORTAL_ALLOC(eq, sizeof(*eq));
@@ -177,26 +180,26 @@ lib_eq_alloc (nal_cb_t *nal)
 }
 
 static inline void
-lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
+lib_eq_free (lib_nal_t *nal, lib_eq_t *eq)
 {
-        /* ALWAYS called with statelock held */
+        /* ALWAYS called with liblock held */
         PORTAL_FREE(eq, sizeof(*eq));
 }
 
 static inline lib_md_t *
-lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd)
+lib_md_alloc (lib_nal_t *nal, ptl_md_t *umd)
 {
-        /* NEVER called with statelock held */
+        /* NEVER called with liblock held */
         lib_md_t *md;
         int       size;
         int       niov;
 
         if ((umd->options & PTL_MD_KIOV) != 0) {
-                niov = umd->niov;
+                niov = umd->length;
                 size = offsetof(lib_md_t, md_iov.kiov[niov]);
         } else {
-                niov = ((umd->options & PTL_MD_IOV) != 0) ?
-                       umd->niov : 1;
+                niov = ((umd->options & PTL_MD_IOVEC) != 0) ?
+                       umd->length : 1;
                 size = offsetof(lib_md_t, md_iov.iov[niov]);
         }
 
@@ -212,9 +215,9 @@ lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd)
 }
 
 static inline void 
-lib_md_free (nal_cb_t *nal, lib_md_t *md)
+lib_md_free (lib_nal_t *nal, lib_md_t *md)
 {
-        /* ALWAYS called with statelock held */
+        /* ALWAYS called with liblock held */
         int       size;
 
         if ((md->options & PTL_MD_KIOV) != 0)
@@ -226,9 +229,9 @@ lib_md_free (nal_cb_t *nal, lib_md_t *md)
 }
 
 static inline lib_me_t *
-lib_me_alloc (nal_cb_t *nal)
+lib_me_alloc (lib_nal_t *nal)
 {
-        /* NEVER called with statelock held */
+        /* NEVER called with liblock held */
         lib_me_t *me;
 
         PORTAL_ALLOC(me, sizeof(*me));
@@ -236,16 +239,16 @@ lib_me_alloc (nal_cb_t *nal)
 }
 
 static inline void 
-lib_me_free(nal_cb_t *nal, lib_me_t *me)
+lib_me_free(lib_nal_t *nal, lib_me_t *me)
 {
-        /* ALWAYS called with statelock held */
+        /* ALWAYS called with liblock held */
         PORTAL_FREE(me, sizeof(*me));
 }
 
 static inline lib_msg_t *
-lib_msg_alloc(nal_cb_t *nal)
+lib_msg_alloc(lib_nal_t *nal)
 {
-        /* NEVER called with statelock held; may be in interrupt... */
+        /* NEVER called with liblock held; may be in interrupt... */
         lib_msg_t *msg;
 
         if (in_interrupt())
@@ -262,27 +265,28 @@ lib_msg_alloc(nal_cb_t *nal)
 }
 
 static inline void 
-lib_msg_free(nal_cb_t *nal, lib_msg_t *msg)
+lib_msg_free(lib_nal_t *nal, lib_msg_t *msg)
 {
-        /* ALWAYS called with statelock held */
+        /* ALWAYS called with liblock held */
         PORTAL_FREE(msg, sizeof(*msg));
 }
 #endif
 
-extern lib_handle_t *lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type);
-extern void lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type);
-extern void lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh);
+extern lib_handle_t *lib_lookup_cookie (lib_nal_t *nal, __u64 cookie, int type);
+extern void lib_initialise_handle (lib_nal_t *nal, lib_handle_t *lh, int type);
+extern void lib_invalidate_handle (lib_nal_t *nal, lib_handle_t *lh);
 
 static inline void
-ptl_eq2handle (ptl_handle_eq_t *handle, lib_eq_t *eq)
+ptl_eq2handle (ptl_handle_eq_t *handle, lib_nal_t *nal, lib_eq_t *eq)
 {
+        handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx;
         handle->cookie = eq->eq_lh.lh_cookie;
 }
 
 static inline lib_eq_t *
-ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal)
+ptl_handle2eq (ptl_handle_eq_t *handle, lib_nal_t *nal)
 {
-        /* ALWAYS called with statelock held */
+        /* ALWAYS called with liblock held */
         lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, 
                                               PTL_COOKIE_TYPE_EQ);
         if (lh == NULL)
@@ -292,15 +296,16 @@ ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal)
 }
 
 static inline void
-ptl_md2handle (ptl_handle_md_t *handle, lib_md_t *md)
+ptl_md2handle (ptl_handle_md_t *handle, lib_nal_t *nal, lib_md_t *md)
 {
+        handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx;
         handle->cookie = md->md_lh.lh_cookie;
 }
 
 static inline lib_md_t *
-ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal)
+ptl_handle2md (ptl_handle_md_t *handle, lib_nal_t *nal)
 {
-        /* ALWAYS called with statelock held */
+        /* ALWAYS called with liblock held */
         lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie,
                                               PTL_COOKIE_TYPE_MD);
         if (lh == NULL)
@@ -310,12 +315,12 @@ ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal)
 }
 
 static inline lib_md_t *
-ptl_wire_handle2md (ptl_handle_wire_t *wh, nal_cb_t *nal)
+ptl_wire_handle2md (ptl_handle_wire_t *wh, lib_nal_t *nal)
 {
-        /* ALWAYS called with statelock held */
+        /* ALWAYS called with liblock held */
         lib_handle_t *lh;
         
-        if (wh->wh_interface_cookie != nal->ni.ni_interface_cookie)
+        if (wh->wh_interface_cookie != nal->libnal_ni.ni_interface_cookie)
                 return (NULL);
         
         lh = lib_lookup_cookie (nal, wh->wh_object_cookie,
@@ -327,15 +332,16 @@ ptl_wire_handle2md (ptl_handle_wire_t *wh, nal_cb_t *nal)
 }
 
 static inline void
-ptl_me2handle (ptl_handle_me_t *handle, lib_me_t *me)
+ptl_me2handle (ptl_handle_me_t *handle, lib_nal_t *nal, lib_me_t *me)
 {
+        handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx;
         handle->cookie = me->me_lh.lh_cookie;
 }
 
 static inline lib_me_t *
-ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal)
+ptl_handle2me (ptl_handle_me_t *handle, lib_nal_t *nal)
 {
-        /* ALWAYS called with statelock held */
+        /* ALWAYS called with liblock held */
         lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie,
                                               PTL_COOKIE_TYPE_ME);
         if (lh == NULL)
@@ -344,34 +350,30 @@ ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal)
         return (lh_entry (lh, lib_me_t, me_lh));
 }
 
-extern int lib_init(nal_cb_t * cb, ptl_nid_t nid, ptl_pid_t pid, int gsize,
-                    ptl_pt_index_t tbl_size, ptl_ac_index_t ac_size);
-extern int lib_fini(nal_cb_t * cb);
-extern void lib_dispatch(nal_cb_t * cb, void *private, int index,
-                         void *arg_block, void *ret_block);
-extern char *dispatch_name(int index);
+extern int lib_init(lib_nal_t *libnal, nal_t *apinal,
+                    ptl_process_id_t pid,
+                    ptl_ni_limits_t *desired_limits, 
+                    ptl_ni_limits_t *actual_limits);
+extern int lib_fini(lib_nal_t *libnal);
 
 /*
- * When the NAL detects an incoming message, it should call
- * lib_parse() decode it.  The NAL callbacks will be handed
- * the private cookie as a way for the NAL to maintain state
- * about which transaction is being processed.  An extra parameter,
- * lib_cookie will contain the necessary information for
- * finalizing the message.
- *
- * After it has finished the handling the message, it should
- * call lib_finalize() with the lib_cookie parameter.
- * Call backs will be made to write events, send acks or
- * replies and so on.
+ * When the NAL detects an incoming message header, it should call
+ * lib_parse() decode it.  If the message header is garbage, lib_parse()
+ * returns immediately with failure, otherwise the NAL callbacks will be
+ * called to receive the message body.  They are handed the private cookie
+ * as a way for the NAL to maintain state about which transaction is being
+ * processed.  An extra parameter, lib_msg contains the lib-level message
+ * state for passing to lib_finalize() when the message body has been
+ * received.
  */
-extern void lib_enq_event_locked (nal_cb_t *nal, void *private,
+extern void lib_enq_event_locked (lib_nal_t *nal, void *private,
                                   lib_eq_t *eq, ptl_event_t *ev);
-extern void lib_finalize (nal_cb_t *nal, void *private, lib_msg_t *msg, 
-                          ptl_err_t status);
-extern void lib_parse (nal_cb_t *nal, ptl_hdr_t *hdr, void *private);
-extern lib_msg_t *lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, 
-                                      lib_md_t *getmd);
-extern void print_hdr (nal_cb_t * nal, ptl_hdr_t * hdr);
+extern void lib_finalize (lib_nal_t *nal, void *private, lib_msg_t *msg, 
+                          ptl_ni_fail_t ni_fail_type);
+extern ptl_err_t lib_parse (lib_nal_t *nal, ptl_hdr_t *hdr, void *private);
+extern lib_msg_t *lib_create_reply_msg (lib_nal_t *nal, ptl_nid_t peer_nid, 
+                                        lib_msg_t *get_msg);
+extern void print_hdr (lib_nal_t * nal, ptl_hdr_t * hdr);
 
 
 extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov);
@@ -394,14 +396,65 @@ extern int lib_extract_kiov (int dst_niov, ptl_kiov_t *dst,
 
 extern void lib_assert_wire_constants (void);
 
-extern ptl_err_t lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
+extern ptl_err_t lib_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
                            ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
-extern ptl_err_t lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
+extern ptl_err_t lib_send (lib_nal_t *nal, void *private, lib_msg_t *msg,
                            ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
                            lib_md_t *md, ptl_size_t offset, ptl_size_t len);
 
-extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in,
-                               ptl_md_t * md_out);
-extern void lib_md_unlink(nal_cb_t * nal, lib_md_t * md_in);
-extern void lib_me_unlink(nal_cb_t * nal, lib_me_t * me_in);
+extern int lib_api_ni_status (nal_t *nal, ptl_sr_index_t sr_idx,
+                              ptl_sr_value_t *status);
+extern int lib_api_ni_dist (nal_t *nal, ptl_process_id_t *pid, 
+                            unsigned long *dist);
+
+extern int lib_api_eq_alloc (nal_t *nal, ptl_size_t count,
+                             ptl_eq_handler_t callback, 
+                             ptl_handle_eq_t *handle);
+extern int lib_api_eq_free(nal_t *nal, ptl_handle_eq_t *eqh);
+extern int lib_api_eq_poll (nal_t *nal, 
+                            ptl_handle_eq_t *eventqs, int neq, int timeout_ms,
+                            ptl_event_t *event, int *which);
+
+extern int lib_api_me_attach(nal_t *nal,
+                             ptl_pt_index_t portal,
+                             ptl_process_id_t match_id, 
+                             ptl_match_bits_t match_bits, 
+                             ptl_match_bits_t ignore_bits,
+                             ptl_unlink_t unlink, ptl_ins_pos_t pos,
+                             ptl_handle_me_t *handle);
+extern int lib_api_me_insert(nal_t *nal,
+                             ptl_handle_me_t *current_meh,
+                             ptl_process_id_t match_id, 
+                             ptl_match_bits_t match_bits, 
+                             ptl_match_bits_t ignore_bits,
+                             ptl_unlink_t unlink, ptl_ins_pos_t pos,
+                             ptl_handle_me_t *handle);
+extern int lib_api_me_unlink (nal_t *nal, ptl_handle_me_t *meh);
+extern void lib_me_unlink(lib_nal_t *nal, lib_me_t *me);
+
+extern int lib_api_get_id(nal_t *nal, ptl_process_id_t *pid);
+
+extern void lib_md_unlink(lib_nal_t *nal, lib_md_t *md);
+extern void lib_md_deconstruct(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd);
+extern int lib_api_md_attach(nal_t *nal, ptl_handle_me_t *meh,
+                             ptl_md_t *umd, ptl_unlink_t unlink, 
+                             ptl_handle_md_t *handle);
+extern int lib_api_md_bind(nal_t *nal, ptl_md_t *umd, ptl_unlink_t unlink,
+                           ptl_handle_md_t *handle);
+extern int lib_api_md_unlink (nal_t *nal, ptl_handle_md_t *mdh);
+extern int lib_api_md_update (nal_t *nal, ptl_handle_md_t *mdh,
+                              ptl_md_t *oldumd, ptl_md_t *newumd,
+                              ptl_handle_eq_t *testqh);
+
+extern int lib_api_get(nal_t *apinal, ptl_handle_md_t *mdh, 
+                       ptl_process_id_t *id,
+                       ptl_pt_index_t portal, ptl_ac_index_t ac,
+                       ptl_match_bits_t match_bits, ptl_size_t offset);
+extern int lib_api_put(nal_t *apinal, ptl_handle_md_t *mdh, 
+                       ptl_ack_req_t ack, ptl_process_id_t *id,
+                       ptl_pt_index_t portal, ptl_ac_index_t ac,
+                       ptl_match_bits_t match_bits, 
+                       ptl_size_t offset, ptl_hdr_data_t hdr_data);
+extern int lib_api_fail_nid(nal_t *apinal, ptl_nid_t nid, unsigned int threshold);
+
 #endif
index d05d3fa..cfcef2b 100644 (file)
 #ifndef _LIB_TYPES_H_
 #define _LIB_TYPES_H_
 
+#include "build_check.h"
+
 #include <portals/types.h>
+#include <portals/nal.h>
 #ifdef __KERNEL__
 # include <linux/uio.h>
 # include <linux/smp_lock.h>
@@ -20,9 +23,6 @@
 # include <sys/types.h>
 #endif
 
-/* struct nal_cb_t is defined in lib-nal.h */
-typedef struct nal_cb_t nal_cb_t;
-
 typedef char *user_ptr;
 typedef struct lib_msg_t lib_msg_t;
 typedef struct lib_ptl_t lib_ptl_t;
@@ -124,8 +124,8 @@ typedef struct {
 
 #define PORTALS_PROTO_MAGIC                0xeebc0ded
 
-#define PORTALS_PROTO_VERSION_MAJOR        0
-#define PORTALS_PROTO_VERSION_MINOR        3
+#define PORTALS_PROTO_VERSION_MAJOR        1
+#define PORTALS_PROTO_VERSION_MINOR        0
 
 typedef struct {
         long recv_count, recv_length, send_count, send_length, drop_count,
@@ -133,8 +133,8 @@ typedef struct {
 } lib_counters_t;
 
 /* temporary expedient: limit number of entries in discontiguous MDs */
-#define PTL_MTU        (512<<10)
-#define PTL_MD_MAX_IOV 128
+#define PTL_MTU        (1<<20)
+#define PTL_MD_MAX_IOV 256
 
 struct lib_msg_t {
         struct list_head  msg_list;
@@ -163,11 +163,12 @@ typedef struct {
 struct lib_eq_t {
         struct list_head  eq_list;
         lib_handle_t      eq_lh;
-        ptl_seq_t         sequence;
-        ptl_size_t        size;
-        ptl_event_t      *base;
+        ptl_seq_t         eq_enq_seq;
+        ptl_seq_t         eq_deq_seq;
+        ptl_size_t        eq_size;
+        ptl_event_t      *eq_events;
         int               eq_refcount;
-        int (*event_callback) (ptl_event_t * event);
+        ptl_eq_handler_t  eq_callback;
         void             *eq_addrkey;
 };
 
@@ -190,7 +191,6 @@ struct lib_md_t {
         ptl_size_t        max_size;
         int               threshold;
         int               pending;
-        ptl_unlink_t      unlink;
         unsigned int      options;
         unsigned int      md_flags;
         void             *user_ptr;
@@ -203,7 +203,15 @@ struct lib_md_t {
         } md_iov;
 };
 
-#define PTL_MD_FLAG_UNLINK            (1 << 0)
+#define PTL_MD_FLAG_ZOMBIE            (1 << 0)
+#define PTL_MD_FLAG_AUTO_UNLINK       (1 << 1)
+
+static inline int lib_md_exhausted (lib_md_t *md) 
+{
+        return (md->threshold == 0 ||
+                ((md->options & PTL_MD_MAX_SIZE) != 0 &&
+                 md->offset + md->max_size > md->length));
+}
 
 #ifdef PTL_USE_LIB_FREELIST
 typedef struct
@@ -235,33 +243,117 @@ typedef struct {
 /* PTL_COOKIE_TYPES must be a power of 2, so the cookie type can be
  * extracted by masking with (PTL_COOKIE_TYPES - 1) */
 
-typedef struct {
-        int up;
-        int refcnt;
-        ptl_nid_t nid;
-        ptl_pid_t pid;
-        int num_nodes;
-        unsigned int debug;
-        lib_ptl_t tbl;
-        lib_ac_t ac;
-        lib_counters_t counters;
+typedef struct lib_ni 
+{
+        nal_t            *ni_api;
+        ptl_process_id_t  ni_pid;
+        lib_ptl_t         ni_portals;
+        lib_counters_t    ni_counters;
+        ptl_ni_limits_t   ni_actual_limits;
 
         int               ni_lh_hash_size;      /* size of lib handle hash table */
         struct list_head *ni_lh_hash_table;     /* all extant lib handles, this interface */
         __u64             ni_next_object_cookie; /* cookie generator */
         __u64             ni_interface_cookie;  /* uniquely identifies this ni in this epoch */
         
-        struct list_head ni_test_peers;
+        struct list_head  ni_test_peers;
         
 #ifdef PTL_USE_LIB_FREELIST
-        lib_freelist_t   ni_free_mes;
-        lib_freelist_t   ni_free_msgs;
-        lib_freelist_t   ni_free_mds;
-        lib_freelist_t   ni_free_eqs;
+        lib_freelist_t    ni_free_mes;
+        lib_freelist_t    ni_free_msgs;
+        lib_freelist_t    ni_free_mds;
+        lib_freelist_t    ni_free_eqs;
+#endif
+
+        struct list_head  ni_active_msgs;
+        struct list_head  ni_active_mds;
+        struct list_head  ni_active_eqs;
+
+#ifdef __KERNEL__
+        spinlock_t        ni_lock;
+        wait_queue_head_t ni_waitq;
+#else
+        pthread_mutex_t   ni_mutex;
+        pthread_cond_t    ni_cond;
 #endif
-        struct list_head ni_active_msgs;
-        struct list_head ni_active_mds;
-        struct list_head ni_active_eqs;
 } lib_ni_t;
 
+
+typedef struct lib_nal
+{
+       /* lib-level interface state */
+       lib_ni_t libnal_ni;
+
+       /* NAL-private data */
+       void *libnal_data;
+
+       /*
+        * send: Sends a preformatted header and payload data to a
+        * specified remote process. The payload is scattered over 'niov'
+        * fragments described by iov, starting at 'offset' for 'mlen'
+        * bytes.  
+        * NB the NAL may NOT overwrite iov.  
+        * PTL_OK on success => NAL has committed to send and will call
+        * lib_finalize on completion
+        */
+       ptl_err_t (*libnal_send) 
+                (struct lib_nal *nal, void *private, lib_msg_t *cookie, 
+                 ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
+                 unsigned int niov, struct iovec *iov, 
+                 size_t offset, size_t mlen);
+        
+       /* as send, but with a set of page fragments (NULL if not supported) */
+       ptl_err_t (*libnal_send_pages)
+                (struct lib_nal *nal, void *private, lib_msg_t * cookie, 
+                 ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
+                 unsigned int niov, ptl_kiov_t *iov, 
+                 size_t offset, size_t mlen);
+       /*
+        * recv: Receives an incoming message from a remote process.  The
+        * payload is to be received into the scattered buffer of 'niov'
+        * fragments described by iov, starting at 'offset' for 'mlen'
+        * bytes.  Payload bytes after 'mlen' up to 'rlen' are to be
+        * discarded.  
+        * NB the NAL may NOT overwrite iov.
+        * PTL_OK on success => NAL has committed to receive and will call
+        * lib_finalize on completion
+        */
+       ptl_err_t (*libnal_recv) 
+                (struct lib_nal *nal, void *private, lib_msg_t * cookie,
+                 unsigned int niov, struct iovec *iov, 
+                 size_t offset, size_t mlen, size_t rlen);
+
+       /* as recv, but with a set of page fragments (NULL if not supported) */
+       ptl_err_t (*libnal_recv_pages) 
+                (struct lib_nal *nal, void *private, lib_msg_t * cookie,
+                 unsigned int niov, ptl_kiov_t *iov, 
+                 size_t offset, size_t mlen, size_t rlen);
+
+       /*
+        * (un)map: Tell the NAL about some memory it will access.
+        * *addrkey passed to libnal_unmap() is what libnal_map() set it to.
+        * type of *iov depends on options.
+        * Set to NULL if not required.
+        */
+       ptl_err_t (*libnal_map)
+                (struct lib_nal *nal, unsigned int niov, struct iovec *iov, 
+                 void **addrkey);
+       void (*libnal_unmap)
+                (struct lib_nal *nal, unsigned int niov, struct iovec *iov, 
+                 void **addrkey);
+
+       /* as (un)map, but with a set of page fragments */
+       ptl_err_t (*libnal_map_pages)
+                (struct lib_nal *nal, unsigned int niov, ptl_kiov_t *iov, 
+                 void **addrkey);
+       void (*libnal_unmap_pages)
+                (struct lib_nal *nal, unsigned int niov, ptl_kiov_t *iov, 
+                 void **addrkey);
+
+       void (*libnal_printf)(struct lib_nal *nal, const char *fmt, ...);
+
+       /* Calculate a network "distance" to given node */
+       int (*libnal_dist) (struct lib_nal *nal, ptl_nid_t nid, unsigned long *dist);
+} lib_nal_t;
+
 #endif
index 7cb3ab7..bf86569 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef _NAL_H_
 #define _NAL_H_
 
+#include "build_check.h"
+
 /*
  * p30/nal.h
  *
 
 #include <portals/types.h>
 
-#ifdef yield
-#undef yield
-#endif
-
 typedef struct nal_t nal_t;
 
 struct nal_t {
-       ptl_ni_t ni;
-       int refct;
-       void *nal_data;
-       int *timeout;           /* for libp30api users */
-       int (*forward) (nal_t * nal, int index, /* Function ID */
-                       void *args, size_t arg_len, void *ret, size_t ret_len);
+       /* common interface state */
+       int              nal_refct;
+        ptl_handle_ni_t  nal_handle;
 
-       int (*shutdown) (nal_t * nal, int interface);
+       /* NAL-private data */
+       void            *nal_data;
 
-       int (*validate) (nal_t * nal, void *base, size_t extent);
+       /* NAL API implementation 
+        * NB only nal_ni_init needs to be set when the NAL registers itself */
+       int (*nal_ni_init) (nal_t *nal, ptl_pid_t requested_pid,
+                           ptl_ni_limits_t *req, ptl_ni_limits_t *actual);
+       
+       void (*nal_ni_fini) (nal_t *nal);
 
-       void (*yield) (nal_t * nal);
+       int (*nal_get_id) (nal_t *nal, ptl_process_id_t *id);
+       int (*nal_ni_status) (nal_t *nal, ptl_sr_index_t register, ptl_sr_value_t *status);
+       int (*nal_ni_dist) (nal_t *nal, ptl_process_id_t *id, unsigned long *distance);
+       int (*nal_fail_nid) (nal_t *nal, ptl_nid_t nid, unsigned int threshold);
 
-       void (*lock) (nal_t * nal, unsigned long *flags);
+       int (*nal_me_attach) (nal_t *nal, ptl_pt_index_t portal,
+                             ptl_process_id_t match_id, 
+                             ptl_match_bits_t match_bits, ptl_match_bits_t ignore_bits,
+                             ptl_unlink_t unlink, ptl_ins_pos_t pos, 
+                             ptl_handle_me_t *handle);
+       int (*nal_me_insert) (nal_t *nal, ptl_handle_me_t *me,
+                             ptl_process_id_t match_id, 
+                             ptl_match_bits_t match_bits, ptl_match_bits_t ignore_bits,
+                             ptl_unlink_t unlink, ptl_ins_pos_t pos, 
+                             ptl_handle_me_t *handle);
+       int (*nal_me_unlink) (nal_t *nal, ptl_handle_me_t *me);
+       
+       int (*nal_md_attach) (nal_t *nal, ptl_handle_me_t *me,
+                             ptl_md_t *md, ptl_unlink_t unlink, 
+                             ptl_handle_md_t *handle);
+       int (*nal_md_bind) (nal_t *nal, 
+                           ptl_md_t *md, ptl_unlink_t unlink, 
+                           ptl_handle_md_t *handle);
+       int (*nal_md_unlink) (nal_t *nal, ptl_handle_md_t *md);
+       int (*nal_md_update) (nal_t *nal, ptl_handle_md_t *md,
+                             ptl_md_t *old_md, ptl_md_t *new_md,
+                             ptl_handle_eq_t *testq);
 
-       void (*unlock) (nal_t * nal, unsigned long *flags);
-};
+       int (*nal_eq_alloc) (nal_t *nal, ptl_size_t count,
+                            ptl_eq_handler_t handler,
+                            ptl_handle_eq_t *handle);
+       int (*nal_eq_free) (nal_t *nal, ptl_handle_eq_t *eq);
+       int (*nal_eq_poll) (nal_t *nal, 
+                           ptl_handle_eq_t *eqs, int neqs, int timeout,
+                           ptl_event_t *event, int *which);
 
-typedef nal_t *(ptl_interface_t) (int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid);
-extern nal_t *PTL_IFACE_IP(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid);
-extern nal_t *PTL_IFACE_MYR(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid);
+       int (*nal_ace_entry) (nal_t *nal, ptl_ac_index_t index,
+                             ptl_process_id_t match_id, ptl_pt_index_t portal);
+       
+       int (*nal_put) (nal_t *nal, ptl_handle_md_t *md, ptl_ack_req_t ack,
+                       ptl_process_id_t *target, ptl_pt_index_t portal,
+                       ptl_ac_index_t ac, ptl_match_bits_t match,
+                       ptl_size_t offset, ptl_hdr_data_t hdr_data);
+       int (*nal_get) (nal_t *nal, ptl_handle_md_t *md,
+                       ptl_process_id_t *target, ptl_pt_index_t portal,
+                       ptl_ac_index_t ac, ptl_match_bits_t match,
+                       ptl_size_t offset);
+};
 
-extern nal_t *ptl_hndl2nal(ptl_handle_any_t * any);
+extern nal_t *ptl_hndl2nal(ptl_handle_any_t *any);
 
-#ifndef PTL_IFACE_DEFAULT
-#define PTL_IFACE_DEFAULT (PTL_IFACE_IP)
+#ifdef __KERNEL__
+extern int ptl_register_nal(ptl_interface_t interface, nal_t *nal);
+extern void ptl_unregister_nal(ptl_interface_t interface);
 #endif
 
 #endif
index 1b837b4..55a991b 100644 (file)
@@ -1,4 +1,2 @@
-#define PTL_IFACE_TCP 1
-#define PTL_IFACE_ER 2
-#define PTL_IFACE_SS 3
-#define PTL_IFACE_MAX 4
+#include "build_check.h"
+
index 8b1495e..4b8631d 100644 (file)
@@ -4,6 +4,8 @@
 #ifndef _P30_H_
 #define _P30_H_
 
+#include "build_check.h"
+
 /*
  * p30.h
  *
 #endif
 
 #include <portals/types.h>
-#include <portals/nal.h>
 #include <portals/api.h>
-#include <portals/nalids.h>
-
-extern int __p30_initialized;  /* for libraries & test codes  */
-extern int __p30_myr_initialized;      /*   that don't know if p30    */
-extern int __p30_ip_initialized;       /*   had been initialized yet  */
-extern ptl_handle_ni_t __myr_ni_handle, __ip_ni_handle;
-
-extern int __p30_myr_timeout;  /* in seconds, for PtlNIBarrier,     */
-extern int __p30_ip_timeout;   /* PtlReduce_all, & PtlBroadcast_all */
-
-/*
- * Debugging flags reserved for the Portals reference library.
- * These are not part of the API as described in the SAND report
- * but are for the use of the maintainers of the reference implementation.
- *
- * It is not expected that the real implementations will export
- * this functionality.
- */
-#define PTL_DEBUG_NONE          0ul
-#define PTL_DEBUG_ALL           (0x0FFFul)     /* Only the Portals flags */
-
-#define __bit(x)                ((unsigned long) 1<<(x))
-#define PTL_DEBUG_PUT           __bit(0)
-#define PTL_DEBUG_GET           __bit(1)
-#define PTL_DEBUG_REPLY         __bit(2)
-#define PTL_DEBUG_ACK           __bit(3)
-#define PTL_DEBUG_DROP          __bit(4)
-#define PTL_DEBUG_REQUEST       __bit(5)
-#define PTL_DEBUG_DELIVERY      __bit(6)
-#define PTL_DEBUG_UNLINK        __bit(7)
-#define PTL_DEBUG_THRESHOLD     __bit(8)
-#define PTL_DEBUG_API           __bit(9)
-
-/*
- * These eight are reserved for the NAL to define
- * It should probably give them better names...
- */
-#define PTL_DEBUG_NI_ALL        (0xF000ul)     /* Only the NAL flags */
-#define PTL_DEBUG_NI0           __bit(24)
-#define PTL_DEBUG_NI1           __bit(25)
-#define PTL_DEBUG_NI2           __bit(26)
-#define PTL_DEBUG_NI3           __bit(27)
-#define PTL_DEBUG_NI4           __bit(28)
-#define PTL_DEBUG_NI5           __bit(29)
-#define PTL_DEBUG_NI6           __bit(30)
-#define PTL_DEBUG_NI7           __bit(31)
 
 #endif
diff --git a/lustre/portals/include/portals/ppid.h b/lustre/portals/include/portals/ppid.h
deleted file mode 100644 (file)
index 760f465..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-#ifndef _INCppidh_
-#define _INCppidh_
-
-#include "defines.h"
-// #include "idtypes.h"
-
-
-#define MAX_PPID         1000    /* this needs to fit into 16 bits so the 
-                                    maximum value is 65535. having it "large"
-                                    can help w/ debugging process accounting
-                                    but there are reasons for making it 
-                                    somewhat smaller than the maximum --
-                                    requiring storage for arrays that index 
-                                    on the ppid, eg...  */
-                                 
-#define MAX_GID          1000    /* this needs to fit into 16 bits... */
-
-#define MAX_FIXED_PPID   100
-#define MAX_FIXED_GID    100
-#define PPID_FLOATING    MAX_FIXED_PPID+1   /* Floating area starts here */
-#define GID_FLOATING     MAX_FIXED_GID+1    /* Floating area starts here */
-#define NUM_PTL_TASKS    MAX_FIXED_PPID+80  /* Maximum no. portals tasks */
-
-#define PPID_AUTO        0
-
-/* Minimum PPID is 1 */
-#define PPID_BEBOPD      1            /* bebopd */
-#define  GID_BEBOPD      1            /* bebopd */
-
-#define PPID_PCT         2            /* pct */
-#define  GID_PCT         2            /* pct */
-
-#define PPID_FYOD        3            /* fyod */
-#define  GID_FYOD        3            /* fyod */
-
-#define PPID_GDBWRAP     11           /* portals proxy for gdb */
-#define  GID_GDBWRAP     11           /* portals proxy for gdb */
-
-#define PPID_TEST        15           /* for portals tests */
-#define  GID_TEST        15
-
-#define  GID_YOD         5            /* yod */
-#define  GID_PINGD       6            /* pingd */
-#define  GID_BT          7            /* bt */
-#define  GID_PTLTEST     8            /* ptltest */
-#define  GID_CGDB        9            /* cgdb */
-#define  GID_TVDSVR     10            /* start-tvdsvr */
-
-#endif /* _INCppidh_ */
index 12ef47a..cfddde2 100644 (file)
 #ifndef _PTLCTL_H_
 #define _PTLCTL_H_
 
+#include <portals/types.h>
+#include <linux/kp30.h>
+#include <linux/libcfs.h>
+
 #define PORTALS_DEV_ID 0
 #define PORTALS_DEV_PATH "/dev/portals"
 #define OBD_DEV_ID 1
@@ -35,9 +39,12 @@ char * ptl_nid2str (char *buffer, ptl_nid_t nid);
 
 int ptl_initialize(int argc, char **argv);
 int jt_ptl_network(int argc, char **argv);
-int jt_ptl_print_autoconnects (int argc, char **argv);
-int jt_ptl_add_autoconnect (int argc, char **argv);
-int jt_ptl_del_autoconnect (int argc, char **argv);
+int jt_ptl_print_interfaces(int argc, char **argv);
+int jt_ptl_add_interface(int argc, char **argv);
+int jt_ptl_del_interface(int argc, char **argv);
+int jt_ptl_print_peers (int argc, char **argv);
+int jt_ptl_add_peer (int argc, char **argv);
+int jt_ptl_del_peer (int argc, char **argv);
 int jt_ptl_print_connections (int argc, char **argv);
 int jt_ptl_connect(int argc, char **argv);
 int jt_ptl_disconnect(int argc, char **argv);
@@ -50,9 +57,6 @@ int jt_ptl_add_uuid(int argc, char **argv);
 int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility  */
 int jt_ptl_close_uuid(int argc, char **argv);
 int jt_ptl_del_uuid(int argc, char **argv);
-int jt_ptl_rxmem (int argc, char **argv);
-int jt_ptl_txmem (int argc, char **argv);
-int jt_ptl_nagle (int argc, char **argv);
 int jt_ptl_add_route (int argc, char **argv);
 int jt_ptl_del_route (int argc, char **argv);
 int jt_ptl_notify_router (int argc, char **argv);
@@ -76,13 +80,15 @@ int jt_dbg_panic(int argc, char **argv);
 int ptl_set_cfg_record_cb(cfg_record_cb_t cb);
 
 /* l_ioctl.c */
-typedef int (ioc_handler_t)(int dev_id, int opc, void *buf);
+typedef int (ioc_handler_t)(int dev_id, unsigned int opc, void *buf);
 void set_ioc_handler(ioc_handler_t *handler);
 int register_ioc_dev(int dev_id, const char * dev_name);
 void unregister_ioc_dev(int dev_id);
 int set_ioctl_dump(char * file);
-int l_ioctl(int dev_id, int opc, void *buf);
-int parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *));
+int l_ioctl(int dev_id, unsigned int opc, void *buf);
+int parse_dump(char * dump_file, ioc_handler_t ioc_func);
 int jt_ioc_dump(int argc, char **argv);
+extern char *dump_filename;
+int dump(int dev_id, unsigned int opc, void *buf);
 
 #endif
index 80995e9..0bada40 100644 (file)
@@ -1,19 +1,18 @@
 #ifndef _P30_TYPES_H_
 #define _P30_TYPES_H_
 
-#include <asm/types.h>
-
-#ifdef __KERNEL__
-# include <linux/time.h>
-# include <asm/timex.h>
-#else
-# include <sys/time.h>
-# define do_gettimeofday(tv) gettimeofday(tv, NULL);
-typedef unsigned long long cycles_t;
-#endif
+#include "build_check.h"
 
+#include <linux/libcfs.h>
 #include <portals/errno.h>
 
+/* This implementation uses the same type for API function return codes and
+ * the completion status in an event  */
+#define PTL_NI_OK  PTL_OK
+typedef ptl_err_t ptl_ni_fail_t;
+
+typedef __u32 ptl_uid_t;
+typedef __u32 ptl_jid_t;
 typedef __u64 ptl_nid_t;
 typedef __u32 ptl_pid_t;
 typedef __u32 ptl_pt_index_t;
@@ -22,6 +21,8 @@ typedef __u64 ptl_match_bits_t;
 typedef __u64 ptl_hdr_data_t;
 typedef __u32 ptl_size_t;
 
+#define PTL_TIME_FOREVER    (-1)
+
 typedef struct {
         unsigned long nal_idx;                 /* which network interface */
         __u64         cookie;                  /* which thing on that interface */
@@ -32,15 +33,17 @@ typedef ptl_handle_any_t ptl_handle_eq_t;
 typedef ptl_handle_any_t ptl_handle_md_t;
 typedef ptl_handle_any_t ptl_handle_me_t;
 
-#define PTL_HANDLE_NONE \
+#define PTL_INVALID_HANDLE \
     ((const ptl_handle_any_t){.nal_idx = -1, .cookie = -1})
-#define PTL_EQ_NONE PTL_HANDLE_NONE
+#define PTL_EQ_NONE PTL_INVALID_HANDLE
 
-static inline int PtlHandleEqual (ptl_handle_any_t h1, ptl_handle_any_t h2)
+static inline int PtlHandleIsEqual (ptl_handle_any_t h1, ptl_handle_any_t h2)
 {
        return (h1.nal_idx == h2.nal_idx && h1.cookie == h2.cookie);
 }
 
+#define PTL_UID_ANY      ((ptl_uid_t) -1)
+#define PTL_JID_ANY      ((ptl_jid_t) -1)
 #define PTL_NID_ANY      ((ptl_nid_t) -1)
 #define PTL_PID_ANY      ((ptl_pid_t) -1)
 
@@ -60,41 +63,58 @@ typedef enum {
 } ptl_ins_pos_t;
 
 typedef struct {
-       struct page     *kiov_page;
-       unsigned int     kiov_len;
-       unsigned int     kiov_offset;
-} ptl_kiov_t;
-
-typedef struct {
         void            *start;
         ptl_size_t       length;
         int              threshold;
         int              max_size;
         unsigned int     options;
         void            *user_ptr;
-        ptl_handle_eq_t  eventq;
-       unsigned int     niov;
+        ptl_handle_eq_t  eq_handle;
 } ptl_md_t;
 
 /* Options for the MD structure */
-#define PTL_MD_OP_PUT           (1 << 0)
-#define PTL_MD_OP_GET           (1 << 1)
-#define PTL_MD_MANAGE_REMOTE    (1 << 2)
-#define PTL_MD_AUTO_UNLINK      (1 << 3)
-#define PTL_MD_TRUNCATE         (1 << 4)
-#define PTL_MD_ACK_DISABLE      (1 << 5)
-#define PTL_MD_IOV             (1 << 6)
-#define PTL_MD_MAX_SIZE                (1 << 7)
-#define PTL_MD_KIOV             (1 << 8)
+#define PTL_MD_OP_PUT               (1 << 0)
+#define PTL_MD_OP_GET               (1 << 1)
+#define PTL_MD_MANAGE_REMOTE        (1 << 2)
+/* unused                           (1 << 3) */
+#define PTL_MD_TRUNCATE             (1 << 4)
+#define PTL_MD_ACK_DISABLE          (1 << 5)
+#define PTL_MD_IOVEC               (1 << 6)
+#define PTL_MD_MAX_SIZE                    (1 << 7)
+#define PTL_MD_KIOV                 (1 << 8)
+#define PTL_MD_EVENT_START_DISABLE  (1 << 9)
+#define PTL_MD_EVENT_END_DISABLE    (1 << 10)
+
+/* For compatibility with Cray Portals */
+#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS  0
+#define PTL_MD_PHYS                         0
 
 #define PTL_MD_THRESH_INF       (-1)
 
+/* NB lustre portals uses struct iovec internally! */
+typedef struct iovec ptl_md_iovec_t;
+
+typedef struct {
+       struct page     *kiov_page;
+       unsigned int     kiov_len;
+       unsigned int     kiov_offset;
+} ptl_kiov_t;
+
 typedef enum {
-        PTL_EVENT_GET,
-        PTL_EVENT_PUT,
-        PTL_EVENT_REPLY,
+        PTL_EVENT_GET_START,
+        PTL_EVENT_GET_END,
+
+        PTL_EVENT_PUT_START,
+        PTL_EVENT_PUT_END,
+
+        PTL_EVENT_REPLY_START,
+        PTL_EVENT_REPLY_END,
+
         PTL_EVENT_ACK,
-        PTL_EVENT_SENT,
+
+        PTL_EVENT_SEND_START,
+       PTL_EVENT_SEND_END,
+
        PTL_EVENT_UNLINK,
 } ptl_event_kind_t;
 
@@ -111,17 +131,21 @@ typedef unsigned PTL_SEQ_BASETYPE ptl_seq_t;
 #endif
 typedef struct {
         ptl_event_kind_t   type;
-       ptl_err_t          status;
-       int                unlinked;
         ptl_process_id_t   initiator;
-        ptl_pt_index_t     portal;
+        ptl_uid_t          uid;
+        ptl_jid_t          jid;
+        ptl_pt_index_t     pt_index;
         ptl_match_bits_t   match_bits;
         ptl_size_t         rlength;
-       ptl_size_t         mlength;
-       ptl_size_t         offset;
-        ptl_md_t           mem_desc;
+        ptl_size_t         mlength;
+        ptl_size_t         offset;
+        ptl_handle_md_t    md_handle;
+        ptl_md_t           md;
         ptl_hdr_data_t     hdr_data;
-        struct timeval     arrival_time;
+        ptl_seq_t          link;
+        ptl_ni_fail_t      ni_fail_type;
+
+        int                unlinked;
 
         volatile ptl_seq_t sequence;
 } ptl_event_t;
@@ -134,23 +158,18 @@ typedef enum {
         PTL_NOACK_REQ
 } ptl_ack_req_t;
 
-typedef struct {
-        volatile ptl_seq_t sequence;
-        ptl_size_t size;
-        ptl_event_t *base;
-        ptl_handle_any_t cb_eq_handle;
-} ptl_eq_t;
-
-typedef struct {
-        ptl_eq_t *eq;
-} ptl_ni_t;
+typedef void (*ptl_eq_handler_t)(ptl_event_t *event);
+#define PTL_EQ_HANDLER_NONE NULL
 
 typedef struct {
-        int max_match_entries;    /* max number of match entries */
-        int max_mem_descriptors;  /* max number of memory descriptors */
-        int max_event_queues;     /* max number of event queues */
-        int max_atable_index;     /* maximum access control list table index */
-        int max_ptable_index;     /* maximum portals table index */
+       int max_mes;
+       int max_mds;
+       int max_eqs;
+       int max_ac_index;
+       int max_pt_index;
+       int max_md_iovecs;
+       int max_me_list;
+       int max_getput_md;
 } ptl_ni_limits_t;
 
 /*
@@ -168,4 +187,7 @@ typedef enum {
 
 typedef int ptl_sr_value_t;
 
+typedef int ptl_interface_t;
+#define PTL_IFACE_DEFAULT    (-1)
+
 #endif
index b5ed168..9763d14 100644 (file)
@@ -1,5 +1,6 @@
 @BUILD_GMNAL_TRUE@subdir-m += gmnal
-@BUILD_IBNAL_TRUE@subdir-m += ibnal
+@BUILD_OPENIBNAL_TRUE@subdir-m += openibnal
+@BUILD_IIBNAL_TRUE@subdir-m += iibnal
 @BUILD_QSWNAL_TRUE@subdir-m += qswnal
 subdir-m += socknal
 
index 9d04a46..0090364 100644 (file)
@@ -3,4 +3,4 @@
 # This code is issued under the GNU General Public License.
 # See the file COPYING in this distribution
 
-SUBDIRS = gmnal ibnal qswnal socknal
+SUBDIRS = gmnal iibnal openibnal qswnal socknal 
index ad46b90..9c4425b 100644 (file)
 #include "linux/kp30.h"
 #include "portals/p30.h"
 
-#include "portals/lib-nal.h"
+#include "portals/nal.h"
 #include "portals/lib-p30.h"
 
 #define GM_STRONG_TYPES 1
+#ifdef VERSION
+#undef VERSION
+#endif
 #include "gm.h"
 #include "gm_internal.h"
 
@@ -190,8 +193,6 @@ typedef struct _gmnal_rxtwe {
 #define NRXTHREADS 10 /* max number of receiver threads */
 
 typedef struct _gmnal_data_t {
-       int             refcnt;
-       spinlock_t      cb_lock;
        spinlock_t      stxd_lock;
        struct semaphore stxd_token;
        gmnal_stxd_t    *stxd;
@@ -206,7 +207,7 @@ typedef struct _gmnal_data_t {
        gmnal_srxd_t    *srxd;
        struct gm_hash  *srxd_hash;
        nal_t           *nal;   
-       nal_cb_t        *nal_cb;
+       lib_nal_t       *libnal;
        struct gm_port  *gm_port;
        unsigned int    gm_local_nid;
        unsigned int    gm_global_nid;
@@ -299,7 +300,6 @@ extern gmnal_data_t *global_nal_data;
 #define GMNAL_GM_LOCK_INIT(a)          spin_lock_init(&a->gm_lock);
 #define GMNAL_GM_LOCK(a)               spin_lock(&a->gm_lock);
 #define GMNAL_GM_UNLOCK(a)             spin_unlock(&a->gm_lock);
-#define GMNAL_CB_LOCK_INIT(a)          spin_lock_init(&a->cb_lock);
 
 
 /*
@@ -309,13 +309,16 @@ extern gmnal_data_t       *global_nal_data;
 /*
  *     API NAL
  */
+int gmnal_api_startup(nal_t *, ptl_pid_t, 
+                      ptl_ni_limits_t *, ptl_ni_limits_t *);
+
 int gmnal_api_forward(nal_t *, int, void *, size_t, void *, size_t);
 
-int gmnal_api_shutdown(nal_t *, int);
+void gmnal_api_shutdown(nal_t *);
 
 int gmnal_api_validate(nal_t *, void *, size_t);
 
-void gmnal_api_yield(nal_t *);
+void gmnal_api_yield(nal_t *, unsigned long *, int);
 
 void gmnal_api_lock(nal_t *, unsigned long *);
 
@@ -323,15 +326,9 @@ void gmnal_api_unlock(nal_t *, unsigned long *);
 
 
 #define GMNAL_INIT_NAL(a)      do {    \
-                               a->forward = gmnal_api_forward; \
-                               a->shutdown = gmnal_api_shutdown; \
-                               a->validate = NULL; \
-                               a->yield = gmnal_api_yield; \
-                               a->lock = gmnal_api_lock; \
-                               a->unlock = gmnal_api_unlock; \
-                               a->timeout = NULL; \
-                               a->refct = 1; \
-                               a->nal_data = NULL; \
+                                (a)->nal_ni_init = gmnal_api_startup; \
+                               (a)->nal_ni_fini = gmnal_api_shutdown; \
+                               (a)->nal_data = NULL; \
                                } while (0)
 
 
@@ -339,63 +336,35 @@ void gmnal_api_unlock(nal_t *, unsigned long *);
  *     CB NAL
  */
 
-int gmnal_cb_send(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *,
-       int, ptl_nid_t, ptl_pid_t, unsigned int, struct iovec *, size_t);
-
-int gmnal_cb_send_pages(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *,
-       int, ptl_nid_t, ptl_pid_t, unsigned int, ptl_kiov_t *, size_t);
-
-int gmnal_cb_recv(nal_cb_t *, void *, lib_msg_t *, 
-       unsigned int, struct iovec *, size_t, size_t);
-
-int gmnal_cb_recv_pages(nal_cb_t *, void *, lib_msg_t *, 
-       unsigned int, ptl_kiov_t *, size_t, size_t);
-
-int gmnal_cb_read(nal_cb_t *, void *private, void *, user_ptr, size_t);
-
-int gmnal_cb_write(nal_cb_t *, void *private, user_ptr, void *, size_t);
-
-int gmnal_cb_callback(nal_cb_t *, void *, lib_eq_t *, ptl_event_t *);
-
-void *gmnal_cb_malloc(nal_cb_t *, size_t);
-
-void gmnal_cb_free(nal_cb_t *, void *, size_t);
-
-void gmnal_cb_unmap(nal_cb_t *, unsigned int, struct iovec*, void **);
-
-int  gmnal_cb_map(nal_cb_t *, unsigned int, struct iovec*, void **); 
+ptl_err_t gmnal_cb_send(lib_nal_t *, void *, lib_msg_t *, ptl_hdr_t *,
+       int, ptl_nid_t, ptl_pid_t, unsigned int, struct iovec *, size_t, size_t);
 
-void gmnal_cb_printf(nal_cb_t *, const char *fmt, ...);
+ptl_err_t gmnal_cb_send_pages(lib_nal_t *, void *, lib_msg_t *, ptl_hdr_t *,
+       int, ptl_nid_t, ptl_pid_t, unsigned int, ptl_kiov_t *, size_t, size_t);
 
-void gmnal_cb_cli(nal_cb_t *, unsigned long *);
+ptl_err_t gmnal_cb_recv(lib_nal_t *, void *, lib_msg_t *, 
+       unsigned int, struct iovec *, size_t, size_t, size_t);
 
-void gmnal_cb_sti(nal_cb_t *, unsigned long *);
+ptl_err_t gmnal_cb_recv_pages(lib_nal_t *, void *, lib_msg_t *, 
+       unsigned int, ptl_kiov_t *, size_t, size_t, size_t);
 
-int gmnal_cb_dist(nal_cb_t *, ptl_nid_t, unsigned long *);
+int gmnal_cb_dist(lib_nal_t *, ptl_nid_t, unsigned long *);
 
-nal_t *gmnal_init(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t rpid);
+int gmnal_init(void);
 
 void  gmnal_fini(void);
 
 
 
 #define GMNAL_INIT_NAL_CB(a)   do {    \
-                               a->cb_send = gmnal_cb_send; \
-                               a->cb_send_pages = gmnal_cb_send_pages; \
-                               a->cb_recv = gmnal_cb_recv; \
-                               a->cb_recv_pages = gmnal_cb_recv_pages; \
-                               a->cb_read = gmnal_cb_read; \
-                               a->cb_write = gmnal_cb_write; \
-                               a->cb_callback = gmnal_cb_callback; \
-                               a->cb_malloc = gmnal_cb_malloc; \
-                               a->cb_free = gmnal_cb_free; \
-                               a->cb_map = NULL; \
-                               a->cb_unmap = NULL; \
-                               a->cb_printf = gmnal_cb_printf; \
-                               a->cb_cli = gmnal_cb_cli; \
-                               a->cb_sti = gmnal_cb_sti; \
-                               a->cb_dist = gmnal_cb_dist; \
-                               a->nal_data = NULL; \
+                               a->libnal_send = gmnal_cb_send; \
+                               a->libnal_send_pages = gmnal_cb_send_pages; \
+                               a->libnal_recv = gmnal_cb_recv; \
+                               a->libnal_recv_pages = gmnal_cb_recv_pages; \
+                               a->libnal_map = NULL; \
+                               a->libnal_unmap = NULL; \
+                               a->libnal_dist = gmnal_cb_dist; \
+                               a->libnal_data = NULL; \
                                } while (0)
 
 
@@ -450,11 +419,11 @@ void              gmnal_remove_rxtwe(gmnal_data_t *);
 /*
  *     Small messages
  */
-int            gmnal_small_rx(nal_cb_t *, void *, lib_msg_t *, unsigned int, 
-                               struct iovec *, size_t, size_t);
-int            gmnal_small_tx(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *, 
+int            gmnal_small_rx(lib_nal_t *, void *, lib_msg_t *, unsigned int, 
+                               struct iovec *, size_t, size_t, size_t);
+int            gmnal_small_tx(lib_nal_t *, void *, lib_msg_t *, ptl_hdr_t *, 
                                int, ptl_nid_t, ptl_pid_t, 
-                               unsigned int, struct iovec*, int);
+                               unsigned int, struct iovec*, size_t, int);
 void           gmnal_small_tx_callback(gm_port_t *, void *, gm_status_t);
 
 
@@ -462,12 +431,12 @@ void              gmnal_small_tx_callback(gm_port_t *, void *, gm_status_t);
 /*
  *     Large messages
  */
-int            gmnal_large_rx(nal_cb_t *, void *, lib_msg_t *, unsigned int, 
-                               struct iovec *, size_t, size_t);
+int            gmnal_large_rx(lib_nal_t *, void *, lib_msg_t *, unsigned int, 
+                               struct iovec *, size_t, size_t, size_t);
 
-int            gmnal_large_tx(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *, 
+int            gmnal_large_tx(lib_nal_t *, void *, lib_msg_t *, ptl_hdr_t *, 
                                int, ptl_nid_t, ptl_pid_t, unsigned int, 
-                               struct iovec*, int);
+                               struct iovec*, size_t, int);
 
 void           gmnal_large_tx_callback(gm_port_t *, void *, gm_status_t);
 
index 1442aa7..bd6c83e 100644 (file)
@@ -30,6 +30,9 @@
 gmnal_data_t   *global_nal_data = NULL;
 #define         GLOBAL_NID_STR_LEN      16
 char            global_nid_str[GLOBAL_NID_STR_LEN] = {0};
+ptl_handle_ni_t kgmnal_ni;
+
+extern int gmnal_cmd(struct portals_cfg *pcfg, void *private);
 
 /*
  *      Write the global nid /proc/sys/gmnal/globalnid
@@ -50,224 +53,112 @@ static ctl_table gmnalnal_top_sysctl_table[] = {
         { 0 }
 };
 
-
-
-
-
-
-/*
- *     gmnal_api_forward
- *     This function takes a pack block of arguments from the NAL API
- *     module and passes them to the NAL CB module. The CB module unpacks
- *     the args and calls the appropriate function indicated by index.
- *     Typically this function is used to pass args between kernel and use
- *     space.
- *     As lgmanl exists entirely in kernel, just pass the arg block directly 
- *     to the NAL CB, buy passing the args to lib_dispatch
- *     Arguments are
- *     nal_t   nal     Our nal
- *     int     index   the api function that initiated this call 
- *     void    *args   packed block of function args
- *     size_t  arg_len length of args block
- *     void    *ret    A return value for the API NAL
- *     size_t  ret_len Size of the return value
- *     
- */
-
-int
-gmnal_api_forward(nal_t *nal, int index, void *args, size_t arg_len,
-               void *ret, size_t ret_len)
-{
-
-       nal_cb_t        *nal_cb = NULL;
-       gmnal_data_t    *nal_data = NULL;
-
-
-
-
-
-       if (!nal || !args || (index < 0) || (arg_len < 0)) {
-                       CDEBUG(D_ERROR, "Bad args to gmnal_api_forward\n");
-               return (PTL_FAIL);
-       }
-
-       if (ret && (ret_len <= 0)) {
-               CDEBUG(D_ERROR, "Bad args to gmnal_api_forward\n");
-               return (PTL_FAIL);
-       }
-
-
-       if (!nal->nal_data) {
-               CDEBUG(D_ERROR, "bad nal, no nal data\n");      
-               return (PTL_FAIL);
-       }
-       
-       nal_data = nal->nal_data;
-       CDEBUG(D_INFO, "nal_data is [%p]\n", nal_data); 
-
-       if (!nal_data->nal_cb) {
-               CDEBUG(D_ERROR, "bad nal_data, no nal_cb\n");   
-               return (PTL_FAIL);
-       }
-       
-       nal_cb = nal_data->nal_cb;
-       CDEBUG(D_INFO, "nal_cb is [%p]\n", nal_cb);     
-       
-       CDEBUG(D_PORTALS, "gmnal_api_forward calling lib_dispatch\n");
-       lib_dispatch(nal_cb, NULL, index, args, ret);
-       CDEBUG(D_PORTALS, "gmnal_api_forward returns from lib_dispatch\n");
-
-       return(PTL_OK);
-}
-
-
 /*
  *     gmnal_api_shutdown
+ *      nal_refct == 0 => called on last matching PtlNIFini()
  *     Close down this interface and free any resources associated with it
  *     nal_t   nal     our nal to shutdown
  */
-int
-gmnal_api_shutdown(nal_t *nal, int interface)
-{
-
-       gmnal_data_t    *nal_data = nal->nal_data;
-
-       CDEBUG(D_TRACE, "gmnal_api_shutdown: nal_data [%p]\n", nal_data);
-
-       return(PTL_OK);
-}
-
-
-/*
- *     gmnal_api_validate
- *     validate a user address for use in communications
- *     There's nothing to be done here
- */
-int
-gmnal_api_validate(nal_t *nal, void *base, size_t extent)
-{
-
-       return(PTL_OK);
-}
-
-
-
-/*
- *     gmnal_api_yield
- *     Give up the processor
- */
 void
-gmnal_api_yield(nal_t *nal)
+gmnal_api_shutdown(nal_t *nal)
 {
-       CDEBUG(D_TRACE, "gmnal_api_yield : nal [%p]\n", nal);
-
-       set_current_state(TASK_INTERRUPTIBLE);
-       schedule();
-
-       return;
-}
-
-
-
-/*
- *     gmnal_api_lock
- *     Take a threadsafe lock
- */
-void
-gmnal_api_lock(nal_t *nal, unsigned long *flags)
-{
-
        gmnal_data_t    *nal_data;
-       nal_cb_t        *nal_cb;
-
-       nal_data = nal->nal_data;
-       nal_cb = nal_data->nal_cb;
+       lib_nal_t       *libnal;
 
-       nal_cb->cb_cli(nal_cb, flags);
+        if (nal->nal_refct != 0)
+                return;
+        
 
-       return;
-}
+        LASSERT(nal == global_nal_data->nal);
+        libnal = (lib_nal_t *)nal->nal_data;
+        nal_data = (gmnal_data_t *)libnal->libnal_data;
+        LASSERT(nal_data == global_nal_data);
+       CDEBUG(D_TRACE, "gmnal_api_shutdown: nal_data [%p]\n", nal_data);
 
-/*
- *     gmnal_api_unlock
- *     Release a threadsafe lock
- */
-void
-gmnal_api_unlock(nal_t *nal, unsigned long *flags)
-{
-       gmnal_data_t    *nal_data;
-       nal_cb_t        *nal_cb;
+        /* Stop portals calling our ioctl handler */
+        libcfs_nal_cmd_unregister(GMNAL);
 
-       nal_data = nal->nal_data;
-       nal_cb = nal_data->nal_cb;
+        /* XXX for shutdown "under fire" we probably need to set a shutdown
+         * flag so when lib calls us we fail immediately and dont queue any
+         * more work but our threads can still call into lib OK.  THEN
+         * shutdown our threads, THEN lib_fini() */
+        lib_fini(libnal);
 
-       nal_cb->cb_sti(nal_cb, flags);
+       gmnal_stop_rxthread(nal_data);
+       gmnal_stop_ctthread(nal_data);
+       gmnal_free_txd(nal_data);
+       gmnal_free_srxd(nal_data);
+       GMNAL_GM_LOCK(nal_data);
+       gm_close(nal_data->gm_port);
+       gm_finalize();
+       GMNAL_GM_UNLOCK(nal_data);
+        if (nal_data->sysctl)
+                unregister_sysctl_table (nal_data->sysctl);
+        /* Don't free 'nal'; it's a static struct */
+       PORTAL_FREE(nal_data, sizeof(gmnal_data_t));    
+       PORTAL_FREE(libnal, sizeof(lib_nal_t));
 
-       return;
+        global_nal_data = NULL;
+        PORTAL_MODULE_UNUSE;
 }
 
 
-nal_t *
-gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, 
-           ptl_pid_t rpid)
+int
+gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid,
+                  ptl_ni_limits_t *requested_limits,
+                  ptl_ni_limits_t *actual_limits)
 {
 
-       nal_t           *nal = NULL;
-       nal_cb_t        *nal_cb = NULL;
+       lib_nal_t       *libnal = NULL;
        gmnal_data_t    *nal_data = NULL;
        gmnal_srxd_t    *srxd = NULL;
        gm_status_t     gm_status;
        unsigned int    local_nid = 0, global_nid = 0;
-       ptl_nid_t       portals_nid;
-       ptl_pid_t       portals_pid = 0;
+        ptl_process_id_t process_id;
+
+        if (nal->nal_refct != 0) {
+                if (actual_limits != NULL) {
+                        libnal = (lib_nal_t *)nal->nal_data;
+                        *actual_limits = libnal->libnal_ni.ni_actual_limits;
+                }
+                return (PTL_OK);
+        }
 
+        /* Called on first PtlNIInit() */
 
-       CDEBUG(D_TRACE, "gmnal_init : interface [%d], ptl_size [%d], "
-              "ac_size[%d]\n", interface, ptl_size, ac_size);
+       CDEBUG(D_TRACE, "startup\n");
 
+        LASSERT(global_nal_data == NULL);
 
        PORTAL_ALLOC(nal_data, sizeof(gmnal_data_t));
        if (!nal_data) {
                CDEBUG(D_ERROR, "can't get memory\n");
-               return(NULL);
+               return(PTL_NO_SPACE);
        }       
        memset(nal_data, 0, sizeof(gmnal_data_t));
        /*
         *      set the small message buffer size 
         */
-       nal_data->refcnt = 1;
 
        CDEBUG(D_INFO, "Allocd and reset nal_data[%p]\n", nal_data);
        CDEBUG(D_INFO, "small_msg_size is [%d]\n", nal_data->small_msg_size);
 
-       PORTAL_ALLOC(nal, sizeof(nal_t));
-       if (!nal) {
+       PORTAL_ALLOC(libnal, sizeof(lib_nal_t));
+       if (!libnal) {
                PORTAL_FREE(nal_data, sizeof(gmnal_data_t));
-               return(NULL);
+               return(PTL_NO_SPACE);
        }
-       memset(nal, 0, sizeof(nal_t));
-       CDEBUG(D_INFO, "Allocd and reset nal[%p]\n", nal);
+       memset(libnal, 0, sizeof(lib_nal_t));
+       CDEBUG(D_INFO, "Allocd and reset libnal[%p]\n", libnal);
 
-       PORTAL_ALLOC(nal_cb, sizeof(nal_cb_t));
-       if (!nal_cb) {
-               PORTAL_FREE(nal, sizeof(nal_t));
-               PORTAL_FREE(nal_data, sizeof(gmnal_data_t));
-               return(NULL);
-       }
-       memset(nal_cb, 0, sizeof(nal_cb_t));
-       CDEBUG(D_INFO, "Allocd and reset nal_cb[%p]\n", nal_cb);
-
-       GMNAL_INIT_NAL(nal);
-       GMNAL_INIT_NAL_CB(nal_cb);
+       GMNAL_INIT_NAL_CB(libnal);
        /*
         *      String them all together
         */
-       nal->nal_data = (void*)nal_data;
-       nal_cb->nal_data = (void*)nal_data;
+       libnal->libnal_data = (void*)nal_data;
        nal_data->nal = nal;
-       nal_data->nal_cb = nal_cb;
+       nal_data->libnal = libnal;
 
-       GMNAL_CB_LOCK_INIT(nal_data);
        GMNAL_GM_LOCK_INIT(nal_data);
 
 
@@ -277,15 +168,14 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size,
        CDEBUG(D_INFO, "Calling gm_init\n");
        if (gm_init() != GM_SUCCESS) {
                CDEBUG(D_ERROR, "call to gm_init failed\n");
-               PORTAL_FREE(nal, sizeof(nal_t));        
                PORTAL_FREE(nal_data, sizeof(gmnal_data_t));    
-               PORTAL_FREE(nal_cb, sizeof(nal_cb_t));
-               return(NULL);
+               PORTAL_FREE(libnal, sizeof(lib_nal_t));
+               return(PTL_FAIL);
        }
 
 
-       CDEBUG(D_NET, "Calling gm_open with interface [%d], port [%d], "
-                      "name [%s], version [%d]\n", interface, GMNAL_GM_PORT, 
+       CDEBUG(D_NET, "Calling gm_open with port [%d], "
+                      "name [%s], version [%d]\n", GMNAL_GM_PORT, 
               "gmnal", GM_API_VERSION);
 
        GMNAL_GM_LOCK(nal_data);
@@ -323,10 +213,9 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size,
                GMNAL_GM_LOCK(nal_data);
                gm_finalize();
                GMNAL_GM_UNLOCK(nal_data);
-               PORTAL_FREE(nal, sizeof(nal_t));        
                PORTAL_FREE(nal_data, sizeof(gmnal_data_t));    
-               PORTAL_FREE(nal_cb, sizeof(nal_cb_t));
-               return(NULL);
+               PORTAL_FREE(libnal, sizeof(lib_nal_t));
+               return(PTL_FAIL);
        }
 
        
@@ -341,10 +230,9 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size,
                gm_close(nal_data->gm_port);
                gm_finalize();
                GMNAL_GM_UNLOCK(nal_data);
-               PORTAL_FREE(nal, sizeof(nal_t));        
                PORTAL_FREE(nal_data, sizeof(gmnal_data_t));    
-               PORTAL_FREE(nal_cb, sizeof(nal_cb_t));
-               return(NULL);
+               PORTAL_FREE(libnal, sizeof(lib_nal_t));
+               return(PTL_FAIL);
        }
 
 
@@ -371,10 +259,9 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size,
                gm_close(nal_data->gm_port);
                gm_finalize();
                GMNAL_GM_UNLOCK(nal_data);
-               PORTAL_FREE(nal, sizeof(nal_t));        
                PORTAL_FREE(nal_data, sizeof(gmnal_data_t));    
-               PORTAL_FREE(nal_cb, sizeof(nal_cb_t));
-               return(NULL);
+               PORTAL_FREE(libnal, sizeof(lib_nal_t));
+               return(PTL_FAIL);
        }
 
        gmnal_start_kernel_threads(nal_data);
@@ -404,13 +291,14 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size,
                gm_close(nal_data->gm_port);
                gm_finalize();
                GMNAL_GM_UNLOCK(nal_data);
-               PORTAL_FREE(nal, sizeof(nal_t));        
                PORTAL_FREE(nal_data, sizeof(gmnal_data_t));    
-               PORTAL_FREE(nal_cb, sizeof(nal_cb_t));
-               return(NULL);
+               PORTAL_FREE(libnal, sizeof(lib_nal_t));
+               return(PTL_FAIL);
        }
+
        nal_data->gm_local_nid = local_nid;
        CDEBUG(D_INFO, "Local node id is [%u]\n", local_nid);
+
        GMNAL_GM_LOCK(nal_data);
        gm_status = gm_node_id_to_global_id(nal_data->gm_port, local_nid, 
                                            &global_nid);
@@ -425,10 +313,9 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size,
                gm_close(nal_data->gm_port);
                gm_finalize();
                GMNAL_GM_UNLOCK(nal_data);
-               PORTAL_FREE(nal, sizeof(nal_t));        
                PORTAL_FREE(nal_data, sizeof(gmnal_data_t));    
-               PORTAL_FREE(nal_cb, sizeof(nal_cb_t));
-               return(NULL);
+               PORTAL_FREE(libnal, sizeof(lib_nal_t));
+               return(PTL_FAIL);
        }
        CDEBUG(D_INFO, "Global node id is [%u]\n", global_nid);
        nal_data->gm_global_nid = global_nid;
@@ -437,13 +324,15 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size,
 /*
        pid = gm_getpid();
 */
-       CDEBUG(D_INFO, "portals_pid is [%u]\n", portals_pid);
-       portals_nid = (unsigned long)global_nid;
-       CDEBUG(D_INFO, "portals_nid is ["LPU64"]\n", portals_nid);
+        process_id.pid = requested_pid;
+        process_id.nid = global_nid;
+        
+       CDEBUG(D_INFO, "portals_pid is [%u]\n", process_id.pid);
+       CDEBUG(D_INFO, "portals_nid is ["LPU64"]\n", process_id.nid);
        
        CDEBUG(D_PORTALS, "calling lib_init\n");
-       if (lib_init(nal_cb, portals_nid, portals_pid, 1024, ptl_size,
-                    ac_size) != PTL_OK) {
+       if (lib_init(libnal, nal, process_id,
+                     requested_limits, actual_limits) != PTL_OK) {
                CDEBUG(D_ERROR, "lib_init failed\n");
                gmnal_stop_rxthread(nal_data);
                gmnal_stop_ctthread(nal_data);
@@ -453,48 +342,83 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size,
                gm_close(nal_data->gm_port);
                gm_finalize();
                GMNAL_GM_UNLOCK(nal_data);
-               PORTAL_FREE(nal, sizeof(nal_t));        
                PORTAL_FREE(nal_data, sizeof(gmnal_data_t));    
-               PORTAL_FREE(nal_cb, sizeof(nal_cb_t));
-               return(NULL);
+               PORTAL_FREE(libnal, sizeof(lib_nal_t));
+               return(PTL_FAIL);
                
        }
+
+       if (libcfs_nal_cmd_register(GMNAL, &gmnal_cmd, libnal->libnal_data) != 0) {
+               CDEBUG(D_INFO, "libcfs_nal_cmd_register failed\n");
+
+                /* XXX these cleanup cases should be restructured to
+                 * minimise duplication... */
+                lib_fini(libnal);
+                
+               gmnal_stop_rxthread(nal_data);
+               gmnal_stop_ctthread(nal_data);
+               gmnal_free_txd(nal_data);
+               gmnal_free_srxd(nal_data);
+               GMNAL_GM_LOCK(nal_data);
+               gm_close(nal_data->gm_port);
+               gm_finalize();
+               GMNAL_GM_UNLOCK(nal_data);
+               PORTAL_FREE(nal_data, sizeof(gmnal_data_t));    
+               PORTAL_FREE(libnal, sizeof(lib_nal_t));
+               return(PTL_FAIL);
+        }
+
+        /* might be better to initialise this at module load rather than in
+         * NAL startup */
         nal_data->sysctl = NULL;
         nal_data->sysctl = register_sysctl_table (gmnalnal_top_sysctl_table, 0);
 
        
        CDEBUG(D_INFO, "gmnal_init finished\n");
        global_nal_data = nal->nal_data;
-       return(nal);
+
+        /* no unload now until shutdown */
+        PORTAL_MODULE_USE;
+        
+       return(PTL_OK);
 }
 
+nal_t the_gm_nal;
+
+/* 
+ *        Called when module loaded
+ */
+int gmnal_init(void)
+{
+        int    rc;
+
+       memset(&the_gm_nal, 0, sizeof(nal_t));
+       CDEBUG(D_INFO, "reset nal[%p]\n", &the_gm_nal);
+       GMNAL_INIT_NAL(&the_gm_nal);
+
+        rc = ptl_register_nal(GMNAL, &the_gm_nal);
+        if (rc != PTL_OK)
+                CERROR("Can't register GMNAL: %d\n", rc);
+        rc = PtlNIInit(GMNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kgmnal_ni);
+        if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
+                ptl_unregister_nal(GMNAL);
+                return (-ENODEV);
+        }
+
+        return (rc);
+}
 
+                
 
 /*
  *     Called when module removed
  */
 void gmnal_fini()
 {
-       gmnal_data_t    *nal_data = global_nal_data;
-       nal_t           *nal = nal_data->nal;
-       nal_cb_t        *nal_cb = nal_data->nal_cb;
-
        CDEBUG(D_TRACE, "gmnal_fini\n");
 
-       PtlNIFini(kgmnal_ni);
-       lib_fini(nal_cb);
+        LASSERT(global_nal_data == NULL);
+        PtlNIFini(kgmnal_ni);
 
-       gmnal_stop_rxthread(nal_data);
-       gmnal_stop_ctthread(nal_data);
-       gmnal_free_txd(nal_data);
-       gmnal_free_srxd(nal_data);
-       GMNAL_GM_LOCK(nal_data);
-       gm_close(nal_data->gm_port);
-       gm_finalize();
-       GMNAL_GM_UNLOCK(nal_data);
-        if (nal_data->sysctl)
-                unregister_sysctl_table (nal_data->sysctl);
-       PORTAL_FREE(nal, sizeof(nal_t));        
-       PORTAL_FREE(nal_data, sizeof(gmnal_data_t));    
-       PORTAL_FREE(nal_cb, sizeof(nal_cb_t));
+        ptl_unregister_nal(GMNAL);
 }
index 1f28746..0ebf437 100644 (file)
 
 #include "gmnal.h"
 
-int gmnal_cb_recv(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, 
-                  unsigned int niov, struct iovec *iov, size_t mlen,
-                  size_t rlen)
+ptl_err_t gmnal_cb_recv(lib_nal_t *libnal, void *private, lib_msg_t *cookie, 
+                  unsigned int niov, struct iovec *iov, size_t offset,
+                  size_t mlen, size_t rlen)
 {
        gmnal_srxd_t    *srxd = (gmnal_srxd_t*)private;
        int             status = PTL_OK;
 
 
-       CDEBUG(D_TRACE, "gmnal_cb_recv nal_cb [%p], private[%p], cookie[%p], "
-              "niov[%d], iov [%p], mlen["LPSZ"], rlen["LPSZ"]\n", 
-              nal_cb, private, cookie, niov, iov, mlen, rlen);
+       CDEBUG(D_TRACE, "gmnal_cb_recv libnal [%p], private[%p], cookie[%p], "
+              "niov[%d], iov [%p], offset["LPSZ"], mlen["LPSZ"], rlen["LPSZ"]\n", 
+              libnal, private, cookie, niov, iov, offset, mlen, rlen);
 
        switch(srxd->type) {
        case(GMNAL_SMALL_MESSAGE):
                CDEBUG(D_INFO, "gmnal_cb_recv got small message\n");
-               status = gmnal_small_rx(nal_cb, private, cookie, niov, 
-                                        iov, mlen, rlen);
+               status = gmnal_small_rx(libnal, private, cookie, niov, 
+                                        iov, offset, mlen, rlen);
        break;
        case(GMNAL_LARGE_MESSAGE_INIT):
                CDEBUG(D_INFO, "gmnal_cb_recv got large message init\n");
-               status = gmnal_large_rx(nal_cb, private, cookie, niov, 
-                                        iov, mlen, rlen);
+               status = gmnal_large_rx(libnal, private, cookie, niov, 
+                                        iov, offset, mlen, rlen);
        }
                
 
@@ -56,9 +56,9 @@ int gmnal_cb_recv(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
        return(status);
 }
 
-int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, 
-                        unsigned int kniov, ptl_kiov_t *kiov, size_t mlen,
-                        size_t rlen)
+ptl_err_t gmnal_cb_recv_pages(lib_nal_t *libnal, void *private, lib_msg_t *cookie, 
+                        unsigned int kniov, ptl_kiov_t *kiov, size_t offset,
+                        size_t mlen, size_t rlen)
 {
        gmnal_srxd_t    *srxd = (gmnal_srxd_t*)private;
        int             status = PTL_OK;
@@ -67,9 +67,9 @@ int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
        ptl_kiov_t      *kiov_dup = kiov;
 
 
-       CDEBUG(D_TRACE, "gmnal_cb_recv_pages nal_cb [%p],private[%p], "
-              "cookie[%p], kniov[%d], kiov [%p], mlen["LPSZ"], rlen["LPSZ"]\n",
-              nal_cb, private, cookie, kniov, kiov, mlen, rlen);
+       CDEBUG(D_TRACE, "gmnal_cb_recv_pages libnal [%p],private[%p], "
+              "cookie[%p], kniov[%d], kiov [%p], offset["LPSZ"], mlen["LPSZ"], rlen["LPSZ"]\n",
+              libnal, private, cookie, kniov, kiov, offset, mlen, rlen);
 
        if (srxd->type == GMNAL_SMALL_MESSAGE) {
                PORTAL_ALLOC(iovec, sizeof(struct iovec)*kniov);
@@ -98,8 +98,8 @@ int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
                         kiov++;
                }
                CDEBUG(D_INFO, "calling gmnal_small_rx\n");
-               status = gmnal_small_rx(nal_cb, private, cookie, kniov, 
-                                        iovec_dup, mlen, rlen);
+               status = gmnal_small_rx(libnal, private, cookie, kniov, 
+                                        iovec_dup, offset, mlen, rlen);
                for (i=0; i<kniov; i++) {
                        kunmap(kiov_dup->kiov_page);
                        kiov_dup++;
@@ -113,34 +113,35 @@ int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
 }
 
 
-int gmnal_cb_send(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, 
+ptl_err_t gmnal_cb_send(lib_nal_t *libnal, void *private, lib_msg_t *cookie, 
                   ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
-                  unsigned int niov, struct iovec *iov, size_t len)
+                  unsigned int niov, struct iovec *iov, size_t offset, size_t len)
 {
 
        gmnal_data_t    *nal_data;
 
 
-       CDEBUG(D_TRACE, "gmnal_cb_send niov[%d] len["LPSZ"] nid["LPU64"]\n", 
-              niov, len, nid);
-       nal_data = nal_cb->nal_data;
+       CDEBUG(D_TRACE, "gmnal_cb_send niov[%d] offset["LPSZ"] len["LPSZ"] nid["LPU64"]\n", 
+              niov, offset, len, nid);
+       nal_data = libnal->libnal_data;
        
        if (GMNAL_IS_SMALL_MESSAGE(nal_data, niov, iov, len)) {
                CDEBUG(D_INFO, "This is a small message send\n");
-               gmnal_small_tx(nal_cb, private, cookie, hdr, type, nid, pid, 
-                               niov, iov, len);
+               gmnal_small_tx(libnal, private, cookie, hdr, type, nid, pid, 
+                               niov, iov, offset,  len);
        } else {
                CDEBUG(D_ERROR, "Large message send it is not supported\n");
-               lib_finalize(nal_cb, private, cookie, PTL_FAIL);
+               lib_finalize(libnal, private, cookie, PTL_FAIL);
                return(PTL_FAIL);
-               gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, pid, 
-                               niov, iov, len);
+               gmnal_large_tx(libnal, private, cookie, hdr, type, nid, pid, 
+                               niov, iov, offset, len);
        }
        return(PTL_OK);
 }
 
-int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, 
-                        ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,                         unsigned int kniov, ptl_kiov_t *kiov, size_t len)
+ptl_err_t gmnal_cb_send_pages(lib_nal_t *libnal, void *private, lib_msg_t *cookie, 
+                        ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                         unsigned int kniov, ptl_kiov_t *kiov, size_t offset, size_t len)
 {
 
        int     i = 0;
@@ -148,8 +149,9 @@ int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
        struct  iovec   *iovec = NULL, *iovec_dup = NULL;
        ptl_kiov_t      *kiov_dup = kiov;
 
-       CDEBUG(D_TRACE, "gmnal_cb_send_pages nid ["LPU64"] niov[%d] len["LPSZ"]\n", nid, kniov, len);
-       nal_data = nal_cb->nal_data;
+       CDEBUG(D_TRACE, "gmnal_cb_send_pages nid ["LPU64"] niov[%d] offset["LPSZ"] len["LPSZ"]\n", 
+               nid, kniov, offset, len);
+       nal_data = libnal->libnal_data;
        PORTAL_ALLOC(iovec, kniov*sizeof(struct iovec));
         iovec_dup = iovec;
        if (GMNAL_IS_SMALL_MESSAGE(nal_data, 0, NULL, len)) {
@@ -168,8 +170,8 @@ int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
                         iovec++;
                         kiov++;
                }
-               gmnal_small_tx(nal_cb, private, cookie, hdr, type, nid, 
-                               pid, kniov, iovec_dup, len);
+               gmnal_small_tx(libnal, private, cookie, hdr, type, nid, 
+                               pid, kniov, iovec_dup, offset, len);
        } else {
                CDEBUG(D_ERROR, "Large message send it is not supported yet\n");
                return(PTL_FAIL);
@@ -185,8 +187,8 @@ int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
                         iovec++;
                         kiov++;
                }
-               gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, 
-                               pid, kniov, iovec, len);
+               gmnal_large_tx(libnal, private, cookie, hdr, type, nid, 
+                               pid, kniov, iovec, offset, len);
        }
        for (i=0; i<kniov; i++) {
                kunmap(kiov_dup->kiov_page);
@@ -196,83 +198,7 @@ int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
        return(PTL_OK);
 }
 
-int gmnal_cb_read(nal_cb_t *nal_cb, void *private, void *dst, 
-                  user_ptr src, size_t len)
-{
-       gm_bcopy(src, dst, len);
-       return(PTL_OK);
-}
-
-int gmnal_cb_write(nal_cb_t *nal_cb, void *private, user_ptr dst, 
-                   void *src, size_t len)
-{
-       gm_bcopy(src, dst, len);
-       return(PTL_OK);
-}
-
-int gmnal_cb_callback(nal_cb_t *nal_cb, void *private, lib_eq_t *eq, 
-                      ptl_event_t *ev)
-{
-
-       if (eq->event_callback != NULL) {
-               CDEBUG(D_INFO, "found callback\n");
-               eq->event_callback(ev);
-       }
-       
-       return(PTL_OK);
-}
-
-void *gmnal_cb_malloc(nal_cb_t *nal_cb, size_t len)
-{
-       void *ptr = NULL;
-       CDEBUG(D_TRACE, "gmnal_cb_malloc len["LPSZ"]\n", len);
-       PORTAL_ALLOC(ptr, len);
-       return(ptr);
-}
-
-void gmnal_cb_free(nal_cb_t *nal_cb, void *buf, size_t len)
-{
-       CDEBUG(D_TRACE, "gmnal_cb_free :: buf[%p] len["LPSZ"]\n", buf, len);
-       PORTAL_FREE(buf, len);
-       return;
-}
-
-void gmnal_cb_unmap(nal_cb_t *nal_cb, unsigned int niov, struct iovec *iov, 
-                    void **addrkey)
-{
-       return;
-}
-
-int  gmnal_cb_map(nal_cb_t *nal_cb, unsigned int niov, struct iovec *iov, 
-                  void**addrkey)
-{
-       return(PTL_OK);
-}
-
-void gmnal_cb_printf(nal_cb_t *nal_cb, const char *fmt, ...)
-{
-       CDEBUG(D_TRACE, "gmnal_cb_printf\n");
-       printk(fmt);
-       return;
-}
-
-void gmnal_cb_cli(nal_cb_t *nal_cb, unsigned long *flags)
-{
-       gmnal_data_t    *nal_data = (gmnal_data_t*)nal_cb->nal_data;
-
-       spin_lock_irqsave(&nal_data->cb_lock, *flags);
-       return;
-}
-
-void gmnal_cb_sti(nal_cb_t *nal_cb, unsigned long *flags)
-{
-       gmnal_data_t    *nal_data = (gmnal_data_t*)nal_cb->nal_data;
-
-       spin_unlock_irqrestore(&nal_data->cb_lock, *flags);
-       return;
-}
-
-int gmnal_cb_dist(nal_cb_t *nal_cb, ptl_nid_t nid, unsigned long *dist)
+int gmnal_cb_dist(lib_nal_t *libnal, ptl_nid_t nid, unsigned long *dist)
 {
        CDEBUG(D_TRACE, "gmnal_cb_dist\n");
        if (dist)
index 1bcd9bd..6a8fcbc 100644 (file)
@@ -189,6 +189,7 @@ gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type)
        unsigned int snode, sport, type, length;
        gmnal_msghdr_t  *gmnal_msghdr;
        ptl_hdr_t       *portals_hdr;
+        int              rc;
 
        CDEBUG(D_INFO, "nal_data [%p], we[%p] type [%d]\n", 
               nal_data, we, gmnal_type);
@@ -219,10 +220,12 @@ gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type)
         */
        srxd = gmnal_rxbuffer_to_srxd(nal_data, buffer);
        CDEBUG(D_INFO, "Back from gmnal_rxbuffer_to_srxd\n");
-       srxd->nal_data = nal_data;
        if (!srxd) {
                CDEBUG(D_ERROR, "Failed to get receive descriptor\n");
-               lib_parse(nal_data->nal_cb, portals_hdr, srxd);
+                /* I think passing a NULL srxd to lib_parse will crash
+                 * gmnal_recv() */
+                LBUG();
+               lib_parse(nal_data->libnal, portals_hdr, srxd);
                return(GMNAL_STATUS_FAIL);
        }
 
@@ -234,6 +237,7 @@ gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type)
                return(GMNAL_STATUS_OK);
        }
 
+       srxd->nal_data = nal_data;
        srxd->type = gmnal_type;
        srxd->nsiov = gmnal_msghdr->niov;
        srxd->gm_source_node = gmnal_msghdr->sender_node_id;
@@ -245,7 +249,12 @@ gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type)
         *      cb_recv is responsible for returning the buffer 
         *      for future receive
         */
-       lib_parse(nal_data->nal_cb, portals_hdr, srxd);
+       rc = lib_parse(nal_data->libnal, portals_hdr, srxd);
+
+        if (rc != PTL_OK) {
+                /* I just received garbage; take appropriate action... */
+                LBUG();
+        }
 
        return(GMNAL_STATUS_OK);
 }
@@ -309,19 +318,19 @@ gmnal_rx_bad(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, gmnal_srxd_t *srxd)
  *     Call lib_finalize
  */
 int
-gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, 
-               unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen)
+gmnal_small_rx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, 
+               unsigned int niov, struct iovec *iov, size_t offset, size_t mlen, size_t rlen)
 {
        gmnal_srxd_t    *srxd = NULL;
        void    *buffer = NULL;
-       gmnal_data_t    *nal_data = (gmnal_data_t*)nal_cb->nal_data;
+       gmnal_data_t    *nal_data = (gmnal_data_t*)libnal->libnal_data;
 
 
        CDEBUG(D_TRACE, "niov [%d] mlen["LPSZ"]\n", niov, mlen);
 
        if (!private) {
                CDEBUG(D_ERROR, "gmnal_small_rx no context\n");
-               lib_finalize(nal_cb, private, cookie, PTL_FAIL);
+               lib_finalize(libnal, private, cookie, PTL_FAIL);
                return(PTL_FAIL);
        }
 
@@ -331,11 +340,24 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
        buffer += sizeof(ptl_hdr_t);
 
        while(niov--) {
-               CDEBUG(D_INFO, "processing [%p] len ["LPSZ"]\n", iov, 
-                      iov->iov_len);
-               gm_bcopy(buffer, iov->iov_base, iov->iov_len);                  
-               buffer += iov->iov_len;
-               iov++;
+                if (offset >= iov->iov_len) {
+                        offset -= iov->iov_len;
+                } else if (offset > 0) {
+                       CDEBUG(D_INFO, "processing [%p] base [%p] len %d, "
+                               "offset %d, len ["LPSZ"]\n", iov,
+                              iov->iov_base + offset, iov->iov_len, offset,
+                               iov->iov_len - offset);
+                       gm_bcopy(buffer, iov->iov_base + offset,
+                                 iov->iov_len - offset);
+                        buffer += iov->iov_len - offset;
+                        offset = 0;
+                } else {
+                       CDEBUG(D_INFO, "processing [%p] len ["LPSZ"]\n", iov,
+                              iov->iov_len);
+                       gm_bcopy(buffer, iov->iov_base, iov->iov_len);
+                       buffer += iov->iov_len;
+                }
+                iov++;
        }
 
 
@@ -343,7 +365,7 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
         *      let portals library know receive is complete
         */
        CDEBUG(D_PORTALS, "calling lib_finalize\n");
-       lib_finalize(nal_cb, private, cookie, PTL_OK);
+       lib_finalize(libnal, private, cookie, PTL_OK);
        /*
         *      return buffer so it can be used again
         */
@@ -365,11 +387,11 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
  *     The callback function informs when the send is complete.
  */
 int
-gmnal_small_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, 
+gmnal_small_tx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, 
                ptl_hdr_t *hdr, int type, ptl_nid_t global_nid, ptl_pid_t pid, 
-               unsigned int niov, struct iovec *iov, int size)
+               unsigned int niov, struct iovec *iov, size_t offset, int size)
 {
-       gmnal_data_t    *nal_data = (gmnal_data_t*)nal_cb->nal_data;
+       gmnal_data_t    *nal_data = (gmnal_data_t*)libnal->libnal_data;
        gmnal_stxd_t    *stxd = NULL;
        void            *buffer = NULL;
        gmnal_msghdr_t  *msghdr = NULL;
@@ -377,9 +399,9 @@ gmnal_small_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
        unsigned int    local_nid;
        gm_status_t     gm_status = GM_SUCCESS;
 
-       CDEBUG(D_TRACE, "gmnal_small_tx nal_cb [%p] private [%p] cookie [%p] "
+       CDEBUG(D_TRACE, "gmnal_small_tx libnal [%p] private [%p] cookie [%p] "
               "hdr [%p] type [%d] global_nid ["LPU64"] pid [%d] niov [%d] "
-              "iov [%p] size [%d]\n", nal_cb, private, cookie, hdr, type, 
+              "iov [%p] size [%d]\n", libnal, private, cookie, hdr, type, 
               global_nid, pid, niov, iov, size);
 
        CDEBUG(D_INFO, "portals_hdr:: dest_nid ["LPU64"], src_nid ["LPU64"]\n",
@@ -428,11 +450,21 @@ gmnal_small_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
        buffer += sizeof(ptl_hdr_t);
 
        while(niov--) {
-               CDEBUG(D_INFO, "processing iov [%p] len ["LPSZ"] to [%p]\n", 
-                      iov, iov->iov_len, buffer);
-               gm_bcopy(iov->iov_base, buffer, iov->iov_len);
-               buffer+= iov->iov_len;
-               iov++;
+                if (offset >= iov->iov_len) {
+                        offset -= iov->iov_len;
+                } else if (offset > 0) {
+                       CDEBUG(D_INFO, "processing iov [%p] base [%p] len ["LPSZ"] to [%p]\n", 
+                               iov, iov->iov_base + offset, iov->iov_len - offset, buffer);
+                       gm_bcopy(iov->iov_base + offset, buffer, iov->iov_len - offset);
+                       buffer+= iov->iov_len - offset;
+                        offset = 0;
+                } else {
+                       CDEBUG(D_INFO, "processing iov [%p] len ["LPSZ"] to [%p]\n", 
+                               iov, iov->iov_len, buffer);
+                       gm_bcopy(iov->iov_base, buffer, iov->iov_len);
+                       buffer+= iov->iov_len;
+                } 
+                iov++;
        }
 
        CDEBUG(D_INFO, "sending\n");
@@ -472,7 +504,7 @@ gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status)
        gmnal_stxd_t    *stxd = (gmnal_stxd_t*)context;
        lib_msg_t       *cookie = stxd->cookie;
        gmnal_data_t    *nal_data = (gmnal_data_t*)stxd->nal_data;
-       nal_cb_t        *nal_cb = nal_data->nal_cb;
+       lib_nal_t       *libnal = nal_data->libnal;
 
        if (!stxd) {
                CDEBUG(D_TRACE, "send completion event for unknown stxd\n");
@@ -592,7 +624,7 @@ gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status)
                return;
        }
        gmnal_return_stxd(nal_data, stxd);
-       lib_finalize(nal_cb, stxd, cookie, PTL_OK);
+       lib_finalize(libnal, stxd, cookie, PTL_OK);
        return;
 }
 
@@ -645,9 +677,9 @@ void gmnal_drop_sends_callback(struct gm_port *gm_port, void *context,
  *     this ack, deregister the memory. Only 1 send token is required here.
  */
 int
-gmnal_large_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, 
+gmnal_large_tx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, 
                ptl_hdr_t *hdr, int type, ptl_nid_t global_nid, ptl_pid_t pid, 
-               unsigned int niov, struct iovec *iov, int size)
+               unsigned int niov, struct iovec *iov, size_t offset, int size)
 {
 
        gmnal_data_t    *nal_data;
@@ -661,15 +693,15 @@ gmnal_large_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
        int             niov_dup;
 
 
-       CDEBUG(D_TRACE, "gmnal_large_tx nal_cb [%p] private [%p], cookie [%p] "
+       CDEBUG(D_TRACE, "gmnal_large_tx libnal [%p] private [%p], cookie [%p] "
               "hdr [%p], type [%d] global_nid ["LPU64"], pid [%d], niov [%d], "
-              "iov [%p], size [%d]\n", nal_cb, private, cookie, hdr, type, 
+              "iov [%p], size [%d]\n", libnal, private, cookie, hdr, type, 
               global_nid, pid, niov, iov, size);
 
-       if (nal_cb)
-               nal_data = (gmnal_data_t*)nal_cb->nal_data;
+       if (libnal)
+               nal_data = (gmnal_data_t*)libnal->libnal_data;
        else  {
-               CDEBUG(D_ERROR, "no nal_cb.\n");
+               CDEBUG(D_ERROR, "no libnal.\n");
                return(GMNAL_STATUS_FAIL);
        }
        
@@ -712,30 +744,39 @@ gmnal_large_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
        mlen += sizeof(ptl_hdr_t); 
        CDEBUG(D_INFO, "mlen is [%d]\n", mlen);
 
+        while (offset >= iov->iov_len) {
+                offset -= iov->iov_len;
+                niov--;
+                iov++;
+        } 
+
+        LASSERT(offset >= 0);
+        /*
+        *      Store the iovs in the stxd for we can get 
+        *      them later if we need them
+        */
+        stxd->iov[0].iov_base = iov->iov_base + offset; 
+        stxd->iov[0].iov_len = iov->iov_len - offset; 
+       CDEBUG(D_NET, "Copying iov [%p] to [%p], niov=%d\n", iov, stxd->iov, niov);
+        if (niov > 1)
+               gm_bcopy(&iov[1], &stxd->iov[1], (niov-1)*sizeof(struct iovec));
+       stxd->niov = niov;
+
        /*
         *      copy the iov to the buffer so target knows 
         *      where to get the data from
         */
        CDEBUG(D_INFO, "processing iov to [%p]\n", buffer);
-       gm_bcopy(iov, buffer, niov*sizeof(struct iovec));
-       mlen += niov*(sizeof(struct iovec));
+       gm_bcopy(stxd->iov, buffer, stxd->niov*sizeof(struct iovec));
+       mlen += stxd->niov*(sizeof(struct iovec));
        CDEBUG(D_INFO, "mlen is [%d]\n", mlen);
-
-
-       /*
-        *      Store the iovs in the stxd for we can get 
-        *      them later if we need them
-        */
-       CDEBUG(D_NET, "Copying iov [%p] to [%p]\n", iov, stxd->iov);
-       gm_bcopy(iov, stxd->iov, niov*sizeof(struct iovec));
-       stxd->niov = niov;
        
-
        /*
         *      register the memory so the NIC can get hold of the data
         *      This is a slow process. it'd be good to overlap it 
         *      with something else.
         */
+        iov = stxd->iov;
        iov_dup = iov;
        niov_dup = niov;
        while(niov--) {
@@ -811,11 +852,11 @@ gmnal_large_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status)
  *     data from the sender.
  */
 int
-gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, 
-               unsigned int nriov, struct iovec *riov, size_t mlen,
-               size_t rlen)
+gmnal_large_rx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, 
+               unsigned int nriov, struct iovec *riov, size_t offset,
+               size_t mlen, size_t rlen)
 {
-       gmnal_data_t    *nal_data = nal_cb->nal_data;
+       gmnal_data_t    *nal_data = libnal->libnal_data;
        gmnal_srxd_t    *srxd = (gmnal_srxd_t*)private;
        void            *buffer = NULL;
        struct  iovec   *riov_dup;
@@ -823,13 +864,13 @@ gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
        gmnal_msghdr_t  *msghdr = NULL;
        gm_status_t     gm_status;
 
-       CDEBUG(D_TRACE, "gmnal_large_rx :: nal_cb[%p], private[%p], "
+       CDEBUG(D_TRACE, "gmnal_large_rx :: libnal[%p], private[%p], "
               "cookie[%p], niov[%d], iov[%p], mlen["LPSZ"], rlen["LPSZ"]\n",
-               nal_cb, private, cookie, nriov, riov, mlen, rlen);
+               libnal, private, cookie, nriov, riov, mlen, rlen);
 
        if (!srxd) {
                CDEBUG(D_ERROR, "gmnal_large_rx no context\n");
-               lib_finalize(nal_cb, private, cookie, PTL_FAIL);
+               lib_finalize(libnal, private, cookie, PTL_FAIL);
                return(PTL_FAIL);
        }
 
@@ -854,6 +895,25 @@ gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
         *      If the iovecs match, could interleave 
         *      gm_registers and gm_gets for each element
         */
+        while (offset >= riov->iov_len) {
+                offset -= riov->iov_len;
+                riov++;
+                nriov--;
+        } 
+        LASSERT (nriov >= 0);
+        LASSERT (offset >= 0);
+       /*
+        *      do this so the final gm_get callback can deregister the memory
+        */
+       PORTAL_ALLOC(srxd->riov, nriov*(sizeof(struct iovec)));
+
+        srxd->riov[0].iov_base = riov->iov_base + offset;
+        srxd->riov[0].iov_len = riov->iov_len - offset;
+        if (nriov > 1)
+               gm_bcopy(&riov[1], &srxd->riov[1], (nriov-1)*(sizeof(struct iovec)));
+       srxd->nriov = nriov;
+        
+        riov = srxd->riov;
        nriov_dup = nriov;
        riov_dup = riov;
        while(nriov--) {
@@ -879,17 +939,12 @@ gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
                        /*
                         *      give back srxd and buffer. Send NACK to sender
                         */
+                        PORTAL_FREE(srxd->riov, nriov_dup*(sizeof(struct iovec)));
                        return(PTL_FAIL);
                }
                GMNAL_GM_UNLOCK(nal_data);
                riov++;
        }
-       /*
-        *      do this so the final gm_get callback can deregister the memory
-        */
-       PORTAL_ALLOC(srxd->riov, nriov_dup*(sizeof(struct iovec)));
-       gm_bcopy(riov_dup, srxd->riov, nriov_dup*(sizeof(struct iovec)));
-       srxd->nriov = nriov_dup;
 
        /*
         *      now do gm_get to get the data
@@ -1092,7 +1147,7 @@ gmnal_remote_get_callback(gm_port_t *gm_port, void *context,
 
        gmnal_ltxd_t    *ltxd = (gmnal_ltxd_t*)context;
        gmnal_srxd_t    *srxd = ltxd->srxd;
-       nal_cb_t        *nal_cb = srxd->nal_data->nal_cb;
+       lib_nal_t       *libnal = srxd->nal_data->libnal;
        int             lastone;
        struct  iovec   *riov;
        int             nriov;
@@ -1126,7 +1181,7 @@ gmnal_remote_get_callback(gm_port_t *gm_port, void *context,
         *      Let our client application proceed
         */     
        CDEBUG(D_ERROR, "final callback context[%p]\n", srxd);
-       lib_finalize(nal_cb, srxd, srxd->cookie, PTL_OK);
+       lib_finalize(libnal, srxd, srxd->cookie, PTL_OK);
 
        /*
         *      send an ack to the sender to let him know we got the data
@@ -1276,7 +1331,7 @@ gmnal_large_tx_ack_callback(gm_port_t *gm_port, void *context,
 void 
 gmnal_large_tx_ack_received(gmnal_data_t *nal_data, gmnal_srxd_t *srxd)
 {
-       nal_cb_t        *nal_cb = nal_data->nal_cb;
+       lib_nal_t       *libnal = nal_data->libnal;
        gmnal_stxd_t    *stxd = NULL;
        gmnal_msghdr_t  *msghdr = NULL;
        void            *buffer = NULL;
@@ -1291,7 +1346,7 @@ gmnal_large_tx_ack_received(gmnal_data_t *nal_data, gmnal_srxd_t *srxd)
 
        CDEBUG(D_INFO, "gmnal_large_tx_ack_received stxd [%p]\n", stxd);
 
-       lib_finalize(nal_cb, stxd, stxd->cookie, PTL_OK);
+       lib_finalize(libnal, stxd, stxd->cookie, PTL_OK);
 
        /*
         *      extract the iovec from the stxd, deregister the memory.
index 31f6819..3aca90f 100644 (file)
@@ -32,9 +32,6 @@ int num_rx_threads = -1;
 int num_stxds = 5;
 int gm_port = 4;
 
-ptl_handle_ni_t        kgmnal_ni;
-
-
 int 
 gmnal_cmd(struct portals_cfg *pcfg, void *private)
 {
@@ -58,9 +55,15 @@ gmnal_cmd(struct portals_cfg *pcfg, void *private)
                copy_from_user(name, pcfg->pcfg_pbuf1, pcfg->pcfg_plen1);
        
                GMNAL_GM_LOCK(nal_data);
-               nid = gm_host_name_to_node_id(nal_data->gm_port, name);
+               //nid = gm_host_name_to_node_id(nal_data->gm_port, name);
+                gm_status = gm_host_name_to_node_id_ex (nal_data->gm_port, 0, name, &nid);
                GMNAL_GM_UNLOCK(nal_data);
-               CDEBUG(D_INFO, "Local node id is [%d]\n", nid);
+                if (gm_status != GM_SUCCESS) {
+                        CDEBUG(D_INFO, "gm_host_name_to_node_id_ex(...host %s) failed[%d]\n",
+                                name, gm_status);
+                        return (-1);
+                } else
+                       CDEBUG(D_INFO, "Local node %s id is [%d]\n", name, nid);
                GMNAL_GM_LOCK(nal_data);
                gm_status = gm_node_id_to_global_id(nal_data->gm_port, 
                                                    nid, &gnid);
@@ -90,28 +93,16 @@ gmnal_load(void)
        CDEBUG(D_TRACE, "This is the gmnal module initialisation routine\n");
 
 
-
        CDEBUG(D_INFO, "Calling gmnal_init\n");
-       status = PtlNIInit(gmnal_init, 32, 4, 0, &kgmnal_ni);
+        status = gmnal_init();
        if (status == PTL_OK) {
-               CDEBUG(D_INFO, "Portals GMNAL initialised ok kgmnal_ni\n");
+               CDEBUG(D_INFO, "Portals GMNAL initialised ok\n");
        } else {
                CDEBUG(D_INFO, "Portals GMNAL Failed to initialise\n");
-               return(1);
+               return(-ENODEV);
                
        }
 
-       CDEBUG(D_INFO, "Calling kportal_nal_register\n");
-       /*
-        *      global_nal_data is set by gmnal_init
-        */
-       if (kportal_nal_register(GMNAL, &gmnal_cmd, global_nal_data) != 0) {
-               CDEBUG(D_INFO, "kportal_nal_register failed\n");
-               return(1);
-       }
-
-       CDEBUG(D_INFO, "Calling PORTAL_SYMBOL_REGISTER\n");
-       PORTAL_SYMBOL_REGISTER(kgmnal_ni);
        CDEBUG(D_INFO, "This is the end of the gmnal init routine");
 
 
@@ -122,11 +113,7 @@ gmnal_load(void)
 static void __exit
 gmnal_unload(void)
 {
-
-       kportal_nal_unregister(GMNAL);
-       PORTAL_SYMBOL_UNREGISTER(kgmnal_ni);
        gmnal_fini();
-       global_nal_data = NULL;
        return;
 }
 
@@ -135,8 +122,6 @@ module_init(gmnal_load);
 
 module_exit(gmnal_unload);
 
-EXPORT_SYMBOL(kgmnal_ni);
-
 MODULE_PARM(gmnal_small_msg_size, "i");
 MODULE_PARM(num_rx_threads, "i");
 MODULE_PARM(num_stxds, "i");
diff --git a/lustre/portals/knals/ibnal/Makefile.in b/lustre/portals/knals/ibnal/Makefile.in
deleted file mode 100644 (file)
index e180b3e..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-MODULES := kibnal
-kibnal-objs := ibnal.o ibnal_cb.o
-
-EXTRA_PRE_CFLAGS := @IBCPPFLAGS@
-
-@INCLUDE_RULES@
diff --git a/lustre/portals/knals/ibnal/autoMakefile.am b/lustre/portals/knals/ibnal/autoMakefile.am
deleted file mode 100644 (file)
index ffe084c..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-if MODULES
-if !CRAY_PORTALS
-if BUILD_IBNAL
-modulenet_DATA = kibnal$(KMODEXT)
-endif
-endif
-endif
-
-MOSTLYCLEANFILES = *.o *.ko *.mod.c
-DIST_SOURCES = $(kibnal-objs:%.o=%.c) ibnal.h
diff --git a/lustre/portals/knals/ibnal/ibnal.c b/lustre/portals/knals/ibnal/ibnal.c
deleted file mode 100644 (file)
index 948badf..0000000
+++ /dev/null
@@ -1,2146 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * Based on ksocknal, qswnal, and gmnal
- *
- * Copyright (C) 2003 LANL 
- *   Author: HB Chen <hbchen@lanl.gov>
- *   Los Alamos National Lab
- *
- *   Portals is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Portals is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Portals; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *   
- */
-
-#include "ibnal.h"
-
-// portal handle ID for this IB-NAL
-ptl_handle_ni_t kibnal_ni;
-
-// message send buffer mutex
-spinlock_t   MSBuf_mutex[NUM_MBUF];
-
-// message recv buffer mutex
-spinlock_t   MRBuf_mutex[NUM_MBUF];
-
-// IB-NAL API information 
-nal_t  kibnal_api; 
-
-// nal's private data 
-kibnal_data_t kibnal_data; 
-
-int ibnal_debug = 0;
-VAPI_pd_hndl_t      Pd_hndl;    
-unsigned int    Num_posted_recv_buf;
-
-// registered send buffer list
-Memory_buffer_info MSbuf_list[NUM_MBUF]; 
-
-// registered recv buffer list 
-Memory_buffer_info MRbuf_list[NUM_MBUF];
-
-//
-// for router 
-// currently there is no need fo IBA  
-//
-kpr_nal_interface_t kibnal_router_interface = {
-        kprni_nalid: IBNAL,
-        kprni_arg:   &kibnal_data,
-        kprni_fwd:   kibnal_fwd_packet, // forward data to router  
-                                        // is router invloving the
-                                        // data transmision 
-};
-
-
-// Queue-pair list 
-QP_info QP_list[NUM_QPS];
-
-// information associated with a HCA 
-HCA_info        Hca_data;
-
-// something about HCA 
-VAPI_hca_hndl_t      Hca_hndl; // assume we only use one HCA now 
-VAPI_hca_vendor_t    Hca_vendor;
-VAPI_hca_cap_t       Hca_cap;
-VAPI_hca_port_t      Hca_port_1_props;
-VAPI_hca_port_t      Hca_port_2_props;
-VAPI_hca_attr_t      Hca_attr;
-VAPI_hca_attr_mask_t Hca_attr_mask;
-VAPI_cq_hndl_t       Cq_RQ_hndl;    // CQ's handle
-VAPI_cq_hndl_t       Cq_SQ_hndl;    // CQ's handle
-VAPI_cq_hndl_t       Cq_hndl;    // CQ's handle
-Remote_QP_Info       L_QP_data;
-Remote_QP_Info       R_QP_data;
-
-
-//
-// forward  API
-//
-int 
-kibnal_forward(nal_t   *nal,
-               int     id,
-               void    *args,  
-               size_t args_len,
-               void    *ret,   
-               size_t ret_len)
-{
-        kibnal_data_t *knal_data = nal->nal_data;
-        nal_cb_t      *nal_cb = knal_data->kib_cb;
-
-        // ASSERT checking 
-        LASSERT (nal == &kibnal_api);
-        LASSERT (knal_data == &kibnal_data);
-        LASSERT (nal_cb == &kibnal_lib);
-
-        // dispatch forward API function 
-        
-        CDEBUG(D_NET,"kibnal_forward: function id = %d\n", id);
-
-        lib_dispatch(nal_cb, knal_data, id, args, ret); 
-
-        CDEBUG(D_TRACE,"IBNAL- Done kibnal_forward\n");
-
-        return PTL_OK; // always return PTL_OK
-}
-
-//
-// lock API  
-//
-void 
-kibnal_lock(nal_t *nal, unsigned long *flags)
-{
-        kibnal_data_t *knal_data = nal->nal_data;
-        nal_cb_t      *nal_cb = knal_data->kib_cb;
-
-        // ASSERT checking 
-        LASSERT (nal == &kibnal_api);
-        LASSERT (knal_data == &kibnal_data);
-        LASSERT (nal_cb == &kibnal_lib);
-
-        // disable logical interrrupt 
-        nal_cb->cb_cli(nal_cb,flags);
-
-        CDEBUG(D_TRACE,"IBNAL-Done kibnal_lock\n");
-
-}
-
-//
-// unlock API
-//
-void 
-kibnal_unlock(nal_t *nal, unsigned long *flags)
-{
-        kibnal_data_t *k = nal->nal_data;
-        nal_cb_t      *nal_cb = k->kib_cb;
-
-        // ASSERT checking
-        LASSERT (nal == &kibnal_api);
-        LASSERT (k == &kibnal_data);
-        LASSERT (nal_cb == &kibnal_lib);
-
-        // enable logical interrupt 
-        nal_cb->cb_sti(nal_cb,flags);
-
-        CDEBUG(D_TRACE,"IBNAL-Done kibnal_unlock");
-
-}
-
-//
-// shutdown API 
-//     showdown this network interface 
-//
-int
-kibnal_shutdown(nal_t *nal, int ni)
-{       
-        VAPI_ret_t          vstat;
-        kibnal_data_t *k = nal->nal_data;
-        nal_cb_t      *nal_cb = k->kib_cb;
-
-        // assert checking
-        LASSERT (nal == &kibnal_api);
-        LASSERT (k == &kibnal_data);
-        LASSERT (nal_cb == &kibnal_lib);
-
-        // take down this IB network interface 
-        // there is not corresponding cb function to hande this
-        // do we actually need this one 
-        // reference to IB network interface shutdown 
-        //
-        
-        vstat = IB_Close_HCA();
-
-        if (vstat != VAPI_OK) {
-           CERROR("Failed to close HCA  - %s\n",VAPI_strerror(vstat));
-           return (~PTL_OK);
-        }
-
-        CDEBUG(D_TRACE,"IBNAL- Done kibnal_shutdown\n");
-
-        return PTL_OK;
-}
-
-//
-// yield 
-// when do we call this yield function 
-//
-void 
-kibnal_yield( nal_t *nal )
-{
-        kibnal_data_t *k = nal->nal_data;
-        nal_cb_t      *nal_cb = k->kib_cb;
-        
-        // assert checking
-        LASSERT (nal == &kibnal_api);
-        LASSERT (k    == &kibnal_data);
-        LASSERT (nal_cb == &kibnal_lib);
-
-        // check under what condition that we need to 
-        // call schedule()
-        // who set this need_resched 
-        if (current->need_resched)
-                schedule();
-
-        CDEBUG(D_TRACE,"IBNAL-Done kibnal_yield");
-
-        return;
-}
-
-//
-// ibnal init 
-//
-nal_t *
-kibnal_init(int             interface, // no use here 
-            ptl_pt_index_t  ptl_size,
-            ptl_ac_index_t  ac_size, 
-            ptl_pid_t       requested_pid // no use here
-           )
-{
-  nal_t         *nal       = NULL;
-  nal_cb_t      *nal_cb    = NULL;
-  kibnal_data_t *nal_data  = NULL;
-  int            rc;
-
-  unsigned int nnids = 1; // number of nids 
-                          // do we know how many nodes are in this
-                          // system related to this kib_nid  
-                          //
-
-  CDEBUG(D_NET, "kibnal_init:calling lib_init with nid 0x%u\n",
-                  kibnal_data.kib_nid);
-
-
-  CDEBUG(D_NET, "kibnal_init: interface [%d], ptl_size [%d], ac_size[%d]\n", 
-                 interface, ptl_size, ac_size);
-  CDEBUG(D_NET, "kibnal_init: &kibnal_lib  0x%X\n", &kibnal_lib);
-  CDEBUG(D_NET, "kibnal_init: kibnal_data.kib_nid  %d\n", kibnal_data.kib_nid);
-
-  rc = lib_init(&kibnal_lib, 
-                kibnal_data.kib_nid, 
-                0, // process id is set as 0  
-                nnids,
-                ptl_size, 
-                ac_size);
-
-  if(rc != PTL_OK) {
-     CERROR("kibnal_init: Failed lib_init with nid 0x%u, rc=%d\n",
-                                  kibnal_data.kib_nid,rc);
-  }
-  else {
-      CDEBUG(D_NET,"kibnal_init: DONE lib_init with nid 0x%x%x\n",
-                                  kibnal_data.kib_nid);
-  }
-
-  return &kibnal_api;
-
-}
-
-
-//
-// called before remove ibnal kernel module 
-//
-void __exit 
-kibnal_finalize(void) 
-{ 
-        struct list_head *tmp;
-
-        inter_module_unregister("kibnal_ni");
-
-        // release resources allocated to this Infiniband network interface 
-        PtlNIFini(kibnal_ni); 
-
-        lib_fini(&kibnal_lib); 
-
-        IB_Close_HCA();
-
-        // how much do we need to do here?
-        list_for_each(tmp, &kibnal_data.kib_list) {
-                kibnal_rx_t *conn;
-                conn = list_entry(tmp, kibnal_rx_t, krx_item);
-                CDEBUG(D_IOCTL, "freeing conn %p\n",conn);
-                tmp = tmp->next;
-                list_del(&conn->krx_item);
-                PORTAL_FREE(conn, sizeof(*conn));
-        }
-
-        CDEBUG(D_MALLOC,"done kmem %d\n",atomic_read(&portal_kmemory));
-        CDEBUG(D_TRACE,"IBNAL-Done kibnal_finalize\n");
-
-        return;
-}
-
-
-//
-// * k_server_thread is a kernel thread 
-//   use a shared memory ro exchange HCA's data with a pthread in user 
-//   address space
-// * will be replaced when CM is used to handle communication management 
-//
-
-void k_server_thread(Remote_QP_Info *hca_data)
-{
-  int              segment_id;
-  const int        shared_segment_size = sizeof(Remote_QP_Info); 
-  key_t            key = HCA_EXCHANGE_SHM_KEY;
-  unsigned long    raddr;
-  int exchanged_done = NO;
-  int i;
-
-  Remote_QP_Info  *exchange_hca_data;
-
-  long *n;
-  long *uaddr;
-  long ret = 0;
-  // create a shared memory with pre-agreement key
-  segment_id =  sys_shmget(key,
-                           shared_segment_size,
-                           IPC_CREAT | 0666);
-
-
-  // attached to shared memoru 
-  // raddr is pointed to an user address space 
-  // use this address to update shared menory content 
-  ret = sys_shmat(segment_id, 0 , SHM_RND, &raddr);
-
-#ifdef IBNAL_DEBUG 
-  if(ret >= 0) {
-    CDEBUG(D_NET,"k_server_thread: Shared memory attach success ret = 0X%d,&raddr"
-                   " 0X%x (*(&raddr))=0x%x \n", ret, &raddr,  (*(&raddr)));
-    printk("k_server_thread: Shared memory attach success ret = 0X%d, &raddr"
-                   " 0X%x (*(&raddr))=0x%x \n", ret, &raddr,  (*(&raddr)));
-  }
-  else {
-    CERROR("k_server_thread: Shared memory attach failed ret = 0x%d \n", ret); 
-    printk("k_server_thread: Shared memory attach failed ret = 0x%d \n", ret); 
-    return;
-  }
-#endif
-
-  n = &raddr;
-  uaddr = *n; // get the U-address 
-  /* cast uaddr to exchange_hca_data */
-  exchange_hca_data = (Remote_QP_Info  *) uaddr; 
-  
-  /* copy data from local HCA to shared memory */
-  exchange_hca_data->opcode  = hca_data->opcode;
-  exchange_hca_data->length  = hca_data->length;
-
-  for(i=0; i < NUM_QPS; i++) {
-    exchange_hca_data->dlid[i]    = hca_data->dlid[i];
-    exchange_hca_data->rqp_num[i] = hca_data->rqp_num[i];
-  }
-
-  // periodically check shared memory until get updated 
-  // remote HCA's data from user mode pthread  
-  while(exchanged_done == NO) {
-    if(exchange_hca_data->opcode == RECV_QP_INFO){
-       exchanged_done = YES;
-       /* copy data to local buffer from shared memory */
-       hca_data->opcode  = exchange_hca_data->opcode;
-       hca_data->length  = exchange_hca_data->length;
-
-       for(i=0; i < NUM_QPS; i++) {
-         hca_data->dlid[i]    = exchange_hca_data->dlid[i];
-         hca_data->rqp_num[i] = exchange_hca_data->rqp_num[i];
-       }
-       break;
-    }
-    else { 
-       schedule_timeout(1000);
-    }
-  }
-  
-  // detached shared memory 
-  sys_shmdt(uaddr);
-
-  CDEBUG(D_NET, "Exit from kernel thread: k_server_thread \n");
-  printk("Exit from kernel thread: k_server_thread \n");
-
-  return;
-
-}
-
-//
-// create QP 
-// 
-VAPI_ret_t 
-create_qp(QP_info *qp, int qp_index)
-{
-
-  VAPI_ret_t          vstat;
-  VAPI_qp_init_attr_t qp_init_attr;
-  VAPI_qp_prop_t      qp_prop;
-
-  qp->hca_hndl = Hca_hndl;
-  qp->port     = 1; // default 
-  qp->slid     = Hca_port_1_props.lid;
-  qp->hca_port = Hca_port_1_props;
-
-
-  /* Queue Pair Creation Attributes */
-  qp_init_attr.cap.max_oust_wr_rq = NUM_WQE;
-  qp_init_attr.cap.max_oust_wr_sq = NUM_WQE;
-  qp_init_attr.cap.max_sg_size_rq = NUM_SG;
-  qp_init_attr.cap.max_sg_size_sq = NUM_SG;
-  qp_init_attr.pd_hndl            = qp->pd_hndl;
-  qp_init_attr.rdd_hndl           = 0;
-  qp_init_attr.rq_cq_hndl         = qp->rq_cq_hndl;
-  /* we use here polling */
-  //qp_init_attr.rq_sig_type        = VAPI_SIGNAL_REQ_WR;
-  qp_init_attr.rq_sig_type        = VAPI_SIGNAL_ALL_WR;
-  qp_init_attr.sq_cq_hndl         = qp->sq_cq_hndl;
-  /* we use here polling */
-  //qp_init_attr.sq_sig_type        = VAPI_SIGNAL_REQ_WR;
-  qp_init_attr.sq_sig_type        = VAPI_SIGNAL_ALL_WR;
-  // transport servce - reliable connection
-
-  qp_init_attr.ts_type            = VAPI_TS_RC;
-          
-  vstat = VAPI_create_qp(qp->hca_hndl,   
-                         &qp_init_attr,      
-                         &qp->qp_hndl, &qp_prop); 
-
-  if (vstat != VAPI_OK) {
-     CERROR("Failed creating QP. Return Failed - %s\n",VAPI_strerror(vstat));
-     return vstat;
-  }
-  
-  qp->qp_num = qp_prop.qp_num; // the qp number 
-  qp->last_posted_send_id  = 0; // user defined work request ID
-  qp->last_posted_rcv_id   = 0; // user defined work request ID
-  qp->cur_send_outstanding = 0;
-  qp->cur_posted_rcv_bufs  = 0;
-  qp->snd_rcv_balance      = 0;
-  
-  CDEBUG(D_OTHER, "create_qp: qp_num = %d, slid = %d, qp_hndl = 0X%X", 
-                  qp->qp_num, qp->slid, qp->qp_hndl);
-
-  // initialize spin-lock mutex variables
-  spin_lock_init(&(qp->snd_mutex));
-  spin_lock_init(&(qp->rcv_mutex));
-  spin_lock_init(&(qp->bl_mutex));
-  spin_lock_init(&(qp->cln_mutex));
-  // number of outstanding requests on the send Q
-  qp->cur_send_outstanding = 0; 
-  // number of posted receive buffers
-  qp->cur_posted_rcv_bufs  = 0;  
-  qp->snd_rcv_balance      = 0;
-
-  return(VAPI_OK);
-
-}
-
-//
-// initialize a UD qp state to RTR and RTS 
-//
-VAPI_ret_t 
-init_qp_UD(QP_info *qp, int qp_index)
-{
-  VAPI_qp_attr_t      qp_attr;
-  VAPI_qp_init_attr_t qp_init_attr;
-  VAPI_qp_attr_mask_t qp_attr_mask;
-  VAPI_qp_cap_t       qp_cap;
-  VAPI_ret_t       vstat;
-
-  /* Move from RST to INIT */
-  /* Change QP to INIT */
-
-  CDEBUG(D_OTHER, "Changing QP state to INIT qp-index = %d\n", qp_index);
-
-  QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
-
-  qp_attr.qp_state = VAPI_INIT;
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
-
-  CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
-
-  qp_attr.pkey_ix  = 0;
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PKEY_IX);
-
-  CDEBUG(D_OTHER, "pkey_ix qp_attr_mask = 0X%x\n", qp_attr_mask);
-
-  qp_attr.port     = qp->port;
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PORT);
-
-  CDEBUG(D_OTHER, "port qp_attr_mask = 0X%x\n", qp_attr_mask);
-
-  qp_attr.qkey = 0;
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QKEY);
-
-  CDEBUG(D_OTHER, "qkey qp_attr_mask = 0X%x\n", qp_attr_mask);
-
-  /* If I do not set this mask, I get an error from HH. QPM should catch it */
-
-  vstat = VAPI_modify_qp(qp->hca_hndl,
-                         qp->qp_hndl,
-                         &qp_attr,
-                         &qp_attr_mask,
-                         &qp_cap);
-
-  if (vstat != VAPI_OK) {
-     CERROR("Failed modifying QP from RST to INIT. %s\n",VAPI_strerror(vstat));
-     return(vstat);
-  }
-
-  CDEBUG(D_OTHER, "Modifying QP from RST to INIT.\n");
-
-  vstat= VAPI_query_qp(qp->hca_hndl,
-                       qp->qp_hndl,
-                       &qp_attr,
-                       &qp_attr_mask,
-                       &qp_init_attr);
-
-  if (vstat != VAPI_OK) {
-     CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
-     return(vstat);
-  }
-
-  /* Move from INIT to RTR */
-  /* Change QP to RTR */
-  CDEBUG(D_OTHER, "Changing QP state to RTR\n");
-
-  QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
-
-  qp_attr.qp_state         = VAPI_RTR;  
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
-
-  CDEBUG(D_OTHER, "INIT to RTR- qp_state : qp_attr_mask = 0X%x\n", qp_attr_mask);
-
-  vstat = VAPI_modify_qp(qp->hca_hndl,
-                         qp->qp_hndl,
-                         &qp_attr,
-                         &qp_attr_mask,
-                         &qp_cap);
-
-  if (vstat != VAPI_OK) {
-     CERROR("Failed modifying QP from INIT to RTR. %s\n",VAPI_strerror(vstat));
-     return(vstat);
-  }
-  
-  CDEBUG(D_OTHER, "Modifying QP from INIT to RTR.\n");
-  
-  vstat= VAPI_query_qp(qp->hca_hndl,
-                       qp->qp_hndl,
-                       &qp_attr,
-                       &qp_attr_mask,
-                       &qp_init_attr);
-
-  if (vstat != VAPI_OK) {
-     CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
-     return(vstat);
-  }
-                                      
-  /* RTR to RTS - Change QP to RTS */
-  CDEBUG(D_OTHER, "Changing QP state to RTS\n");
-
-  QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
-
-  qp_attr.qp_state        = VAPI_RTS;   
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
-  
-  qp_attr.sq_psn          = START_SQ_PSN;          
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_SQ_PSN);
-  
-  vstat = VAPI_modify_qp(qp->hca_hndl,
-                         qp->qp_hndl,
-                         &qp_attr,
-                         &qp_attr_mask,
-                         &qp_cap);
-
-  if (vstat != VAPI_OK) {
-     CERROR("Failed modifying QP from RTR to RTS. %s:%s\n",
-                          VAPI_strerror_sym(vstat), 
-                          VAPI_strerror(vstat));
-     return(vstat);
-  }
-
-  CDEBUG(D_OTHER, "Modifying QP from RTR to RTS. \n");
-                     
-  vstat= VAPI_query_qp(qp->hca_hndl,
-                       qp->qp_hndl,
-                       &qp_attr,
-                       &qp_attr_mask,
-                       &qp_init_attr);
-
-  if (vstat != VAPI_OK) {
-     CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
-     return(vstat);
-  }
-                        
-  //
-  // a QP is at RTS state NOW
-  //
-  CDEBUG(D_OTHER, "IBNAL- UD qp is at RTS NOW\n");
-  
-  return(vstat);
-
-}
-
-
-
-//
-// initialize a RC qp state to RTR and RTS 
-// RC transport service 
-//
-VAPI_ret_t 
-init_qp_RC(QP_info *qp, int qp_index)
-{
-  VAPI_qp_attr_t      qp_attr;
-  VAPI_qp_init_attr_t qp_init_attr;
-  VAPI_qp_attr_mask_t qp_attr_mask;
-  VAPI_qp_cap_t       qp_cap;
-  VAPI_ret_t       vstat;
-
-  /* Move from RST to INIT */
-  /* Change QP to INIT */
-  
-  CDEBUG(D_OTHER, "Changing QP state to INIT qp-index = %d\n", qp_index);
-
-  QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
-
-  qp_attr.qp_state = VAPI_INIT;
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
-
-   CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
-
-  qp_attr.pkey_ix  = 0;
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PKEY_IX);
-
-  CDEBUG(D_OTHER, "pkey_ix qp_attr_mask = 0X%x\n", qp_attr_mask);
-
-  qp_attr.port     = qp->port;
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PORT);
-
-  CDEBUG(D_OTHER, "port qp_attr_mask = 0X%x\n", qp_attr_mask);
-
-  qp_attr.remote_atomic_flags = VAPI_EN_REM_WRITE | VAPI_EN_REM_READ;
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_REMOTE_ATOMIC_FLAGS);
-
-  CDEBUG(D_OTHER, "remote_atomic_flags qp_attr_mask = 0X%x\n", qp_attr_mask);
-
-  /* If I do not set this mask, I get an error from HH. QPM should catch it */
-
-  vstat = VAPI_modify_qp(qp->hca_hndl,
-                         qp->qp_hndl,
-                         &qp_attr,
-                         &qp_attr_mask,
-                         &qp_cap);
-
-  if (vstat != VAPI_OK) {
-     CERROR("Failed modifying QP from RST to INIT. %s\n",VAPI_strerror(vstat));
-     return(vstat);
-  }
-
-  vstat= VAPI_query_qp(qp->hca_hndl,
-                       qp->qp_hndl,
-                       &qp_attr,
-                       &qp_attr_mask,
-                       &qp_init_attr);
-
-  if (vstat != VAPI_OK) {
-     CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
-     return(vstat);
-  }
-
-  /* Move from INIT to RTR */
-  /* Change QP to RTR */
-  CDEBUG(D_OTHER, "Changing QP state to RTR qp_indexi %d\n", qp_index);
-
-  QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
-  qp_attr.qp_state         = VAPI_RTR;  
-
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
-
-  CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
-
-  qp_attr.av.sl            = 0;/* RESPONDER_SL */
-  qp_attr.av.grh_flag      = FALSE;
-  qp_attr.av.dlid          = qp->dlid;/*RESPONDER_LID;*/
-  qp_attr.av.static_rate   = 0;
-  qp_attr.av.src_path_bits = 0;              
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_AV);
-
-  CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
-
-  qp_attr.path_mtu         = MTU_2048;// default is MTU_2048             
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PATH_MTU);
-
-  CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
-
-  qp_attr.rq_psn           = START_RQ_PSN;              
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_RQ_PSN);
-
-  CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
-
-  qp_attr.qp_ous_rd_atom   = NUM_WQE;        
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_OUS_RD_ATOM);
-
-  CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
-
-  qp_attr.pkey_ix          = 0;              
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PKEY_IX);
-
-  CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
-
-  qp_attr.min_rnr_timer    = 10;              
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_MIN_RNR_TIMER);
-
-  CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
-
-  qp_attr.dest_qp_num = qp->rqp_num;                   
-
-  CDEBUG(D_OTHER, "remore qp num %d\n",  qp->rqp_num);
-
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_DEST_QP_NUM);
-
-  CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
-
-  vstat = VAPI_modify_qp(qp->hca_hndl,
-                         qp->qp_hndl,
-                         &qp_attr,
-                         &qp_attr_mask,
-                         &qp_cap);
-
-
-  if (vstat != VAPI_OK) {
-     CERROR("Failed modifying QP from INIT to RTR. qp_index %d - %s\n",
-                                                qp_index, VAPI_strerror(vstat));
-     return(vstat);
-  }
-  
-  vstat= VAPI_query_qp(qp->hca_hndl,
-                       qp->qp_hndl,
-                       &qp_attr,
-                       &qp_attr_mask,
-                       &qp_init_attr);
-
-  if (vstat != VAPI_OK) {
-     CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
-     return(vstat);
-  }
-                                      
-  /* RTR to RTS - Change QP to RTS */
-  CDEBUG(D_OTHER, "Changing QP state to RTS\n");
-
-  QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
-
-  qp_attr.qp_state        = VAPI_RTS;   
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
-
-  qp_attr.sq_psn          = START_SQ_PSN;          
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_SQ_PSN);
-
-  qp_attr.timeout         = 0x18;         
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_TIMEOUT);
-
-  qp_attr.retry_count     = 10;         
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_RETRY_COUNT);
-
-  qp_attr.rnr_retry       = 14;         
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_RNR_RETRY);
-
-  qp_attr.ous_dst_rd_atom = 100;        
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_OUS_DST_RD_ATOM);
-
-  qp_attr.min_rnr_timer   = 5;          
-  QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_MIN_RNR_TIMER);
-
-  vstat = VAPI_modify_qp(qp->hca_hndl,
-                         qp->qp_hndl,
-                         &qp_attr,
-                         &qp_attr_mask,
-                         &qp_cap);
-
-  if (vstat != VAPI_OK) {
-     CERROR("Failed modifying QP from RTR to RTS. %s:%s\n",
-                   VAPI_strerror_sym(vstat), VAPI_strerror(vstat));
-     return(vstat);
-  }
-
-  vstat= VAPI_query_qp(qp->hca_hndl,
-                       qp->qp_hndl,
-                       &qp_attr,
-                       &qp_attr_mask,
-                       &qp_init_attr);
-
-  if (vstat != VAPI_OK) {
-     CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
-     return(vstat);
-  }
-                        
-  //
-  // a QP is at RTS state NOW
-  //
-   CDEBUG(D_OTHER, "IBNAL- RC qp is at RTS NOW\n");
-  
-  return(vstat);
-}
-
-
-
-VAPI_ret_t 
-IB_Open_HCA(kibnal_data_t *kib_data)
-{
-
-  VAPI_ret_t     vstat;
-  VAPI_cqe_num_t cqe_active_num;
-  QP_info        *qp; 
-  int            i;
-  int            Num_posted_recv_buf;
-
-  /* Open HCA */
-  CDEBUG(D_PORTALS, "Opening an HCA\n");
-
-  vstat = VAPI_open_hca(HCA_ID, &Hca_hndl);
-  vstat = EVAPI_get_hca_hndl(HCA_ID, &Hca_hndl);
-  if (vstat != VAPI_OK) {
-     CERROR("Failed opening the HCA: %s. %s...\n",HCA_ID,VAPI_strerror(vstat));
-     return(vstat);
-  } 
-
-  /* Get HCA CAP */
-  vstat = VAPI_query_hca_cap(Hca_hndl, &Hca_vendor, &Hca_cap);
-  if (vstat != VAPI_OK) {
-     CERROR("Failed query hca cap %s\n",VAPI_strerror(vstat));
-     return(vstat);
-  }
-
-  /* Get port 1 info */
-  vstat = VAPI_query_hca_port_prop(Hca_hndl, HCA_PORT_1 , &Hca_port_1_props);
-  if (vstat != VAPI_OK) {
-     CERROR("Failed query port cap %s\n",VAPI_strerror(vstat));
-     return(vstat);
-  }      
-
-  /* Get port 2 info */
-  vstat = VAPI_query_hca_port_prop(Hca_hndl, HCA_PORT_2, &Hca_port_2_props);
-  if (vstat != VAPI_OK) {
-     CERROR("Failed query port cap %s\n",VAPI_strerror(vstat));
-     return(vstat);
-  }      
-
-  // Get a PD 
-  CDEBUG(D_PORTALS, "Allocating PD \n");
-  vstat = VAPI_alloc_pd(Hca_hndl,&Pd_hndl);
-  if (vstat != VAPI_OK) {
-     CERROR("Failed allocating a PD. %s\n",VAPI_strerror(vstat));
-     return(vstat);
-  }
-
-  vstat = createMemRegion(Hca_hndl, Pd_hndl);
-  if (vstat != VAPI_OK) {
-     CERROR("Failed registering a memory region.%s\n",VAPI_strerror(vstat));
-     return(vstat);
-  }
-
-  /* Create CQ for RQ*/
-  CDEBUG(D_PORTALS, "Creating a send completion queue\n");
-
-  vstat = VAPI_create_cq(Hca_hndl,    
-                         NUM_CQE,    
-                         &Cq_hndl, 
-                         &cqe_active_num);
-
-  if (vstat != VAPI_OK) {
-     CERROR("Failed creating a CQ. %s\n",VAPI_strerror(vstat));
-     return(vstat);
-  }
-
-  if(NUM_CQE == cqe_active_num) {
-    CERROR("VAPI_create_cq: NUM_CQE EQ cqe_active_num \n");
-  }
-  else {
-    CDEBUG(D_NET, "VAPI_create_cq: NUM_CQE %d , actual cqe_active_num %d \n",
-                   NUM_CQE, cqe_active_num);
-  }
-
-  Cq_SQ_hndl     = Cq_hndl;
-  Cq_RQ_hndl     = Cq_hndl;
-
-  //
-  // create  QPs 
-  //
-  for(i=0; i < NUM_QPS; i++) {
-      QP_list[i].pd_hndl    = Pd_hndl;
-      QP_list[i].hca_hndl   = Hca_hndl;
-      // sq rq use the same Cq_hndl 
-      QP_list[i].sq_cq_hndl = Cq_hndl; 
-      QP_list[i].rq_cq_hndl = Cq_hndl;
-      vstat = create_qp(&QP_list[i], i);
-      if (vstat != VAPI_OK) {
-         CERROR("Failed creating a QP %d %s\n",i, VAPI_strerror(vstat));
-         return(vstat);
-      }
-  }      
-
-  //
-  // record HCA data 
-  //
-
-  Hca_data.hca_hndl     = Hca_hndl;      // HCA handle
-  Hca_data.pd_hndl      = Pd_hndl;       // protection domain
-  Hca_data.port         = 1;             // port number
-  Hca_data.num_qp       = NUM_QPS;        // number of qp used
-
-  for(i=0; i < NUM_QPS; i++) {
-    Hca_data.qp_ptr[i]    = &QP_list[i];   // point to QP_list
-  }
-
-  Hca_data.num_cq       = NUM_CQ;        // number of cq used
-  Hca_data.cq_hndl      = Cq_hndl;       // 
-  Hca_data.sq_cq_hndl   = Cq_SQ_hndl;    // 
-  Hca_data.rq_cq_hndl   = Cq_RQ_hndl;    // 
-  Hca_data.kib_data     = kib_data;       //
-  Hca_data.slid         = QP_list[0].slid;//
-
-  // prepare L_QP_data
-
-#ifdef USE_SHARED_MEMORY_AND_SOCKET
-
-  /*
-   *  + use a shared-memory between a user thread and a kernel thread 
-   *    for HCA's data exchange on the same node  
-   *  + use socket in user mode to exhange HCA's data with a remote node 
-   */
-
-  
-  R_QP_data.opcode  = SEND_QP_INFO;
-  R_QP_data.length  = sizeof(L_QP_data);
-
-  for(i=0; i < NUM_QPS; i++) {
-    // my slid  will be used in a remote node as dlid 
-    R_QP_data.dlid[i]    = QP_list[i].slid;
-    // my qp_num will be used in remode node as remote_qp_number 
-    // RC is used here so we need dlid and rqp_num  
-    R_QP_data.rqp_num[i] = QP_list[i].qp_num ;
-  }
-
-  // create a kernel thread for exchanging HCA's data 
-  // R_QP_data will be exchanged with a remoe node
-
-  kernel_thread(k_server_thread, &R_QP_data, 0); // 
-  // check if the HCA'data have been updated by kernel_thread 
-  // loop until the HCA's data is updated 
-  // make sure that uagent is running 
-  
-  // QP info is exchanged with a remote node   
-  while (1) {
-    schedule_timeout(1000);
-    if(R_QP_data.opcode ==  RECV_QP_INFO) {
-       CDEBUG(D_NET, "HCA's data is being updated\n");
-       break;
-   }
-  }
-#endif
-
-#ifdef USE_SHARED_MEMORY_AND_MULTICAST
-
-  /*
-   *  + use a shared-memory between a user thread and a kernel thread 
-   *    for HCA's data exchange on the same node  
-   *  + use Infinoband UR/multicast in user mode to exhange HCA's data with i
-   *    a remote node 
-   */
-
-  // use CM, opemSM 
-  
-#endif
-
-  // 
-  for(i=0; i < NUM_QPS; i++) {
-     qp = (QP_info *) &QP_list[i];
-     QP_list[i].rqp_num = R_QP_data.rqp_num[i]; // remoter qp number 
-     QP_list[i].dlid    = R_QP_data.dlid[i];    // remote dlid 
-  }
-
-  // already have remote_qp_num adn dlid information
-  // initialize QP to RTR/RTS state 
-  //
-  for(i=0; i < NUM_QPS; i++) {
-    vstat = init_qp_RC(&QP_list[i], i);
-    if (vstat != VAPI_OK) {
-       CERROR("Failed change a QP %d to RTS state%s\n",
-                    i,VAPI_strerror(vstat));
-       return(vstat);
-    }
-  }
-
-  // post receiving buffer before any send happened 
-  
-  Num_posted_recv_buf = post_recv_bufs( (VAPI_wr_id_t ) START_RECV_WRQ_ID); 
-
-  // for irregular completion event or some unexpected failure event 
-  vstat = IB_Set_Async_Event_Handler(Hca_data, &kibnal_data);
-  if (vstat != VAPI_OK) {
-     CERROR("IB_Set_Async_Event_Handler failed: %d\n", vstat);
-     return vstat;
-  }
-
-
-  CDEBUG(D_PORTALS, "IBNAL- done with IB_Open_HCA\n");
-
-  for(i=0;  i < NUM_MBUF; i++) {
-    spin_lock_init(&MSB_mutex[i]);
-  }
-
-  return(VAPI_OK);
-
-}
-
-
-/* 
-  Function:  IB_Set_Event_Handler()
-             
-             IN   Hca_info hca_data
-             IN   kibnal_data_t *kib_data  -- private data      
-             OUT  NONE
-
-        return: VAPI_OK - success
-                else    - fail 
-
-*/
-
-VAPI_ret_t 
-IB_Set_Event_Handler(HCA_info hca_data, kibnal_data_t *kib_data)
-{
-  VAPI_ret_t vstat;
-  EVAPI_compl_handler_hndl_t   comp_handler_hndl;
-
-  // register CQE_Event_Hnadler 
-  // VAPI function 
-  vstat = VAPI_set_comp_event_handler(hca_data.hca_hndl,
-                                      CQE_event_handler,
-                                      &hca_data);
-
-  /*
-  or use extended VAPI function 
-  vstat = EVAPI_set_comp_eventh(hca_data.hca_hndl,
-                                hca_data.cq_hndl,
-                                CQE_event_handler,
-                                &hca_data,
-                                &comp_handler_hndl
-                                );
-  */
-                                    
-  if (vstat != VAPI_OK) {
-      CERROR("IB_Set_Event_Handler: failed EVAPI_set_comp_eventh for"
-             " HCA ID = %s (%s).\n", HCA_ID, VAPI_strerror(vstat));
-      return vstat;
-  }
-
-  // issue a request for completion ievent notification 
-  vstat = VAPI_req_comp_notif(hca_data.hca_hndl, 
-                              hca_data.cq_hndl,
-                              VAPI_NEXT_COMP); 
-
-  if (vstat != VAPI_OK) {
-      CERROR("IB_Set_Event_Handler: failed VAPI_req_comp_notif for HCA ID"
-             " = %s (%s).\n", HCA_ID, VAPI_strerror(vstat));
-  }
-
-  return vstat;
-}
-
-
-
-/* 
-  Function:  IB_Set_Async_Event_Handler()
-             
-             IN   HCA_info hca_data
-             IN   kibnal_data_t *kib_data -- private data      
-             OUT  NONE
-
-        return: VAPI_OK - success
-                else    - fail 
-
-*/
-
-
-VAPI_ret_t 
-IB_Set_Async_Event_Handler(HCA_info hca_data, kibnal_data_t *kib_data)
-{
-  VAPI_ret_t    vstat;
-
-  //
-  // register an asynchronous event handler for this HCA 
-  //
-
-  vstat= VAPI_set_async_event_handler(hca_data.hca_hndl,
-                                      async_event_handler, 
-                                      kib_data);
-
-  if (vstat != VAPI_OK) {
-      CERROR("IB_Set_Async_Event_Handler: failed VAPI_set_async_comp_event_handler"
-             " for HCA ID = %s (%s).\n", HCA_ID, VAPI_strerror(vstat));
-  }
-
-  return vstat;
-}
-
-//
-// IB_Close_HCA
-// close this Infiniband HCA interface 
-// release allocated resources to system 
-//
-VAPI_ret_t 
-IB_Close_HCA(void )
-{
-        
-  VAPI_ret_t  vstat;
-  int         ok = 1;
-  int         i;
-            
-  /* Destroy QP */
-  CDEBUG(D_PORTALS, "Destroying QP\n");
-
-  for(i=0; i < NUM_QPS; i++) {
-     vstat = VAPI_destroy_qp(QP_list[i].hca_hndl, QP_list[i].qp_hndl);
-     if (vstat != VAPI_OK) {
-        CERROR("Failed destroying QP %d. %s\n", i, VAPI_strerror(vstat));
-        ok = 0;
-     }
-  }
-
-  if (ok) {
-     /* Destroy CQ */
-     CDEBUG(D_PORTALS, "Destroying CQ\n");
-     for(i=0; i < NUM_QPS; i++) {
-        // send_cq adn receive_cq are shared the same CQ
-        // so only destroy one of them 
-        vstat = VAPI_destroy_cq(QP_list[i].hca_hndl, QP_list[i].sq_cq_hndl);
-        if (vstat != VAPI_OK) {
-           CERROR("Failed destroying CQ %d. %s\n", i, VAPI_strerror(vstat));
-           ok = 0;
-        }
-     }
-  }
-
-  if (ok) {
-     /* Destroy Memory Region */
-     CDEBUG(D_PORTALS, "Deregistering MR\n");
-     for(i=0; i < NUM_QPS; i++) {
-        vstat = deleteMemRegion(&QP_list[i], i);
-        if (vstat != VAPI_OK) {
-           CERROR("Failed deregister mem reg %d. %s\n",i, VAPI_strerror(vstat));
-           ok = 0;
-           break;
-        }
-     }
-  }
-
-  if (ok) {
-     // finally 
-     /* Close HCA */
-     CDEBUG(D_PORTALS, "Closing HCA\n");
-     vstat = VAPI_close_hca(Hca_hndl);
-     if (vstat != VAPI_OK) {
-        CERROR("Failed to close HCA. %s\n", VAPI_strerror(vstat));
-        ok = 0;
-     }
-  }
-
-  CDEBUG(D_PORTALS, "IBNAL- Done with closing HCA \n");
-  
-  return vstat; 
-}
-
-
-VAPI_ret_t 
-createMemRegion(VAPI_hca_hndl_t hca_hndl, 
-                   VAPI_pd_hndl_t  pd_hndl) 
-{
-  VAPI_ret_t  vstat;
-  VAPI_mrw_t  mrw;
-  VAPI_mrw_t  rep_mr;   
-  VAPI_mr_hndl_t   rep_mr_hndl;
-  int         buf_size;
-  char        *bufptr;
-  int         i;
-
-  // send registered memory region 
-  for(i=0; i < NUM_ENTRY; i++) {
-    MSbuf_list[i].buf_size = KB_32; 
-    PORTAL_ALLOC(bufptr, MSbuf_list[i].buf_size);
-    if(bufptr == NULL) {
-       CDEBUG(D_MALLOC,"Failed to malloc a block of send memory, qix %d size %d\n",
-                                          i, MSbuf_list[i].buf_size);
-       CERROR("Failed to malloc a block of send memory, qix %d size %d\n",
-                                          i, MSbuf_list[i].buf_size);
-       return(VAPI_ENOMEM);
-    }
-
-    mrw.type   = VAPI_MR; 
-    mrw.pd_hndl= pd_hndl;
-    mrw.start  = MSbuf_list[i].buf_addr = (VAPI_virt_addr_t)(MT_virt_addr_t) bufptr;
-    mrw.size   = MSbuf_list[i].buf_size;
-    mrw.acl    = VAPI_EN_LOCAL_WRITE  | 
-                 VAPI_EN_REMOTE_WRITE | 
-                 VAPI_EN_REMOTE_READ;
-
-    // register send memory region  
-    vstat = VAPI_register_mr(hca_hndl, 
-                             &mrw, 
-                             &rep_mr_hndl, 
-                             &rep_mr);
-
-    // this memory region is going to be reused until deregister is called 
-    if(vstat != VAPI_OK) {
-       CERROR("Failed registering a mem region qix %d Addr=%p, Len=%d. %s\n",
-                          i, mrw.start, mrw.size, VAPI_strerror(vstat));
-       return(vstat);
-    }
-
-    MSbuf_list[i].mr        = rep_mr;
-    MSbuf_list[i].mr_hndl   = rep_mr_hndl;
-    MSbuf_list[i].bufptr    = bufptr;
-    MSbuf_list[i].buf_addr  = rep_mr.start;
-    MSbuf_list[i].status    = BUF_REGISTERED;
-    MSbuf_list[i].ref_count = 0;
-    MSbuf_list[i].buf_type  = REG_BUF;
-    MSbuf_list[i].raddr     = 0x0;
-    MSbuf_list[i].rkey      = 0x0;
-  }
-
-  // RDAM buffer is not reserved for RDAM WRITE/READ
-  
-  for(i=NUM_ENTRY; i< NUM_MBUF; i++) {
-    MSbuf_list[i].status    = BUF_UNREGISTERED;
-    MSbuf_list[i].buf_type  = RDMA_BUF;
-  }
-
-
-  // recv registered memory region 
-  for(i=0; i < NUM_ENTRY; i++) {
-    MRbuf_list[i].buf_size = KB_32; 
-    PORTAL_ALLOC(bufptr, MRbuf_list[i].buf_size);
-
-    if(bufptr == NULL) {
-       CDEBUG(D_MALLOC, "Failed to malloc a block of send memory, qix %d size %d\n",
-                      i, MRbuf_list[i].buf_size);
-       return(VAPI_ENOMEM);
-    }
-
-    mrw.type   = VAPI_MR; 
-    mrw.pd_hndl= pd_hndl;
-    mrw.start  = (VAPI_virt_addr_t)(MT_virt_addr_t) bufptr;
-    mrw.size   = MRbuf_list[i].buf_size;
-    mrw.acl    = VAPI_EN_LOCAL_WRITE  | 
-                 VAPI_EN_REMOTE_WRITE | 
-                 VAPI_EN_REMOTE_READ;
-
-    // register send memory region  
-    vstat = VAPI_register_mr(hca_hndl, 
-                             &mrw, 
-                             &rep_mr_hndl, 
-                             &rep_mr);
-
-    // this memory region is going to be reused until deregister is called 
-    if(vstat != VAPI_OK) {
-       CERROR("Failed registering a mem region qix %d Addr=%p, Len=%d. %s\n",
-                          i, mrw.start, mrw.size, VAPI_strerror(vstat));
-       return(vstat);
-    }
-
-    MRbuf_list[i].mr        = rep_mr;
-    MRbuf_list[i].mr_hndl   = rep_mr_hndl;
-    MRbuf_list[i].bufptr    = bufptr;
-    MRbuf_list[i].buf_addr  = rep_mr.start;
-    MRbuf_list[i].status    = BUF_REGISTERED;
-    MRbuf_list[i].ref_count = 0;
-    MRbuf_list[i].buf_type  = REG_BUF;
-    MRbuf_list[i].raddr     = 0x0;
-    MRbuf_list[i].rkey      = rep_mr.r_key;
-    MRbuf_list[i].lkey      = rep_mr.l_key;
-  
-  }
-  // keep extra information for a qp 
-  for(i=0; i < NUM_QPS; i++) {
-    QP_list[i].mr_hndl    = MSbuf_list[i].mr_hndl; 
-    QP_list[i].mr         = MSbuf_list[i].mr;
-    QP_list[i].bufptr     = MSbuf_list[i].bufptr;
-    QP_list[i].buf_addr   = MSbuf_list[i].buf_addr;
-    QP_list[i].buf_size   = MSbuf_list[i].buf_size;
-    QP_list[i].raddr      = MSbuf_list[i].raddr;
-    QP_list[i].rkey       = MSbuf_list[i].rkey;
-    QP_list[i].lkey       = MSbuf_list[i].lkey;
-  }
-
-  CDEBUG(D_PORTALS, "IBNAL- done VAPI_ret_t createMemRegion \n");
-
-  return vstat;
-
-} /* createMemRegion */
-
-
-
-VAPI_ret_t  
-deleteMemRegion(QP_info *qp, int qix)
-{
-  VAPI_ret_t  vstat;
-
-  //
-  // free send memory assocaited with this memory region  
-  //
-  PORTAL_FREE(MSbuf_list[qix].bufptr, MSbuf_list[qix].buf_size);
-
-  // de-register it 
-  vstat =  VAPI_deregister_mr(qp->hca_hndl, MSbuf_list[qix].mr_hndl);
-
-  if(vstat != VAPI_OK) {
-     CERROR("Failed deregistering a send mem region qix %d %s\n",
-                         qix, VAPI_strerror(vstat));
-     return vstat;
-  }
-
-  //
-  // free recv memory assocaited with this memory region  
-  //
-  PORTAL_FREE(MRbuf_list[qix].bufptr, MRbuf_list[qix].buf_size);
-
-  // de-register it 
-  vstat =  VAPI_deregister_mr(qp->hca_hndl, MRbuf_list[qix].mr_hndl);
-
-  if(vstat != VAPI_OK) {
-     CERROR("Failed deregistering a recv mem region qix %d %s\n",
-                         qix, VAPI_strerror(vstat));
-     return vstat;
-  }
-
-  return vstat;
-}
-
-
-//
-// polling based event handling 
-// + a daemon process
-// + poll the CQ and check what is in the CQ 
-// + process incoming CQ event
-// + 
-//
-
-
-RDMA_Info_Exchange   Rdma_info;
-int                  Cts_Message_arrived = NO;
-
-void k_recv_thread(HCA_info *hca_data)
-{
- VAPI_ret_t       vstat; 
- VAPI_wc_desc_t   comp_desc;   
- unsigned long    polling_count = 0;
- u_int32_t        timeout_usec;
- unsigned int     priority = 100;
- unsigned int     length;
- VAPI_wr_id_t     wrq_id;
- u_int32_t        transferred_data_length; /* Num. of bytes transferred */
- void             *bufdata;
- VAPI_virt_addr_t bufaddr;
- unsigned long    buf_size = 0;
- QP_info          *qp;       // point to QP_list
-
- kportal_daemonize("k_recv_thread"); // make it as a daemon process 
-
- // tuning variable 
- timeout_usec = 100; // how is the impact on the performance
-
- // send Q and receive Q are using the same CQ 
- // so only poll one CQ for both operations 
- CDEBUG(D_NET, "IBNAL- enter kibnal_recv_thread\n");
- CDEBUG(D_NET, "hca_hndl = 0X%x, cq_hndl=0X%x\n", 
-                         hca_data->hca_hndl,hca_data->cq_hndl); 
-
- qp = hca_data->qp_ptr;
- if(qp == NULL) {
-   CDEBUG(D_NET, "in recv_thread qp is NULL\n");
-   CDEBUG(D_NET, "Exit from  recv_thread qp is NULL\n");
-   return; 
- }
- else {
-   CDEBUG(D_NET, "in recv_thread qp is 0X%X\n", qp);
- }
-
- CDEBUG(D_NET, "kibnal_recv_thread - enter event driver polling loop\n");
-
- //
- // use event driver 
- //
-
-
- while(1) {
-    polling_count++;
-
-    //
-    // send Q and receive Q are using the same CQ 
-    // so only poll one CQ for both operations 
-    //
-
-    vstat = VAPI_poll_cq(hca_data->hca_hndl,hca_data->cq_hndl, &comp_desc);                      
-
-    if (vstat == VAPI_CQ_EMPTY) { 
-      // there is no event in CQE 
-      continue;
-    } 
-    else {
-      if (vstat != (VAPI_OK)) {
-        CERROR("error while polling completion queuei vstat %d \n", vstat);
-        return; 
-      }
-    }
-
-    // process the complete event 
-    switch(comp_desc.opcode) {
-      case   VAPI_CQE_SQ_SEND_DATA:
-        // about the Send Q ,POST SEND completion 
-        // who needs this information
-        // get wrq_id
-        // mark MSbuf_list[wr_id].status = BUF_REGISTERED 
-               
-        wrq_id = comp_desc.id;
-
-        if(RDMA_OP_ID < wrq_id) {
-          // this RDMA message id, adjust it to the right entry       
-          wrq_id = wrq_id - RDMA_OP_ID;
-          vstat = VAPI_deregister_mr(qp->hca_hndl, Local_rdma_info.send_rdma_mr_hndl);
-        }
-        
-        if(vstat != VAPI_OK) {
-            CERROR("VAPI_CQE_SQ_SEND_DATA: Failed deregistering a RDMAi recv"                   " mem region %s\n", VAPI_strerror(vstat));
-        }
-
-        if((RDMA_CTS_ID <= wrq_id) && (RDMA_OP_ID < wrq_id)) {
-          // RTS or CTS send complete, release send buffer 
-          if(wrq_id >= RDMA_RTS_ID)
-            wrq_id = wrq_id - RDMA_RTS_ID;
-          else 
-            wrq_id = wrq_id - RDMA_CTS_ID;
-        }
-
-        spin_lock(&MSB_mutex[(int) wrq_id]);
-        MRbuf_list[wrq_id].status = BUF_REGISTERED; 
-        spin_unlock(&MSB_mutex[(int) wrq_id]);
-
-        CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_SEND_DATA\n");  
-        break;
-
-      case   VAPI_CQE_SQ_RDMA_WRITE:
-        // about the Send Q,  RDMA write completion 
-        // who needs this information
-        // data is successfully write from pource to  destionation 
-             
-        //  get wr_id
-        //  mark MSbuf_list[wr_id].status = BUF_REGISTERED 
-        //  de-register  rdma buffer 
-        //
-             
-        CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_RDMA_WRITE\n");  
-        break;
-
-      case   VAPI_CQE_SQ_RDMA_READ:
-        // about the Send Q
-        // RDMA read completion 
-        // who needs this information
-        // data is successfully read from destionation to source 
-        CDEBUG(D_NET, "CQE opcode- VAPI_CQE_SQ_RDMA_READ\n");  
-        break;
-
-      case   VAPI_CQE_SQ_COMP_SWAP:
-        // about the Send Q
-        // RDMA write completion 
-        // who needs this information
-             
-        CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_COMP_SWAP\n");  
-        break;
-
-      case   VAPI_CQE_SQ_FETCH_ADD:
-        // about the Send Q
-        // RDMA write completion 
-        // who needs this information
-             
-        CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_FETCH_ADD\n");  
-        break;
-
-      case   VAPI_CQE_SQ_BIND_MRW:
-        // about the Send Q
-        // RDMA write completion 
-        // who needs this information
-             
-        CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_BIND_MRW\n");  
-        break;
-
-      case   VAPI_CQE_RQ_SEND_DATA:
-        // about the Receive Q
-        // process the incoming data and
-        // forward it to .....
-        // a completion recevie event is arriving at CQ 
-        // issue a recevie to get this arriving data out from CQ 
-        // pass the receiving data for further processing 
-        CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_SEND_DATA\n");  
-        wrq_id = comp_desc.id ;
-        transferred_data_length = comp_desc.byte_len;
-             
-        if((wrq_id >= RDMA_CTS_ID) && (wrq_id < RDMA_OP_ID)) {
-          // this is RTS/CTS message 
-          // process it locally and don't pass it to portals layer 
-          // adjust wrq_id to get the right entry in MRbfu_list 
-                   
-          if(wrq_id >= RDMA_RTS_ID)
-            wrq_id = wrq_id - RDMA_RTS_ID;
-          else 
-            wrq_id = wrq_id - RDMA_CTS_ID;
-
-          bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) MRbuf_list[wrq_id].buf_addr; 
-          MRbuf_list[wrq_id].status = BUF_INUSE; 
-          memcpy(&Rdma_info, &bufaddr, sizeof(RDMA_Info_Exchange));    
-        
-          if(Ready_To_send == Rdma_info.opcode) 
-            // an RTS request message from remote node 
-            // prepare local RDMA buffer and send local rdma info to
-            // remote node 
-            CTS_handshaking_protocol(&Rdma_info);
-          else 
-            if((Clear_To_send == Rdma_info.opcode) && 
-                              (RDMA_BUFFER_RESERVED == Rdma_info.flag))
-               Cts_Message_arrived = YES;
-            else 
-              if(RDMA_BUFFER_UNAVAILABLE == Rdma_info.flag) 
-                  CERROR("RDMA operation abort-RDMA_BUFFER_UNAVAILABLE\n");
-        }
-        else {
-          //
-          // this is an incoming mesage for portals layer 
-          // move to PORTALS layer for further processing 
-          //
-                     
-          bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t)
-                                       MRbuf_list[wrq_id].buf_addr; 
-
-          MRbuf_list[wrq_id].status = BUF_INUSE; 
-          transferred_data_length = comp_desc.byte_len;
-
-          kibnal_rx(hca_data->kib_data, 
-                    bufaddr, 
-                    transferred_data_length, 
-                    MRbuf_list[wrq_id].buf_size, 
-                    priority); 
-        }
-
-        // repost this receiving buffer and makr it at BUF_REGISTERED 
-
-        vstat = repost_recv_buf(qp, wrq_id);
-        if(vstat != (VAPI_OK)) {
-          CERROR("error while polling completion queue\n");
-        }
-        else {
-          MRbuf_list[wrq_id].status = BUF_REGISTERED; 
-        }
-
-        break;
-
-      case   VAPI_CQE_RQ_RDMA_WITH_IMM:
-        // about the Receive Q
-        CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_RDMA_WITH_IMM\n");  
-
-        wrq_id = comp_desc.id ;
-        transferred_data_length = comp_desc.byte_len;
-             
-        if(wrq_id ==  RDMA_OP_ID) {
-          // this is RDAM op , locate the RDAM memory buffer address   
-               
-          bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) Local_rdma_info.raddr;
-
-          transferred_data_length = comp_desc.byte_len;
-
-          kibnal_rx(hca_data->kib_data, 
-                    bufaddr, 
-                    transferred_data_length, 
-                    Local_rdma_info.buf_length, 
-                    priority); 
-
-          // de-regiser this RDAM receiving memory buffer
-          // too early ??    test & check 
-          vstat = VAPI_deregister_mr(qp->hca_hndl, Local_rdma_info.recv_rdma_mr_hndl);
-          if(vstat != VAPI_OK) {
-            CERROR("VAPI_CQE_RQ_RDMA_WITH_IMM: Failed deregistering a RDMA"
-                   " recv  mem region %s\n", VAPI_strerror(vstat));
-          }
-        }
-
-        CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_RDMA_WITH_IMM\n");  
-        break;
-
-      case   VAPI_CQE_INVAL_OPCODE:
-        //
-        CDEBUG(D_NET, "CQE opcode-VAPI_CQE_INVAL_OPCODE\n");  
-        break;
-
-      default :
-        CDEBUG(D_NET, "CQE opcode-unknown opcode\n");  
-             break;
-    } // switch 
-    
-    schedule_timeout(RECEIVING_THREAD_TIMEOUT);//how often do we need to poll CQ 
-
-  }// receiving while loop
-
-
-}
-
-
-void CQE_event_handler(VAPI_hca_hndl_t hca_hndl, 
-                       VAPI_cq_hndl_t  cq_hndl, 
-                       void           *private)
-{
- VAPI_ret_t       vstat; 
- VAPI_wc_desc_t   comp_desc;   
- unsigned long    polling_count = 0;
- u_int32_t        timeout_usec;
- unsigned int     priority = 100;
- unsigned int     length;
- VAPI_wr_id_t     wrq_id;
- u_int32_t        transferred_data_length; /* Num. of bytes transferred */
- void             *bufdata;
- VAPI_virt_addr_t bufaddr;
- unsigned long    buf_size = 0;
- QP_info          *qp;       // point to QP_list
- HCA_info         *hca_data;
-
- // send Q and receive Q are using the same CQ 
- // so only poll one CQ for both operations 
- CDEBUG(D_NET, "IBNAL- enter CQE_event_handler\n");
- printk("IBNAL- enter CQE_event_handler\n");
-
- hca_data  = (HCA_info *) private; 
-
- //
- // use event driven  
- //
-
- vstat = VAPI_poll_cq(hca_data->hca_hndl,hca_data->cq_hndl, &comp_desc);   
-
- if (vstat == VAPI_CQ_EMPTY) { 
-   CDEBUG(D_NET, "CQE_event_handler: there is no event in CQE, how could"
-                  " this " "happened \n");
-   printk("CQE_event_handler: there is no event in CQE, how could"
-                  " this " "happened \n");
-
- } 
- else {
-   if (vstat != (VAPI_OK)) {
-     CDEBUG(D_NET, "error while polling completion queue vstat %d - %s\n", 
-                vstat, VAPI_strerror(vstat));
-     printk("error while polling completion queue vstat %d - %s\n", 
-                                               vstat, VAPI_strerror(vstat));
-     return; 
-   }
- }
-
- // process the complete event 
- switch(comp_desc.opcode) {
-    case   VAPI_CQE_SQ_SEND_DATA:
-      // about the Send Q ,POST SEND completion 
-      // who needs this information
-      // get wrq_id
-      // mark MSbuf_list[wr_id].status = BUF_REGISTERED 
-               
-      wrq_id = comp_desc.id;
-
-#ifdef IBNAL_SELF_TESTING
-      if(wrq_id == SEND_RECV_TEST_ID) {
-        printk("IBNAL_SELF_TESTING - VAPI_CQE_SQ_SEND_DATA \n"); 
-      }
-#else  
-      if(RDMA_OP_ID < wrq_id) {
-        // this RDMA message id, adjust it to the right entry       
-        wrq_id = wrq_id - RDMA_OP_ID;
-        vstat = VAPI_deregister_mr(qp->hca_hndl, 
-                                   Local_rdma_info.send_rdma_mr_hndl);
-      }
-
-      if(vstat != VAPI_OK) {
-        CERROR(" VAPI_CQE_SQ_SEND_DATA: Failed deregistering a RDMA"
-               " recv  mem region %s\n", VAPI_strerror(vstat));
-      }
-
-      if((RDMA_CTS_ID <= wrq_id) && (RDMA_OP_ID < wrq_id)) {
-        // RTS or CTS send complete, release send buffer 
-        if(wrq_id >= RDMA_RTS_ID)
-          wrq_id = wrq_id - RDMA_RTS_ID;
-        else 
-          wrq_id = wrq_id - RDMA_CTS_ID;
-      }
-
-      spin_lock(&MSB_mutex[(int) wrq_id]);
-      MRbuf_list[wrq_id].status = BUF_REGISTERED; 
-      spin_unlock(&MSB_mutex[(int) wrq_id]);
-#endif 
-
-      CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_SEND_DATA\n");  
-
-      break;
-
-    case   VAPI_CQE_SQ_RDMA_WRITE:
-      // about the Send Q,  RDMA write completion 
-      // who needs this information
-      // data is successfully write from pource to  destionation 
-             
-      //  get wr_id
-      //  mark MSbuf_list[wr_id].status = BUF_REGISTERED 
-      //  de-register  rdma buffer 
-      //
-             
-       CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_RDMA_WRITE\n");  
-       break;
-
-      case   VAPI_CQE_SQ_RDMA_READ:
-        // about the Send Q
-        // RDMA read completion 
-        // who needs this information
-        // data is successfully read from destionation to source 
-         CDEBUG(D_NET, "CQE opcode- VAPI_CQE_SQ_RDMA_READ\n");  
-         break;
-
-      case   VAPI_CQE_SQ_COMP_SWAP:
-        // about the Send Q
-        // RDMA write completion 
-        // who needs this information
-            
-        CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_COMP_SWAP\n");  
-        break;
-
-      case   VAPI_CQE_SQ_FETCH_ADD:
-        // about the Send Q
-        // RDMA write completion 
-        // who needs this information
-             
-        CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_FETCH_ADD\n");  
-        break;
-
-      case   VAPI_CQE_SQ_BIND_MRW:
-        // about the Send Q
-        // RDMA write completion 
-        // who needs this information
-             
-        CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_BIND_MRW\n");  
-        break;
-
-      case   VAPI_CQE_RQ_SEND_DATA:
-        // about the Receive Q
-        // process the incoming data and
-        // forward it to .....
-        // a completion recevie event is arriving at CQ 
-        // issue a recevie to get this arriving data out from CQ 
-        // pass the receiving data for further processing 
-         
-         CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_SEND_DATA\n");  
-          
-         wrq_id = comp_desc.id ;
-
-#ifdef IBNAL_SELF_TESTING
-
-      char        rbuf[KB_32];
-      int i;
-
-      if(wrq_id == SEND_RECV_TEST_ID) {
-        printk("IBNAL_SELF_TESTING - VAPI_CQE_RQ_SEND_DATA\n"); 
-      }
-
-      bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) 
-                       MRbuf_list[ SEND_RECV_TEST_BUF_ID].buf_addr; 
-      MRbuf_list[SEND_RECV_TEST_BUF_ID].status = BUF_INUSE; 
-      memcpy(&rbuf, &bufaddr, KB_32);    
-      
-
-      for(i=0; i < 16; i++)
-              printk("rbuf[%d]=%c, ", rbuf[i]);
-      printk("\n");
-
-      // repost this receiving buffer and makr it at BUF_REGISTERED 
-      vstat = repost_recv_buf(qp,SEND_RECV_TEST_BUF_ID);
-      if(vstat != (VAPI_OK)) {
-        printk("error while polling completion queue\n");
-      }
-      else {
-        MRbuf_list[SEND_RECV_TEST_BUF_ID].status = BUF_REGISTERED; 
-      }
-#else  
-         transferred_data_length = comp_desc.byte_len;
-             
-         if((wrq_id >= RDMA_CTS_ID) && (wrq_id < RDMA_OP_ID)) {
-           // this is RTS/CTS message 
-           // process it locally and don't pass it to portals layer 
-           // adjust wrq_id to get the right entry in MRbfu_list 
-                   
-           if(wrq_id >= RDMA_RTS_ID)
-             wrq_id = wrq_id - RDMA_RTS_ID;
-           else 
-             wrq_id = wrq_id - RDMA_CTS_ID;
-
-           bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) 
-                                           MRbuf_list[wrq_id].buf_addr; 
-           MRbuf_list[wrq_id].status = BUF_INUSE; 
-           memcpy(&Rdma_info, &bufaddr, sizeof(RDMA_Info_Exchange));    
-        
-           if(Ready_To_send == Rdma_info.opcode) 
-             // an RTS request message from remote node 
-             // prepare local RDMA buffer and send local rdma info to
-             // remote node 
-             CTS_handshaking_protocol(&Rdma_info);
-           else 
-             if((Clear_To_send == Rdma_info.opcode) && 
-                                (RDMA_BUFFER_RESERVED == Rdma_info.flag))
-               Cts_Message_arrived = YES;
-             else 
-               if(RDMA_BUFFER_UNAVAILABLE == Rdma_info.flag) 
-                 CERROR("RDMA operation abort-RDMA_BUFFER_UNAVAILABLE\n");
-         }
-         else {
-           //
-           // this is an incoming mesage for portals layer 
-           // move to PORTALS layer for further processing 
-           //
-                     
-           bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t)
-                                MRbuf_list[wrq_id].buf_addr; 
-
-           MRbuf_list[wrq_id].status = BUF_INUSE; 
-           transferred_data_length = comp_desc.byte_len;
-
-           kibnal_rx(hca_data->kib_data, 
-                     bufaddr, 
-                     transferred_data_length, 
-                     MRbuf_list[wrq_id].buf_size, 
-                     priority); 
-         }
-
-         // repost this receiving buffer and makr it at BUF_REGISTERED 
-         vstat = repost_recv_buf(qp, wrq_id);
-         if(vstat != (VAPI_OK)) {
-           CERROR("error while polling completion queue\n");
-         }
-         else {
-           MRbuf_list[wrq_id].status = BUF_REGISTERED; 
-         }
-#endif
-
-         break;
-
-      case   VAPI_CQE_RQ_RDMA_WITH_IMM:
-        // about the Receive Q
-        CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_RDMA_WITH_IMM\n");  
-
-        wrq_id = comp_desc.id ;
-        transferred_data_length = comp_desc.byte_len;
-             
-        if(wrq_id ==  RDMA_OP_ID) {
-          // this is RDAM op , locate the RDAM memory buffer address   
-              
-          bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) Local_rdma_info.raddr;
-
-          transferred_data_length = comp_desc.byte_len;
-
-          kibnal_rx(hca_data->kib_data, 
-                    bufaddr, 
-                    transferred_data_length, 
-                    Local_rdma_info.buf_length, 
-                    priority); 
-
-          // de-regiser this RDAM receiving memory buffer
-          // too early ??    test & check 
-          vstat = VAPI_deregister_mr(qp->hca_hndl, Local_rdma_info.recv_rdma_mr_hndl);
-          if(vstat != VAPI_OK) {
-            CERROR("VAPI_CQE_RQ_RDMA_WITH_IMM: Failed deregistering a RDMA"
-               " recv  mem region %s\n", VAPI_strerror(vstat));
-          }
-        }
-
-        CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_RDMA_WITH_IMM\n");  
-        break;
-
-      case   VAPI_CQE_INVAL_OPCODE:
-        //
-        CDEBUG(D_NET, "CQE opcode-VAPI_CQE_INVAL_OPCODE\n");  
-        break;
-
-      default :
-        CDEBUG(D_NET, "CQE opcode-unknown opcode\n");  
-
-        break;
-    } // switch 
-    
-  // issue a new request for completion ievent notification 
-  vstat = VAPI_req_comp_notif(hca_data->hca_hndl, 
-                              hca_data->cq_hndl,
-                              VAPI_NEXT_COMP); 
-
-
-  if(vstat != VAPI_OK) {
-    CERROR("PI_req_comp_notif: Failed %s\n", VAPI_strerror(vstat));
-  }
-
-  return; // end of event handler 
-
-}
-
-
-
-int
-kibnal_cmd(struct portal_ioctl_data * data, void * private)
-{
-  int rc ;
-
-  CDEBUG(D_NET, "kibnal_cmd \n");  
-
-  return YES;
-}
-
-
-
-void ibnal_send_recv_self_testing(int *my_role)
-{
- VAPI_ret_t           vstat;
- VAPI_sr_desc_t       sr_desc;
- VAPI_sg_lst_entry_t  sr_sg;
- QP_info              *qp;
- VAPI_wr_id_t         send_id;
- int                  buf_id;
- char                 sbuf[KB_32];
- char                 rbuf[KB_32];
- int                  i;
- int                  buf_length = KB_32;
- VAPI_wc_desc_t       comp_desc;
- int                  num_send = 1;
- int                  loop_count = 0;
-
- // make it as a daemon process 
- // kportal_daemonize("ibnal_send_recv_self_testing");  
-
- printk("My role is 0X%X\n", *my_role);
-
-if(*my_role ==  TEST_SEND_MESSAGE)  {
- printk("Enter ibnal_send_recv_self_testing\n");
-
- memset(&sbuf, 'a', KB_32);
- memset(&rbuf, ' ', KB_32);
- send_id = SEND_RECV_TEST_ID; 
- buf_id = SEND_RECV_TEST_BUF_ID;
-
- qp = &QP_list[buf_id];
-
- sr_desc.opcode    = VAPI_SEND;
- sr_desc.comp_type = VAPI_SIGNALED;
- sr_desc.id        =  send_id;
-
- // scatter and gather info
- sr_sg.len  = KB_32;
- sr_sg.lkey = MSbuf_list[buf_id].mr.l_key; // use send MR
- sr_sg.addr = (VAPI_virt_addr_t)(MT_virt_addr_t) MSbuf_list[buf_id].buf_addr;
-
- // copy data to register send buffer
- memcpy(&sr_sg.addr, &sbuf, buf_length);
-
- sr_desc.sg_lst_p = &sr_sg;
- sr_desc.sg_lst_len = 1; // only 1 entry is used
- sr_desc.fence = TRUE;
- sr_desc.set_se = FALSE;
-
- /*
- // call VAPI_post_sr to send out this data
- vstat = VAPI_post_sr(qp->hca_hndl, qp->qp_hndl, &sr_desc);
-
- if (vstat != VAPI_OK) {
-   printk("VAPI_post_sr failed (%s).\n",VAPI_strerror(vstat));
- }
-
- printk("VAPI_post_sr success.\n");
- */
-
- }
-else {
-  printk("I am a receiver and doing nothing here\n"); 
-}
-         
- printk("ibnal_send_recv_self_testing thread exit \n");
-
- return;
-
-}
-
-
-//
-// ibnal initialize process  
-//
-// 1.  Bring up Infiniband network interface 
-//     * 
-// 2.  Initialize a PORTALS nal interface 
-// 
-//
-int __init 
-kibnal_initialize(void)
-{
-   int           rc;
-   int           ntok;
-   unsigned long sizemask;
-   unsigned int  nid;
-   VAPI_ret_t    vstat;
-
-
-   portals_debug_set_level(IBNAL_DEBUG_LEVEL_1);
-
-   CDEBUG(D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory));
-
-   CDEBUG(D_PORTALS, "kibnal_initialize: Enter kibnal_initialize\n");
-
-   // set api functional pointers 
-   kibnal_api.forward    = kibnal_forward;
-   kibnal_api.shutdown   = kibnal_shutdown;
-   kibnal_api.yield      = kibnal_yield;
-   kibnal_api.validate   = NULL; /* our api validate is a NOOP */
-   kibnal_api.lock       = kibnal_lock;
-   kibnal_api.unlock     = kibnal_unlock;
-   kibnal_api.nal_data   = &kibnal_data; // this is so called private data 
-   kibnal_api.refct      = 1;
-   kibnal_api.timeout    = NULL;
-   kibnal_lib.nal_data   = &kibnal_data;
-  
-   memset(&kibnal_data, 0, sizeof(kibnal_data));
-
-   // initialize kib_list list data structure 
-   INIT_LIST_HEAD(&kibnal_data.kib_list);
-
-   kibnal_data.kib_cb = &kibnal_lib;
-
-   spin_lock_init(&kibnal_data.kib_dispatch_lock);
-
-
-   //  
-   // bring up the IB inter-connect network interface 
-   // setup QP, CQ 
-   //
-   vstat = IB_Open_HCA(&kibnal_data);
-
-   if(vstat != VAPI_OK) {
-     CERROR("kibnal_initialize: IB_Open_HCA failed: %d- %s\n", 
-                                                vstat, VAPI_strerror(vstat));
-
-     printk("kibnal_initialize: IB_Open_HCA failed: %d- %s\n", 
-                                                vstat, VAPI_strerror(vstat));
-     return NO;
-   }
-
-   kibnal_data.kib_nid = (__u64 )Hca_hndl;//convert Hca_hndl to 64-bit format
-   kibnal_data.kib_init = 1;
-
-   CDEBUG(D_NET, " kibnal_data.kib_nid 0x%x%x\n", kibnal_data.kib_nid);
-   printk(" kibnal_data.kib_nid 0x%x%x\n", kibnal_data.kib_nid);
-
-   /* Network interface ready to initialise */
-   // get an entery in the PORTALS table for this IB protocol 
-
-   CDEBUG(D_PORTALS,"Call PtlNIInit to register this Infiniband Interface\n");
-   printk("Call PtlNIInit to register this Infiniband Interface\n");
-
-   rc = PtlNIInit(kibnal_init, 32, 4, 0, &kibnal_ni);
-
-   if(rc != PTL_OK) {
-     CERROR("kibnal_initialize: PtlNIInit failed %d\n", rc);
-     printk("kibnal_initialize: PtlNIInit failed %d\n", rc);
-     kibnal_finalize();
-     return (-ENOMEM);
-   }
-
-   CDEBUG(D_PORTALS,"kibnal_initialize: PtlNIInit DONE\n");
-   printk("kibnal_initialize: PtlNIInit DONE\n");
-
-
-
-#ifdef  POLL_BASED_CQE_HANDLING 
-   // create a receiving thread: main loopa
-   // this is polling based mail loop   
-   kernel_thread(k_recv_thread, &Hca_data, 0);
-#endif
-
-#ifdef EVENT_BASED_CQE_HANDLING
-  // for completion event handling,  this is event based CQE handling 
-  vstat = IB_Set_Event_Handler(Hca_data, &kibnal_data);
-
-  if (vstat != VAPI_OK) {
-     CERROR("IB_Set_Event_Handler failed: %d - %s \n", 
-                                           vstat, VAPI_strerror(vstat));
-     return vstat;
-  }
-
-  CDEBUG(D_PORTALS,"IB_Set_Event_Handler Done \n");
-  printk("IB_Set_Event_Handler Done \n");
-  
-#endif
-
-   PORTAL_SYMBOL_REGISTER(kibnal_ni);
-
-#ifdef IBNAL_SELF_TESTING
-  //
-  // test HCA send recv before normal event handling 
-  //
-  int  my_role;
-  my_role = TEST_SEND_MESSAGE;
-
-  printk("my role is TEST_RECV_MESSAGE\n");
-
-  // kernel_thread(ibnal_send_recv_self_testing, &my_role, 0);
-   
-  ibnal_send_recv_self_testing(&my_role);
-
-#endif 
-
-  return 0;
-
-}
-
-
-
-MODULE_AUTHOR("Hsingbung(HB) Chen <hbchen@lanl.gov>");
-MODULE_DESCRIPTION("Kernel Infiniband NAL v0.1");
-MODULE_LICENSE("GPL");
-
-module_init (kibnal_initialize);
-module_exit (kibnal_finalize);
-
-EXPORT_SYMBOL(kibnal_ni);
-
diff --git a/lustre/portals/knals/ibnal/ibnal.h b/lustre/portals/knals/ibnal/ibnal.h
deleted file mode 100644 (file)
index 4a1f0d7..0000000
+++ /dev/null
@@ -1,565 +0,0 @@
-#ifndef _IBNAL_H
-#define _IBNAL_H
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <asm/segment.h>
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-
-#include <linux/ipc.h>
-#include <linux/shm.h>
-
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/locks.h>
-#include <linux/unistd.h>
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/list.h>
-#include <linux/in.h>
-#include <unistd.h>
-
-#define DEBUG_SUBSYSTEM S_IBNAL
-
-#include <portals/p30.h>
-#include <portals/lib-p30.h>
-#include <linux/kp30.h>
-#include <linux/kpr.h>
-
-// Infiniband VAPI/EVAPI header files  
-// Mellanox MT23108 VAPI
-#include <vapi.h>
-#include <vapi_types.h>
-#include <vapi_common.h>
-#include <evapi.h>
-
-// pick a port for this RDMA information exhange between two hosts
-#define HOST_PORT           11211 
-#define QUEUE_SIZE          1024
-#define HCA_PORT_1          1
-#define HCA_PORT_2          2 
-#define DEBUG_SUBSYSTEM S_IBNAL
-
-#define START_SEND_WRQ_ID        0
-#define START_RECV_WRQ_ID        0
-#define START_RDMA_WRQ_ID        0  
-
-#define DEFAULT_PRIORITY         100
-
-#define WAIT_FOT_R_RDMA_TIMEOUT 10000
-#define MAX_NUM_TRY      3000 
-
-#define MAX_NUM_POLL     300 
-#define MAX_LOOP_COUNT   500
-
-#define MAX_GID          32 
-#define MCG_BUF_LENGTH   128
-
-#define SHARED_SEGMENT_SIZE   0x10000   
-#define HCA_EXCHANGE_SHM_KEY  999 // shared memory key for HCA data exchange 
-
-// some internals opcodes for IB operations used in IBNAL
-#define SEND_QP_INFO          0X00000001 
-#define RECV_QP_INFO          0X00000010 
-
-// Mellanox InfiniHost MT23108 
-// QP/CQ related information
-//
-
-#define MTU_256     1 /* 1-256,2-512,3-1024,4-2048 */
-#define MTU_512     2 /* 1-256,2-512,3-1024,4-2048 */
-#define MTU_1024    3 /* 1-256,2-512,3-1024,4-2048 */
-#define MTU_2048    4 /* 1-256,2-512,3-1024,4-2048 */
-
-// number of entries for each CQ and WQ 
-// how much do we need ?
-#define NUM_CQE        1024
-#define NUM_WQE        1024 
-#define MAX_OUT_SQ     64 
-#define MAX_OUT_RQ     64
-
-#define NUM_MBUF       256 
-#define NUM_RDMA_RESERVED_ENTRY 128 
-#define NUM_QPS        256 
-
-#define INVALID_WR_ID  ((VAPI_wr_id_t) -1)
-
-
-// for Vector IO 
-// scatter and gather 
-// Portals can support upto 64 IO-Vectors 
-// how much do we need ? 
-#define NUM_SGE        1 
-#define NUM_SG         1 
-#define NUM_CQ        1        
-
-#define ONE_KB    1024
-#define ONE_MB    1024 * ONE_KB 
-#define ONE_GB    1024 * ONE_MB 
-
-
-#define KB_4      1024 * 4 
-#define KB_8      1024 * 8 
-#define KB_16     1024 * 16
-#define KB_32     1024 * 32
-#define KB_64     1024 * 64
-#define KB_128    1024 * 128 
-#define KB_256    1024 * 256 
-
-// 256 entry in registered buffer list 
-// small size message 
-#define Num_4_KB       64 
-#define Num_8_KB       64 
-#define Num_16_KB      40 
-#define Num_32_KB      40 
-#define Num_64_KB      40 
-#define Num_128_KB     4 
-#define Num_256_KB     4 
-
-#define SMALL_MSG_SIZE KB_32     
-
-#define MAX_MSG_SIZE   ONE_MB * 512   
-
-//   128's  64KB bufer for send
-//   128's  64KB bufer for recv  
-//   used in RDAM operation only 
-
-#define NUM_ENTRY      128 
-
-#define End_4_kb        Num_4_KB 
-#define End_8_kb        End_4_kb  + Num_8_KB 
-#define End_16_kb       End_8_kb  + Num_16_KB
-#define End_32_kb       End_16_kb + Num_32_KB
-#define End_64_kb       End_32_kb + Num_64_KB
-#define End_128_kb      End_64_kb + Num_128_KB
-#define End_256_kb      End_128_kb+ Num_256_KB
-
-
-#define SEND_BUF_SIZE   KB_32
-#define RECV_BUF_SIZE   SEND_BUF_SIZE
-
-// #define POLL_BASED_CQE_HANDLING     1
-#define EVENT_BASED_CQE_HANDLING        1
-#define IBNAL_SELF_TESTING             1
-
-#ifdef  IBNAL_SELF_TESTING
-#undef  IBNAL_SELF_TESTING
-#endif
-
-
-#define MSG_SIZE_SMALL 1 
-#define MSG_SIZE_LARGE 2 
-
-
-
-// some defauly configuration values for early testing 
-#define DEFAULT_DLID   1  // default destination link ID
-#define DEFAULT_QP_NUM 4  // default QP number 
-#define P_KEY          0xFFFF // do we need default value
-#define PKEY_IX        0x0 // do we need default value
-#define Q_KEY          0x012  // do we need default value 
-#define L_KEY          0x12345678 // do we need default value 
-#define R_KEY          0x87654321 // do we need default value 
-#define HCA_ID         "InfiniHost0" // default 
-#define START_PSN      0
-#define START_SQ_PSN   0
-#define START_RQ_PSN   0
-
-
-#define __u_long_long   unsigned long long
-
-#define         IBNAL_DEBUG      1
-
-#define         USE_SHARED_MEMORY_AND_SOCKET 1
-
-// operation type
-#define TRY_SEND_ONLY    1
-
-#define YES     1  
-#define NO      0 
-
-//
-// a common data structure for IB QP's operation
-// each QP is associated with an QP_info structure 
-//
-typedef struct QP_info 
-{
-  VAPI_hca_hndl_t       hca_hndl;      // HCA handle
-  IB_port_t             port;          // port number 
-  VAPI_qp_hndl_t        qp_hndl;       // QP's handle list 
-  VAPI_qp_state_t       qp_state;      // QP's current state 
-  VAPI_pd_hndl_t        pd_hndl;       // protection domain
-  VAPI_cq_hndl_t        cq_hndl;    // send-queue CQ's handle 
-  VAPI_cq_hndl_t        sq_cq_hndl;    // send-queue CQ's handle 
-  VAPI_cq_hndl_t        rq_cq_hndl;    // receive-queue CQ's handle
-  VAPI_ud_av_hndl_t     av_hndl;    // receive-queue CQ's handle
-  VAPI_qp_init_attr_t   qp_init_attr;  // QP's init attribute 
-  VAPI_qp_attr_t        qp_attr;       // QP's attribute - dlid 
-  VAPI_qp_prop_t        qp_prop;       // QP's propertities
-  VAPI_hca_port_t       hca_port;  
-  VAPI_qp_num_t         qp_num;    // QP's number 
-  VAPI_qp_num_t         rqp_num;       // remote QP's number 
-  IB_lid_t              slid;
-  IB_lid_t              dlid;
-  VAPI_gid_t            src_gid;
-
-  u_int32_t            buf_size;
-  VAPI_virt_addr_t      buf_addr;
-  char                *bufptr;
-  VAPI_mrw_t            mr;       
-  VAPI_mr_hndl_t        mr_hndl;
-  VAPI_virt_addr_t      raddr;
-  VAPI_rkey_t           rkey;
-  VAPI_lkey_t           lkey;
-
-  VAPI_wr_id_t          last_posted_send_id; // user defined work request ID 
-  VAPI_wr_id_t          last_posted_rcv_id;  // user defined work request ID
-  VAPI_mw_hndl_t        mw_hndl;       // memory window handle 
-  VAPI_rkey_t           mw_rkey;       // memory window rkey
-  VAPI_sg_lst_entry_t   sg_lst[256];       // scatter and gather list 
-  int                   sg_list_sz;    // set as NUM_SGE
-  VAPI_wr_id_t          wr_id;         //
-  spinlock_t            snd_mutex;
-  spinlock_t            rcv_mutex;
-  spinlock_t            bl_mutex;
-  spinlock_t            cln_mutex;
-  int                   cur_RDMA_outstanding;
-  int                   cur_send_outstanding;
-  int                   cur_posted_rcv_bufs;
-  int                   snd_rcv_balance;
-} QP_info; 
-
-
-// buffer status 
-#define  BUF_REGISTERED   0x10000000 
-#define  BUF_INUSE       0x01000000  
-#define  BUF_UNREGISTERED 0x00100000 
-
-// buffer type 
-#define  REG_BUF          0x10000000
-#define  RDMA_BUF         0x01000000 
-
-//
-// IMM data 
-// 
-#define   IMM_000         (0 << 32); 
-#define   IMM_001         (1 << 32); 
-#define   IMM_002         (2 << 32); 
-#define   IMM_003         (3 << 32); 
-#define   IMM_004         (4 << 32); 
-#define   IMM_005         (5 << 32); 
-#define   IMM_006         (6 << 32); 
-#define   IMM_007         (7 << 32); 
-#define   IMM_008         (8 << 32); 
-#define   IMM_009         (9 << 32); 
-#define   IMM_010         (10 << 32); 
-#define   IMM_011         (11 << 32); 
-#define   IMM_012         (12 << 32); 
-#define   IMM_013         (13 << 32); 
-#define   IMM_014         (14 << 32); 
-#define   IMM_015         (15 << 32); 
-#define   IMM_016         (16 << 32); 
-#define   IMM_017         (17 << 32); 
-#define   IMM_018         (18 << 32); 
-#define   IMM_019         (19 << 32); 
-#define   IMM_020         (20 << 32); 
-#define   IMM_021         (21 << 32); 
-#define   IMM_022         (22 << 32); 
-#define   IMM_023         (23 << 32); 
-#define   IMM_024         (24 << 32); 
-#define   IMM_025         (25 << 32); 
-#define   IMM_026         (26 << 32); 
-#define   IMM_027         (27 << 32); 
-#define   IMM_028         (28 << 32); 
-#define   IMM_029         (29 << 32); 
-#define   IMM_030         (30 << 32); 
-#define   IMM_031         (31 << 32); 
-
-
-typedef struct Memory_buffer_info{
-       u_int32_t        buf_size;
-       VAPI_virt_addr_t buf_addr;
-       char             *bufptr;
-       VAPI_mrw_t       mr;       
-       VAPI_mr_hndl_t   mr_hndl;
-        int              status;
-       int              ref_count;  
-        int              buf_type;
-       VAPI_virt_addr_t raddr;
-       VAPI_rkey_t      rkey;
-       VAPI_lkey_t      lkey;
-} Memory_buffer_info;
-
-typedef struct RDMA_Info_Exchange {
-       int               opcode;
-       int               buf_length;
-       VAPI_mrw_t        recv_rdma_mr;
-       VAPI_mr_hndl_t    recv_rdma_mr_hndl;
-       VAPI_mrw_t        send_rdma_mr;
-       VAPI_mr_hndl_t    send_rdma_mr_hndl;
-       VAPI_virt_addr_t  raddr;
-       VAPI_rkey_t       rkey;
-       int               flag;
-}  RDMA_Info_Exchange;
-
-// opcode for Rdma info exchange RTS/CTS 
-#define  Ready_To_send     0x10000000
-#define  Clear_To_send     0x01000000
-
-#define  RDMA_RTS_ID      5555 
-#define  RDMA_CTS_ID      7777 
-#define  RDMA_OP_ID       9999 
-#define  SEND_RECV_TEST_ID 2222 
-#define  SEND_RECV_TEST_BUF_ID 0 
-
-#define  TEST_SEND_MESSAGE 0x00000001 
-#define  TEST_RECV_MESSAGE 0x00000002
-
-
-#define  RTS_CTS_TIMEOUT           50
-#define  RECEIVING_THREAD_TIMEOUT  50 
-#define  WAIT_FOR_SEND_BUF_TIMEOUT 50
-
-#define  IBNAL_DEBUG_LEVEL_1   0XFFFFFFFF  
-#define  IBNAL_DEBUG_LEVEL_2   D_PORTALS | D_NET   | D_WARNING | D_MALLOC | \ 
-                              D_ERROR   | D_OTHER | D_TRACE   | D_INFO
-                              
-
-// flag for Rdma info exhange 
-#define  RDMA_BUFFER_RESERVED       0x10000000
-#define  RDMA_BUFFER_UNAVAILABLE    0x01000000
-
-
-// receiving data structure 
-typedef struct {
-        ptl_hdr_t         *krx_buffer; // pointer to receiving buffer
-        unsigned long     krx_len;  // length of buffer
-        unsigned int      krx_size; // 
-        unsigned int      krx_priority; // do we need this 
-        struct list_head  krx_item;
-}  kibnal_rx_t;
-
-// transmitting data structure 
-typedef struct {
-        nal_cb_t      *ktx_nal;
-        void          *ktx_private;
-        lib_msg_t     *ktx_cookie;
-        char          *ktx_buffer;
-        size_t         ktx_len;
-        unsigned long  ktx_size;
-        int            ktx_ndx;
-        unsigned int   ktx_priority;
-        unsigned int   ktx_tgt_node;
-        unsigned int   ktx_tgt_port_id;
-}  kibnal_tx_t;
-
-
-typedef struct {
-        char              kib_init;
-        char              kib_shuttingdown;
-        IB_port_t         port_num; // IB port information
-        struct list_head  kib_list;
-        ptl_nid_t         kib_nid;
-        nal_t            *kib_nal; 
-        nal_cb_t         *kib_cb;
-        struct kib_trans *kib_trans; // do I need this 
-        struct tq_struct  kib_ready_tq;
-        spinlock_t        kib_dispatch_lock;
-}  kibnal_data_t;
-
-
-//
-// A data structure for keeping the HCA information in system
-// information related to HCA and hca_handle will be kept here 
-//
-typedef struct HCA_Info 
-{
-  VAPI_hca_hndl_t       hca_hndl;     // HCA handle
-  VAPI_pd_hndl_t        pd_hndl;      // protection domain
-  IB_port_t             port;         // port number 
-  int                   num_qp;       // number of qp used  
-  QP_info               *qp_ptr[NUM_QPS]; // point to QP_list
-  int                   num_cq;       // number of cq used 
-  VAPI_cq_hndl_t        cq_hndl;   
-  VAPI_cq_hndl_t        sq_cq_hndl;   
-  VAPI_cq_hndl_t        rq_cq_hndl;   
-  IB_lid_t              dlid;
-  IB_lid_t              slid;
-  kibnal_data_t         *kib_data; // for PORTALS operations
-} HCA_info;
-
-
-
-
-// Remote HCA Info information 
-typedef struct Remote_HCA_Info {
-        unsigned long     opcode;
-        unsigned long     length; 
-        IB_lid_t          dlid[NUM_QPS];
-        VAPI_qp_num_t     rqp_num[NUM_QPS];
-} Remote_QP_Info;
-
-typedef struct  Bucket_index{
-     int start;
-     int end;
-} Bucket_index;
-
-// functional prototypes 
-// infiniband initialization 
-int kib_init(kibnal_data_t *);
-
-// receiving thread 
-void kibnal_recv_thread(HCA_info *);
-void recv_thread(HCA_info *);
-
-// forward data packet 
-void kibnal_fwd_packet (void *, kpr_fwd_desc_t *);
-
-// global data structures 
-extern kibnal_data_t        kibnal_data;
-extern ptl_handle_ni_t      kibnal_ni;
-extern nal_t                kibnal_api;
-extern nal_cb_t             kibnal_lib;
-extern QP_info              QP_list[];
-extern QP_info              CQ_list[];
-extern HCA_info             Hca_data;
-extern VAPI_hca_hndl_t      Hca_hndl; 
-extern VAPI_pd_hndl_t       Pd_hndl;
-extern VAPI_hca_vendor_t    Hca_vendor;
-extern VAPI_hca_cap_t       Hca_cap;
-extern VAPI_hca_port_t      Hca_port_1_props;
-extern VAPI_hca_port_t      Hca_port_2_props;
-extern VAPI_hca_attr_t      Hca_attr;
-extern VAPI_hca_attr_mask_t Hca_attr_mask;
-extern VAPI_cq_hndl_t       Cq_SQ_hndl;   
-extern VAPI_cq_hndl_t       Cq_RQ_hndl;   
-extern VAPI_cq_hndl_t       Cq_hndl;   
-extern unsigned long        User_Defined_Small_Msg_Size;
-extern Remote_QP_Info      L_HCA_RDMA_Info;  
-extern Remote_QP_Info      R_HCA_RDMA_Info; 
-extern unsigned int         Num_posted_recv_buf;
-extern int                  R_RDMA_DATA_ARRIVED;
-extern Memory_buffer_info   MRbuf_list[];
-extern Memory_buffer_info   MSbuf_list[];
-extern Bucket_index         Bucket[]; 
-extern RDMA_Info_Exchange   Rdma_info;
-extern int                  Cts_Message_arrived;
-extern RDMA_Info_Exchange   Local_rdma_info;
-extern spinlock_t          MSB_mutex[];
-
-
-
-// kernel NAL API function prototype 
-int  kibnal_forward(nal_t *,int ,void *,size_t ,void *,size_t );
-void kibnal_lock(nal_t *, unsigned long *);
-void kibnal_unlock(nal_t *, unsigned long *);
-int  kibnal_shutdown(nal_t *, int );
-void kibnal_yield( nal_t * );
-void kibnal_invalidate(nal_cb_t *,void *,size_t ,void *);
-int  kibnal_validate(nal_cb_t *,void *,size_t ,void  **);
-
-
-
-nal_t *kibnal_init(int , ptl_pt_index_t , ptl_ac_index_t , ptl_pid_t );
-void __exit kibnal_finalize(void ); 
-VAPI_ret_t create_qp(QP_info *, int );
-VAPI_ret_t init_qp(QP_info *, int );
-VAPI_ret_t IB_Open_HCA(kibnal_data_t *);
-VAPI_ret_t IB_Close_HCA(void );
-VAPI_ret_t createMemRegion(VAPI_hca_hndl_t, VAPI_pd_hndl_t); 
-VAPI_ret_t  deleteMemRegion(QP_info *, int );
-
-void ibnal_send_recv_self_testing(int *);
-
-int  __init kibnal_initialize(void);
-
-
-
-/* CB NAL functions */
-int kibnal_send(nal_cb_t *, 
-                void *, 
-                lib_msg_t *, 
-                ptl_hdr_t *,
-                int, 
-                ptl_nid_t, 
-                ptl_pid_t, 
-                unsigned int, 
-                ptl_kiov_t *, 
-                size_t);
-
-int kibnal_send_pages(nal_cb_t *, 
-                      void *, 
-                      lib_msg_t *, 
-                      ptl_hdr_t *,
-                      int, 
-                      ptl_nid_t, 
-                      ptl_pid_t, 
-                      unsigned int, 
-                      ptl_kiov_t *, 
-                      size_t);
-int kibnal_recv(nal_cb_t *, void *, lib_msg_t *,
-                        unsigned int, struct iovec *, size_t, size_t);
-int kibnal_recv_pages(nal_cb_t *, void *, lib_msg_t *,
-                        unsigned int, ptl_kiov_t *, size_t, size_t);
-int  kibnal_read(nal_cb_t *,void *,void *,user_ptr ,size_t );
-int  kibnal_write(nal_cb_t *,void *,user_ptr ,void *,size_t );
-int  kibnal_callback(nal_cb_t * , void *, lib_eq_t *, ptl_event_t *);
-void *kibnal_malloc(nal_cb_t *,size_t );
-void kibnal_free(nal_cb_t *,void *,size_t );
-int  kibnal_map(nal_cb_t *, unsigned int , struct iovec *, void **);
-void kibnal_unmap(nal_cb_t *, unsigned int , struct iovec *, void **);
-int  kibnal_map_pages(nal_cb_t *, unsigned int , ptl_kiov_t *, void **);
-void kibnal_unmap_pages(nal_cb_t * , unsigned int , ptl_kiov_t *, void **);
-void kibnal_printf(nal_cb_t *, const char *, ...);
-void kibnal_cli(nal_cb_t *,unsigned long *); 
-void kibnal_sti(nal_cb_t *,unsigned long *);
-int  kibnal_dist(nal_cb_t *,ptl_nid_t ,unsigned long *);
-
-void kibnal_fwd_packet (void *, kpr_fwd_desc_t *);
-void kibnal_rx(kibnal_data_t *, 
-               VAPI_virt_addr_t ,
-               u_int32_t,
-               u_int32_t,
-               unsigned int);
-                
-int  kibnal_end(kibnal_data_t *);
-
-void async_event_handler(VAPI_hca_hndl_t , VAPI_event_record_t *,void *);
-
-void CQE_event_handler(VAPI_hca_hndl_t ,VAPI_cq_hndl_t , void  *);
-
-
-VAPI_ret_t Send_Small_Msg(char *, int );
-VAPI_ret_t Send_Large_Msg(char *, int );
-
-VAPI_ret_t repost_recv_buf(QP_info *, VAPI_wr_id_t );
-int post_recv_bufs(VAPI_wr_id_t );
-int  server_listen_thread(void *);
-VAPI_wr_id_t RTS_handshaking_protocol(int );
-VAPI_wr_id_t CTS_handshaking_protocol(RDMA_Info_Exchange *);
-
-VAPI_ret_t createMemRegion_RDMA(VAPI_hca_hndl_t ,
-                               VAPI_pd_hndl_t  ,
-                               char         *,
-                               int             , 
-                               VAPI_mr_hndl_t  *,
-                               VAPI_mrw_t      *);
-
-
-VAPI_ret_t IB_Set_Event_Handler(HCA_info , kibnal_data_t *);
-
-VAPI_ret_t IB_Set_Async_Event_Handler(HCA_info ,kibnal_data_t *);
-
-VAPI_wr_id_t find_available_buf(int );
-VAPI_wr_id_t search_send_buf(int );
-VAPI_wr_id_t find_filler_list(int ,int );
-int insert_MRbuf_list(int );
-
-
-#endif  /* _IBNAL_H */
diff --git a/lustre/portals/knals/ibnal/ibnal_cb.c b/lustre/portals/knals/ibnal/ibnal_cb.c
deleted file mode 100644 (file)
index 0688062..0000000
+++ /dev/null
@@ -1,1289 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * Based on ksocknal and qswnal
- *
- *  Author: Hsing-bung Chen <hbchen@lanl.gov>
- *
- *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- *
- *   Portals is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Portals is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Portals; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-
-#include "ibnal.h"
-
-
-
-
-RDMA_Info_Exchange   Rdma_nfo;
-int  Cts_Msg_Arrived = NO;
-
-
-/*
- *  LIB functions follow
- */
-
-//
-// read
-// copy a block of data from scr_addr to dst_addr 
-// it all happens in kernel space - dst_addr and src_addr 
-//
-// original definition is to read a block od data from a 
-// specified user address  
-// 
-// cb_read
-
-int kibnal_read (nal_cb_t *nal, 
-                 void     *private, 
-                 void     *dst_addr, 
-                 user_ptr src_addr, 
-                 size_t   len)
-{
-        CDEBUG(D_NET, "kibnal_read: 0x%Lx: reading %ld bytes from %p -> %p\n",
-               nal->ni.nid, (long)len, src_addr, dst_addr );
-
-        memcpy( dst_addr, src_addr, len );
-
-        return 0;
-}
-
-//
-// it seems that read and write are doing the same thing
-// because they all happen in kernel space 
-// why do we need two functions like read and write 
-// to make PORTALS API compatable 
-//
-
-//
-// write 
-// copy a block of data from scr_addr to dst_addr 
-// it all happens in kernel space - dst_addr and src_addr 
-//
-// original definition is to write a block od data to a 
-// specified user address  
-// 
-// cb_write
-
-int kibnal_write(nal_cb_t   *nal, 
-                 void       *private, 
-                 user_ptr   dst_addr, 
-                 void       *src_addr, 
-                 size_t     len)
-{
-        CDEBUG(D_NET, "kibnal_write: 0x%Lx: writing %ld bytes from %p -> %p\n",
-               nal->ni.nid, (long)len, src_addr, dst_addr );
-
-
-        memcpy( dst_addr, src_addr, len );
-
-        return 0;
-}
-
-//
-// malloc
-//
-// either vmalloc or kmalloc is used 
-// dynamically allocate a block of memory based on the size of buffer  
-//
-// cb_malloc
-
-void * kibnal_malloc(nal_cb_t *nal, size_t length)
-{
-        void *buffer;
-
-        // PORTAL_ALLOC will do the job 
-        // allocate a buffer with size "length"
-        PORTAL_ALLOC(buffer, length);
-
-        return buffer;
-}
-
-//
-// free
-// release a dynamically allocated memory pointed by buffer pointer 
-//
-// cb_free
-
-void kibnal_free(nal_cb_t *nal, void *buffer, size_t length)
-{
-        //
-        // release allocated buffer to system 
-        //
-        PORTAL_FREE(buffer, length);
-}
-
-//
-// invalidate 
-// because evernthing is in kernel space (LUSTRE)
-// there is no need to mark a piece of user memory as no longer in use by
-// the system
-//
-// cb_invalidate
-
-void kibnal_invalidate(nal_cb_t      *nal, 
-                              void          *base, 
-                              size_t        extent, 
-                              void          *addrkey)
-{
-  // do nothing 
-  CDEBUG(D_NET, "kibnal_invalidate: 0x%Lx: invalidating %p : %d\n", 
-                                        nal->ni.nid, base, extent);
-  return;
-}
-
-
-//
-// validate 
-// because everything is in kernel space (LUSTRE)
-// there is no need to mark a piece of user memory in use by
-// the system
-//
-// cb_validate
-
-int kibnal_validate(nal_cb_t        *nal,  
-                           void            *base, 
-                           size_t          extent, 
-                           void            **addrkey)
-{
-  // do nothing 
-  CDEBUG(D_NET, "kibnal_validate: 0x%Lx: validating %p : %d\n", 
-                                        nal->ni.nid, base, extent);
-
-  return 0;
-}
-
-
-//
-// log messages from kernel space 
-// printk() is used 
-//
-// cb_printf
-
-void kibnal_printf(nal_cb_t *nal, const char *fmt, ...)
-{
-        va_list ap;
-        char    msg[256];
-
-        if (portal_debug & D_NET) {
-                va_start( ap, fmt );
-                vsnprintf( msg, sizeof(msg), fmt, ap );
-                va_end( ap );
-
-                printk("CPUId: %d %s",smp_processor_id(), msg);
-        }
-}
-
-//
-// clear interrupt
-// use spin_lock to lock protected area such as MD, ME...
-// so a process can enter a protected area and do some works
-// this won't physicall disable interrup but use a software 
-// spin-lock to control some protected areas 
-//
-// cb_cli 
-
-void kibnal_cli(nal_cb_t *nal, unsigned long *flags) 
-{ 
-        kibnal_data_t *data= nal->nal_data;
-
-        CDEBUG(D_NET, "kibnal_cli \n");
-
-        spin_lock_irqsave(&data->kib_dispatch_lock,*flags);
-
-}
-
-//
-// set interrupt
-// use spin_lock to unlock protected area such as MD, ME...
-// this won't physicall enable interrup but use a software 
-// spin-lock to control some protected areas 
-//
-// cb_sti
-
-void kibnal_sti(nal_cb_t *nal, unsigned long *flags)
-{
-        kibnal_data_t *data= nal->nal_data;
-
-        CDEBUG(D_NET, "kibnal_sti \n");
-
-        spin_unlock_irqrestore(&data->kib_dispatch_lock,*flags);
-}
-
-
-
-//
-// nic distance 
-// 
-// network distance doesn't mean much for this nal 
-// here we only indicate 
-//      0 - operation is happened on the same node 
-//      1 - operation is happened on different nodes 
-//          router will handle the data routing 
-//
-// cb_dist
-
-int kibnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
-{
-        CDEBUG(D_NET, "kibnal_dist \n");
-
-        if ( nal->ni.nid == nid ) {
-                *dist = 0;
-        } 
-        else {
-                *dist = 1;
-        }
-
-        return 0; // always retrun 0 
-}
-
-
-//
-// This is the cb_send() on IB based interconnect system
-// prepare a data package and use VAPI_post_sr() to send it
-// down-link out-going message 
-//
-
-
-int
-kibnal_send(nal_cb_t        *nal,
-            void            *private,
-            lib_msg_t       *cookie,
-            ptl_hdr_t       *hdr,
-            int              type,
-            ptl_nid_t        nid,
-            ptl_pid_t        pid,
-            unsigned int     niov,
-            ptl_kiov_t      *iov,
-            size_t           len)
-{
-        
-        int           rc=0;
-        void         *buf = NULL; 
-        unsigned long buf_length = sizeof(ptl_hdr_t) + len;
-        int           expected_buf_size = 0;
-        VAPI_ret_t    vstat;
-
-        PROF_START(kibnal_send); // time stamp send start 
-
-        CDEBUG(D_NET,"kibnal_send: sending %d bytes from %p to nid: 0x%Lx pid %d\n",
-               buf_length, iov, nid, HCA_PORT_1);
-
-
-        // do I need to check the gateway information
-        // do I have problem to send direct 
-        // do I have to forward a data packet to gateway
-        // 
-        // The current connection is back-to-back 
-        // I always know that data will be send from one-side to
-        // the other side
-        //
-        
-        //
-        //  check data buffer size 
-        //
-        //  MSG_SIZE_SMALL 
-        //      regular post send 
-        //  
-        //  MSG_SIZE_LARGE
-        //      rdma write
-        
-        if(buf_length <= SMALL_MSG_SIZE) {  
-           expected_buf_size = MSG_SIZE_SMALL;
-        } 
-        else { 
-          if(buf_length > MAX_MSG_SIZE) { 
-             CERROR("kibnal_send:request exceeds Transmit data size (%d).\n",
-                      MAX_MSG_SIZE);
-             rc = PTL_FAIL;
-             return rc;
-          }
-          else {
-             expected_buf_size = MSG_SIZE_LARGE; // this is a large data package 
-          } 
-        }
-                
-        // prepare data packet for send operation 
-        //
-        // allocate a data buffer "buf" with size of buf_len(header + payload)
-        //                 ---------------
-        //  buf            | hdr         |  size = sizeof(ptl_hdr_t)
-        //                 --------------
-        //                 |payload data |  size = len
-        //                 ---------------
-        
-        // copy header to buf 
-        memcpy(buf, hdr, sizeof(ptl_hdr_t));
-
-        // copy payload data from iov to buf
-        // use portals library function lib_copy_iov2buf()
-        
-        if (len != 0)
-           lib_copy_iov2buf(((char *)buf) + sizeof (ptl_hdr_t),
-                            niov, 
-                            iov, 
-                            len);
-
-        // buf is ready to do a post send 
-        // the send method is base on the buf_size 
-
-        CDEBUG(D_NET,"ib_send %d bytes (size %d) from %p to nid: 0x%Lx "
-               " port %d\n", buf_length, expected_buf_size, iov, nid, HCA_PORT_1);
-
-        switch(expected_buf_size) {
-          case MSG_SIZE_SMALL:
-            // send small message 
-            if((vstat = Send_Small_Msg(buf, buf_length)) != VAPI_OK){
-                CERROR("Send_Small_Msg() is failed\n");
-            } 
-            break;
-
-          case MSG_SIZE_LARGE:
-            // send small message 
-            if((vstat = Send_Large_Msg(buf, buf_length)) != VAPI_OK){
-                CERROR("Send_Large_Msg() is failed\n");
-            } 
-            break;
-
-          default:
-            CERROR("Unknown message size %d\n", expected_buf_size);
-            break;
-        }
-
-        PROF_FINISH(kibnal_send); // time stapm of send operation 
-
-        rc = PTL_OK;
-
-        return rc; 
-}
-
-//
-// kibnal_send_pages
-//
-// no support 
-//
-// do you need this 
-//
-int kibnal_send_pages(nal_cb_t * nal, 
-                      void *private, 
-                      lib_msg_t * cookie,
-                      ptl_hdr_t * hdr, 
-                      int type, 
-                      ptl_nid_t nid, 
-                      ptl_pid_t pid,
-                      unsigned int niov, 
-                      ptl_kiov_t *iov, 
-                      size_t mlen)
-{
-   int rc = PTL_FAIL;
-
-   CDEBUG(D_NET, "kibnal_send_pages\n");
-
-   // do nothing now for Infiniband 
-   
-   return rc;
-}
-
-
-
-
-
-//
-// kibnal_fwd_packet 
-//
-// no support 
-//
-// do you need this 
-//
-void kibnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
-{
-        CDEBUG(D_NET, "forwarding not implemented\n");
-        return;
-      
-}
-
-//
-// kibnal_callback 
-//
-// no support 
-//
-// do you need this 
-//
-void kibnal_callback(nal_cb_t * nal, 
-                           void *private, 
-                           lib_eq_t *eq,
-                           ptl_event_t *ev)
-{
-        CDEBUG(D_NET,  "callback not implemented\n");
-        return PTL_OK;
-}
-
-
-/* Process a received portals packet */
-//
-//  conver receiving data in to PORTALS header 
-//
-
-void kibnal_rx(kibnal_data_t    *kib, 
-                      VAPI_virt_addr_t buffer_addr,
-                      u_int32_t        buffer_len,
-                      u_int32_t        buffer_size,
-                      unsigned int     priority) 
-{
-        ptl_hdr_t  *hdr = (ptl_hdr_t *)  buffer_addr; // case to ptl header format 
-        kibnal_rx_t krx;
-
-        CDEBUG(D_NET,"kibnal_rx: buf %p, len %ld\n", buffer_addr, buffer_len);
-
-        if ( buffer_len < sizeof( ptl_hdr_t ) ) {
-                /* XXX what's this for? */
-                if (kib->kib_shuttingdown)
-                        return;
-                CERROR("kibnal_rx: did not receive complete portal header, "
-                       "len= %ld", buffer_len);
-
-                return;
-        }
-
-       // typedef struct {
-       //         char             *krx_buffer; // pointer to receiving buffer
-       //         unsigned long     krx_len;  // length of buffer
-       //         unsigned int      krx_size; //
-       //         unsigned int      krx_priority; // do we need this
-       //         struct list_head  krx_item;
-       // } kibnal_rx_t;
-       //
-        krx.krx_buffer    = hdr;
-        krx.krx_len       = buffer_len;
-        krx.krx_size      = buffer_size;
-        krx.krx_priority  = priority;
-
-        if ( hdr->dest_nid == kibnal_lib.ni.nid ) {
-           // this is my data 
-           PROF_START(lib_parse);
-
-           lib_parse(&kibnal_lib, (ptl_hdr_t *)krx.krx_buffer, &krx);
-
-           PROF_FINISH(lib_parse);
-        } else {
-           /* forward to gateway */
-           // Do we expect this happened ?
-           //      
-           CERROR("kibnal_rx: forwarding not implemented yet");
-        }
-
-        return;
-}
-
-
-
-
-//
-// kibnal_recv_pages 
-//
-// no support 
-//
-// do you need this 
-//
-int
-kibnal_recv_pages(nal_cb_t * nal, 
-                  void *private, 
-                  lib_msg_t * cookie,
-                  unsigned int niov, 
-                  ptl_kiov_t *iov, 
-                  size_t mlen,
-                  size_t rlen)
-{
-
-  CDEBUG(D_NET, "recv_pages not implemented\n");
-  return PTL_FAIL;
-       
-}
-
-
-int 
-kibnal_recv(nal_cb_t     *nal,
-            void         *private,
-            lib_msg_t    *cookie,
-            unsigned int  niov,
-            struct iovec *iov,
-            size_t        mlen,
-            size_t        rlen)
-{
-        kibnal_rx_t *krx = private;
-
-        CDEBUG(D_NET,"kibnal_recv: mlen=%d, rlen=%d\n", mlen, rlen);
-
-        /* What was actually received must be >= what sender claims to
-         * have sent. */
-        LASSERT (mlen <= rlen);
-
-        if (krx->krx_len < sizeof (ptl_hdr_t) + rlen)
-                return (PTL_FAIL);
-
-        PROF_START(kibnal_recv);
-
-        if(mlen != 0) {
-                PROF_START(memcpy);
-                lib_copy_buf2iov (niov, iov, krx->krx_buffer +
-                                  sizeof (ptl_hdr_t), mlen);
-                PROF_FINISH(memcpy);
-        }
-
-        PROF_START(lib_finalize);
-        
-        lib_finalize(nal, private, cookie, PTL_OK);
-        
-        PROF_FINISH(lib_finalize);
-        PROF_FINISH(kibnal_recv);
-
-        return PTL_OK;
-}
-
-//
-// kibnal_map 
-// no support 
-// do you need this 
-//
-int kibnal_map(nal_cb_t * nal, 
-               unsigned int niov, 
-               struct iovec *iov,
-               void **addrkey)
-{
-  CDEBUG(D_NET, "map not implemented\n");
-  return PTL_OK; 
-}
-
-
-
-//
-// kibnal_unmap
-//
-// no support 
-//
-// do you need this 
-//
-void kibnal_unmap(nal_cb_t * nal, 
-                  unsigned int niov, 
-                  struct iovec *iov,
-                  void **addrkey)
-{
-  CDEBUG(D_NET, "unmap not implemented\n");
-  return;
-}
-
-
-
-//
-// kibnal_map_pages 
-// no support 
-// do you need this 
-/* as (un)map, but with a set of page fragments */
-int kibnal_map_pages(nal_cb_t * nal, 
-                     unsigned int niov, 
-                     ptl_kiov_t *iov,
-                     void **addrkey)
-{
-  CDEBUG(D_NET, "map_pages not implemented\n");
-  return PTL_OK;
-}
-
-
-
-//
-// kibnal_unmap_pages 
-//
-// no support 
-//
-// do you need this 
-//
-void kibnal_unmap_pages(nal_cb_t * nal, 
-                               unsigned int niov, 
-                               ptl_kiov_t *iov,
-                               void **addrkey)
-{
-  CDEBUG(D_NET, "unmap_pages not implemented\n");
-  return ;
-}
-
-
-int kibnal_end(kibnal_data_t *kib)
-{
-
-  /* wait for sends to finish ? */
-  /* remove receive buffers */
-  /* shutdown receive thread */
-
-  CDEBUG(D_NET, "kibnal_end\n");
-  IB_Close_HCA();
-
-  return 0;
-}
-
-
-//
-//
-//  asynchronous event handler: response to some unexpetced operation errors 
-//    
-//  void async_event_handler(VAPI_hca_hndl_t      hca_hndl,
-//                           VAPI_event_record_t *event_record_p,
-//                           void*                private_data)
-//  the HCA drive will prepare evetn_record_p                        
-//
-//  this handler is registered with VAPI_set_async_event_handler()
-//  VAPI_set_async_event_handler() is issued when an HCA is created 
-//
-//
-void async_event_handler(VAPI_hca_hndl_t      hca_hndl,
-                         VAPI_event_record_t *event_record_p,  
-                         void*                private_data)
-{
-  //
-  // * event_record_p is prepared by the system when an async
-  //   event happened
-  // * what to do with private_data 
-  // * do we expect more async events happened if so what are they 
-  //
-  //   only log ERROR message now 
-
-  switch (event_record_p->type) {
-    case VAPI_PORT_ERROR:
-         printk("Got PORT_ERROR event. port number=%d\n", 
-                 event_record_p->modifier.port_num);
-         break;
-    case VAPI_PORT_ACTIVE:
-         printk("Got PORT_ACTIVE event. port number=%d\n", 
-                 event_record_p->modifier.port_num);
-         break;
-    case VAPI_QP_PATH_MIGRATED:    /*QP*/
-         printk("Got P_PATH_MIGRATED event. qp_hndl=%lu\n", 
-                 event_record_p->modifier.qp_hndl);
-         break;
-    case VAPI_EEC_PATH_MIGRATED:   /*EEC*/
-         printk("Got EEC_PATH_MIGRATED event. eec_hndl=%d\n", 
-                 event_record_p->modifier.eec_hndl);
-         break;
-    case VAPI_QP_COMM_ESTABLISHED: /*QP*/
-         printk("Got QP_COMM_ESTABLISHED event. qp_hndl=%lu\n", 
-                 event_record_p->modifier.qp_hndl);
-         break;
-    case VAPI_EEC_COMM_ESTABLISHED: /*EEC*/
-         printk("Got EEC_COMM_ESTABLISHED event. eec_hndl=%d\n",
-                 event_record_p->modifier.eec_hndl);
-         break;
-    case VAPI_SEND_QUEUE_DRAINED:  /*QP*/
-         printk("Got SEND_QUEUE_DRAINED event. qp_hndl=%lu\n", 
-                 event_record_p->modifier.qp_hndl);
-         break;
-    case VAPI_CQ_ERROR:            /*CQ*/
-         printk("Got CQ_ERROR event. cq_hndl=%lu\n", 
-                 event_record_p->modifier.cq_hndl);
-         break;
-    case VAPI_LOCAL_WQ_INV_REQUEST_ERROR: /*QP*/
-         printk("Got LOCAL_WQ_INV_REQUEST_ERROR event. qp_hndl=%lu\n", 
-                 event_record_p->modifier.qp_hndl);
-         break;
-    case VAPI_LOCAL_WQ_ACCESS_VIOL_ERROR: /*QP*/
-         printk("Got LOCAL_WQ_ACCESS_VIOL_ERROR event. qp_hndl=%lu\n", 
-                 event_record_p->modifier.qp_hndl);
-         break;
-    case VAPI_LOCAL_WQ_CATASTROPHIC_ERROR: /*QP*/
-         printk("Got LOCAL_WQ_CATASTROPHIC_ERROR event. qp_hndl=%lu\n", 
-                 event_record_p->modifier.qp_hndl);
-         break;
-    case VAPI_PATH_MIG_REQ_ERROR:  /*QP*/
-         printk("Got PATH_MIG_REQ_ERROR event. qp_hndl=%lu\n", 
-                 event_record_p->modifier.qp_hndl);
-         break;
-    case VAPI_LOCAL_CATASTROPHIC_ERROR: /*none*/
-         printk("Got LOCAL_CATASTROPHIC_ERROR event. \n");
-         break;
-    default:
-         printk(":got non-valid event type=%d. IGNORING\n",
-                    event_record_p->type);
-  }
-
-}
-
-
-
-
-VAPI_wr_id_t 
-search_send_buf(int buf_length)
-{
-  VAPI_wr_id_t send_id = -1;
-  u_int32_t    i;
-  int          flag = NO;
-  int          loop_count = 0;  
-
-  CDEBUG(D_NET, "search_send_buf \n");
-  
-  while((flag == NO) && (loop_count < MAX_LOOP_COUNT)) {
-    for(i=0; i < NUM_ENTRY; i++) {
-      // problem about using spinlock
-      spin_lock(&MSB_mutex[i]);
-      if(MSbuf_list[i].status == BUF_REGISTERED)  {
-        MSbuf_list[i].status = BUF_INUSE;// make send buf as inuse
-        flag =  YES;
-        spin_unlock(&MSB_mutex[i]);
-        break;
-      }
-      else
-        spin_unlock(&MSB_mutex[i]); 
-    }
-
-    loop_count++;
-    schedule_timeout(200); // wait for a while 
-  }
-   
-  if(flag == NO)  {
-    CDEBUG(D_NET, "search_send_buf: could not locate an entry in MSbuf_list\n");
-  }
-
-  send_id = (VAPI_wr_id_t ) i;
-
-  return send_id;
-}
-
-
-
-VAPI_wr_id_t 
-search_RDMA_recv_buf(int buf_length)
-{
-  VAPI_wr_id_t recv_id = -1;
-  u_int32_t    i;
-  int          flag = NO;
-  int          loop_count = 0;  
-
-  CDEBUG(D_NET, "search_RDMA_recv_buf\n");
-
-  while((flag == NO) && (loop_count < MAX_LOOP_COUNT)) {
-
-    for(i=NUM_ENTRY; i < NUM_MBUF; i++) {
-
-      spin_lock(&MSB_mutex[i]);
-
-      if((MRbuf_list[i].status == BUF_REGISTERED)  &&
-         (MRbuf_list[i].buf_size >= buf_length)) {
-          MSbuf_list[i].status = BUF_INUSE;// make send buf as inuse
-          flag =  YES;
-          spin_unlock(&MSB_mutex[i]);
-          break;
-      }
-      else
-        spin_unlock(&MSB_mutex[i]);
-    }
-
-    loop_count++;
-
-    schedule_timeout(200); // wait for a while 
-  }
-   
-  if(flag == NO)  {
-    CERROR("search_RDMA_recv_buf: could not locate an entry in MBbuf_list\n");
-  }
-
-  recv_id = (VAPI_wr_id_t ) i;
-
-  return recv_id;
-
-}
-
-
-
-
-
-
-
-VAPI_ret_t Send_Small_Msg(char *buf, int buf_length)
-{
- VAPI_ret_t           vstat;
- VAPI_sr_desc_t       sr_desc;
- VAPI_sg_lst_entry_t  sr_sg;
- QP_info              *qp;
- VAPI_wr_id_t         send_id;
-
- CDEBUG(D_NET, "Send_Small_Msg\n");
-
- send_id = search_send_buf(buf_length); 
-
- if(send_id < 0){
-   CERROR("Send_Small_Msg: Can not find a QP \n");
-   return(~VAPI_OK);
- }
-
- qp = &QP_list[(int) send_id];
-
- // find a suitable/registered send_buf from MSbuf_list
- CDEBUG(D_NET, "Send_Small_Msg: current send id  %d \n", send_id);
-
- sr_desc.opcode    = VAPI_SEND;
- sr_desc.comp_type = VAPI_SIGNALED;
- sr_desc.id        =  send_id;
-
-
- // scatter and gather info 
- sr_sg.len  = buf_length;
- sr_sg.lkey = MSbuf_list[send_id].mr.l_key; // use send MR 
-
- sr_sg.addr = (VAPI_virt_addr_t)(MT_virt_addr_t) MSbuf_list[send_id].buf_addr;
-
- // copy data to register send buffer 
- memcpy(&sr_sg.addr, buf, buf_length);
-
- sr_desc.sg_lst_p = &sr_sg;
- sr_desc.sg_lst_len = 1; // only 1 entry is used 
- sr_desc.fence = TRUE;
- sr_desc.set_se = FALSE;
-
- // call VAPI_post_sr to send out this data 
- vstat = VAPI_post_sr(qp->hca_hndl, qp->qp_hndl, &sr_desc);
-
- if (vstat != VAPI_OK) {
-    CERROR("VAPI_post_sr failed (%s).\n",VAPI_strerror(vstat));
- }
-
- CDEBUG(D_NET, "VAPI_post_sr success.\n");
-
- return (vstat);
-
-}
-
-
-
-
-VAPI_wr_id_t
-RTS_handshaking_protocol(int buf_length) 
-{
-
- VAPI_ret_t           vstat;
- VAPI_sr_desc_t       sr_desc;
- VAPI_sg_lst_entry_t  sr_sg;
- VAPI_wr_id_t         send_id;
-
- RDMA_Info_Exchange   rdma_info;
-
- rdma_info.opcode     = Ready_To_send;
- rdma_info.buf_length = buf_length; 
- rdma_info.raddr      = (VAPI_virt_addr_t) 0;
- rdma_info.rkey       = (VAPI_rkey_t) 0 ; 
-
- QP_info              *qp;
-
- CDEBUG(D_NET, "RTS_handshaking_protocol\n");
-
- // find a suitable/registered send_buf from MSbuf_list
- send_id = search_send_buf(sizeof(RDMA_Info_Exchange));   
-
- qp = &QP_list[(int) send_id];
-
- CDEBUG(D_NET, "RTS_CTS: current send id  %d \n", send_id);
- sr_desc.opcode    = VAPI_SEND;
- sr_desc.comp_type = VAPI_SIGNALED;
- sr_desc.id        = send_id + RDMA_RTS_ID;// this RTS mesage ID 
-
- // scatter and gather info 
- sr_sg.len  = sizeof(RDMA_Info_Exchange);
- sr_sg.lkey = MSbuf_list[send_id].mr.l_key; // use send MR 
- sr_sg.addr = (VAPI_virt_addr_t)(MT_virt_addr_t) MSbuf_list[send_id].buf_addr;
-
- // copy data to register send buffer 
- memcpy(&sr_sg.addr, &rdma_info, sizeof(RDMA_Info_Exchange));
-
- sr_desc.sg_lst_p = &sr_sg;
- sr_desc.sg_lst_len = 1; // only 1 entry is used 
- sr_desc.fence = TRUE;
- sr_desc.set_se = FALSE;
-
- // call VAPI_post_sr to send out this RTS message data 
- vstat = VAPI_post_sr(qp->hca_hndl, qp->qp_hndl, &sr_desc);
-
- if (vstat != VAPI_OK) {
-    CERROR("RTS: VAPI_post_sr failed (%s).\n",VAPI_strerror_sym(vstat));
- }
-
- return send_id;
-
-}
-
-
-
-// create local receiving Memory Region for a HCA
-VAPI_ret_t
-createMemRegion_RDMA(VAPI_hca_hndl_t  hca_hndl,
-                     VAPI_pd_hndl_t   pd_hndl,
-                     char            *bufptr,
-                     int              buf_length,
-                     VAPI_mr_hndl_t   *rep_mr_hndl,
-                     VAPI_mrw_t       *rep_mr)
-{
-  VAPI_ret_t      vstat;
-  VAPI_mrw_t      mrw;
-  
-  CDEBUG(D_NET, "createMemRegion_RDMA\n");
-
-  // memory region address and size of memory region
-  // allocate a block of memory for this HCA 
-  // RDMA data buffer
-  
-  
-  if(bufptr == NULL) {
-    // need to allcate a local buffer to receive data from a
-    // remore VAPI_RDMA_WRITE_IMM
-    PORTAL_ALLOC(bufptr, buf_length);
-  }
-
-  if(bufptr == NULL) {
-    CDEBUG(D_MALLOC, "Failed to malloc a block of RDMA receiving memory, size %d\n",
-                                    buf_length);
-    return(VAPI_ENOMEM);
-  }
-
-  /* Register RDAM data Memory region */
-  CDEBUG(D_NET, "Register a RDMA data memory region\n");
-
-  mrw.type   = VAPI_MR;
-  mrw.pd_hndl= pd_hndl;
-  mrw.start  = (VAPI_virt_addr_t )(MT_virt_addr_t )bufptr;
-  mrw.size   = buf_length;
-  mrw.acl    = VAPI_EN_LOCAL_WRITE  | 
-               VAPI_EN_REMOTE_WRITE | 
-               VAPI_EN_REMOTE_READ;
-
-  // register send memory region
-  vstat = VAPI_register_mr(hca_hndl,
-                           &mrw,
-                           rep_mr_hndl,
-                           rep_mr);
-
-  // this memory region is going to be reused until deregister is called
-  if (vstat != VAPI_OK) {
-     CERROR("Failed registering a mem region Addr=%p, Len=%d. %s\n",
-             bufptr, buf_length, VAPI_strerror(vstat));
-  }
-
-  return(vstat);
-
-}
-
-
-
-RDMA_Info_Exchange  Local_rdma_info;
-
-int insert_MRbuf_list(int buf_lenght)
-{
-  int  recv_id = NUM_ENTRY;      
-
-  CDEBUG(D_NET, "insert_MRbuf_list\n");
-
-  for(recv_id= NUM_ENTRY; recv_id < NUM_MBUF; recv_id++){
-       if(BUF_UNREGISTERED == MRbuf_list[recv_id].status)  {
-         MRbuf_list[recv_id].status   = BUF_UNREGISTERED;
-         MRbuf_list[recv_id].buf_size = buf_lenght;
-         break;
-       }
-  }
-
-  return recv_id;
-
-}  
-
-VAPI_wr_id_t
-CTS_handshaking_protocol(RDMA_Info_Exchange *rdma_info) 
-{
-
- VAPI_ret_t           vstat;
- VAPI_sr_desc_t       sr_desc;
- VAPI_sg_lst_entry_t  sr_sg;
- QP_info             *qp;
- VAPI_wr_id_t         send_id;
- VAPI_mr_hndl_t       rep_mr_hndl;
- VAPI_mrw_t           rep_mr;
- int                  recv_id;
- char                *bufptr = NULL;
-
- // search MRbuf_list for an available entry that
- // has registered data buffer with size equal to rdma_info->buf_lenght
-
- CDEBUG(D_NET, "CTS_handshaking_protocol\n");
-
- // register memory buffer for RDAM operation
-
- vstat = createMemRegion_RDMA(Hca_hndl,
-                              Pd_hndl,
-                              bufptr, 
-                              rdma_info->buf_length,
-                              &rep_mr_hndl,
-                              &rep_mr);
-
-
- Local_rdma_info.opcode            = Clear_To_send;
- Local_rdma_info.recv_rdma_mr      = rep_mr;
- Local_rdma_info.recv_rdma_mr_hndl = rep_mr_hndl;
-
- if (vstat != VAPI_OK) {
-    CERROR("CST_handshaking_protocol: Failed registering a mem region"
-           "Len=%d. %s\n", rdma_info->buf_length, VAPI_strerror(vstat));
-    Local_rdma_info.flag = RDMA_BUFFER_UNAVAILABLE;
- }
- else {
-    // successfully allcate reserved RDAM data buffer 
-    recv_id = insert_MRbuf_list(rdma_info->buf_length);   
-
-    if(recv_id >=  NUM_ENTRY) { 
-      MRbuf_list[recv_id].buf_addr     = rep_mr.start;
-      MRbuf_list[recv_id].mr           = rep_mr;
-      MRbuf_list[recv_id].mr_hndl      = rep_mr_hndl;
-      MRbuf_list[recv_id].ref_count    = 0;
-      Local_rdma_info.flag             = RDMA_BUFFER_RESERVED;
-      Local_rdma_info.buf_length       = rdma_info->buf_length; 
-      Local_rdma_info.raddr            = rep_mr.start;
-      Local_rdma_info.rkey             = rep_mr.r_key; 
-    }
-    else {
-      CERROR("Can not find an entry in MRbuf_list - how could this happen\n");  
-    }
- }
-
- // find a suitable/registered send_buf from MSbuf_list
- send_id = search_send_buf(sizeof(RDMA_Info_Exchange)); 
- CDEBUG(D_NET, "CTS: current send id  %d \n", send_id);
- sr_desc.opcode    = VAPI_SEND;
- sr_desc.comp_type = VAPI_SIGNALED;
- sr_desc.id        = send_id + RDMA_CTS_ID; // this CST message ID 
-
- // scatter and gather info 
- sr_sg.len  = sizeof(RDMA_Info_Exchange);
- sr_sg.lkey = MSbuf_list[send_id].mr.l_key; // use send MR 
- sr_sg.addr = (VAPI_virt_addr_t)(MT_virt_addr_t) MSbuf_list[send_id].buf_addr;
-
- // copy data to register send buffer 
- memcpy(&sr_sg.addr, &Local_rdma_info, sizeof(RDMA_Info_Exchange));
-
- sr_desc.sg_lst_p   = &sr_sg;
- sr_desc.sg_lst_len = 1; // only 1 entry is used 
- sr_desc.fence = TRUE;
- sr_desc.set_se = FALSE;
-
- // call VAPI_post_sr to send out this RTS message data 
- vstat = VAPI_post_sr(qp->hca_hndl, qp->qp_hndl, &sr_desc);
-
- if (vstat != VAPI_OK) {
-    CERROR("CTS: VAPI_post_sr failed (%s).\n",VAPI_strerror(vstat));
- }
-
-
-}
-
-
-
-VAPI_ret_t Send_Large_Msg(char *buf, int buf_length)
-{
-  VAPI_ret_t           vstat;
-  VAPI_sr_desc_t       sr_desc;
-  VAPI_sg_lst_entry_t  sr_sg;
-  QP_info             *qp;
-  VAPI_mrw_t           rep_mr; 
-  VAPI_mr_hndl_t       rep_mr_hndl;
-  int                  send_id;
-  VAPI_imm_data_t      imm_data = 0XAAAA5555;
-
-
-  CDEBUG(D_NET, "Send_Large_Msg: Enter\n");
-
-  // register this large buf 
-  // don't need to copy this buf to send buffer
-  vstat = createMemRegion_RDMA(Hca_hndl,
-                               Pd_hndl,
-                               buf,
-                               buf_length,
-                               &rep_mr_hndl,
-                               &rep_mr);
-
-  if (vstat != VAPI_OK) {
-    CERROR("Send_Large_M\sg:  createMemRegion_RDMAi() failed (%s).\n",
-                        VAPI_strerror(vstat));
-  }
-  
-
-  Local_rdma_info.send_rdma_mr      = rep_mr;
-  Local_rdma_info.send_rdma_mr_hndl = rep_mr_hndl;
-
-  //
-  //     Prepare descriptor for send queue
-  //
-  // ask for a remote rdma buffer with size buf_lenght
-  send_id = RTS_handshaking_protocol(buf_length); 
-
-  qp = &QP_list[send_id];
-
-  // wait for CTS message receiving from remote node 
-  while(1){
-     if(YES == Cts_Message_arrived) {
-        // receive CST message from remote node 
-        // Rdma_info is available for use
-        break;
-     }
-     schedule_timeout(RTS_CTS_TIMEOUT);
-  }
-  
-  sr_desc.id        = send_id + RDMA_OP_ID;
-  sr_desc.opcode    = VAPI_RDMA_WRITE_WITH_IMM;
-  sr_desc.comp_type = VAPI_SIGNALED;
-
-  // scatter and gather info 
-  sr_sg.len  = buf_length;
-
-  // rdma mr 
-  sr_sg.lkey = rep_mr.l_key;  
-  sr_sg.addr = (VAPI_virt_addr_t)(MT_virt_addr_t) rep_mr.start;
-  sr_desc.sg_lst_p = &sr_sg;
-  sr_desc.sg_lst_len = 1; // only 1 entry is used 
-
-  // immediate data - not used here 
-  sr_desc.imm_data = imm_data;
-  sr_desc.fence = TRUE;
-  sr_desc.set_se = FALSE;
-
-  // RDAM operation only
-  // raddr and rkey is receiving from remote node  
-  sr_desc.remote_addr = Rdma_info.raddr;
-  sr_desc.r_key       = Rdma_info.rkey;
-
-  // call VAPI_post_sr to send out this data 
-  vstat = VAPI_post_sr(qp->hca_hndl, qp->qp_hndl, &sr_desc);
-
-  if (vstat != VAPI_OK) {
-     CERROR("VAPI_post_sr failed (%s).\n",VAPI_strerror_sym(vstat));
-  }
-
-}
-
-
-
-
-
-
-//
-//  repost_recv_buf
-//  post a used recv buffer back to recv WQE list 
-//  wrq_id is used to indicate the starting position of recv-buffer 
-//
-VAPI_ret_t 
-repost_recv_buf(QP_info      *qp,
-                VAPI_wr_id_t  wrq_id) 
-{
-  VAPI_rr_desc_t       rr;
-  VAPI_sg_lst_entry_t  sg_entry;
-  VAPI_ret_t           ret;
-
-  CDEBUG(D_NET, "repost_recv_buf\n");
-
-  sg_entry.lkey = MRbuf_list[wrq_id].mr.l_key;
-  sg_entry.len  = MRbuf_list[wrq_id].buf_size;
-  sg_entry.addr = (VAPI_virt_addr_t)(MT_virt_addr_t) MRbuf_list[wrq_id].buf_addr;
-  rr.opcode     = VAPI_RECEIVE;
-  rr.comp_type  = VAPI_SIGNALED; /* All with CQE (IB compliant) */
-  rr.sg_lst_len = 1; /* single buffers */
-  rr.sg_lst_p   = &sg_entry;
-  rr.id         = wrq_id; /* WQE id used is the index to buffers ptr array */
-
-  ret= VAPI_post_rr(qp->hca_hndl,qp->qp_hndl,&rr);
-     
-  if (ret != VAPI_OK){
-     CERROR("failed reposting RQ WQE (%s) buffer \n",VAPI_strerror_sym(ret));
-     return ret;
-  }
-
-  CDEBUG(D_NET, "Successfully reposting an RQ WQE %d recv bufer \n", wrq_id);
-
-  return ret ;
-}
-                       
-//
-// post_recv_bufs
-//     post "num_o_bufs" for receiving data
-//      each receiving buf (buffer starting address, size of buffer)
-//      each buffer is associated with an id 
-//
-int 
-post_recv_bufs(VAPI_wr_id_t  start_id)
-{
-  int i;
-  VAPI_rr_desc_t       rr;
-  VAPI_sg_lst_entry_t  sg_entry;
-  VAPI_ret_t           ret;
-
-  CDEBUG(D_NET, "post_recv_bufs\n");
-
-  for(i=0; i< NUM_ENTRY; i++) {
-    sg_entry.lkey = MRbuf_list[i].mr.l_key;
-    sg_entry.len  = MRbuf_list[i].buf_size;
-    sg_entry.addr = (VAPI_virt_addr_t)(MT_virt_addr_t) MRbuf_list[i].buf_addr;
-    rr.opcode     = VAPI_RECEIVE;
-    rr.comp_type  = VAPI_SIGNALED;  /* All with CQE (IB compliant) */
-    rr.sg_lst_len = 1; /* single buffers */
-    rr.sg_lst_p   = &sg_entry;
-    rr.id         = start_id+i; /* WQE id used is the index to buffers ptr array */
-
-    ret= VAPI_post_rr(QP_list[i].hca_hndl,QP_list[i].qp_hndl, &rr);
-    if (ret != VAPI_OK) {
-       CERROR("failed posting RQ WQE (%s)\n",VAPI_strerror_sym(ret));
-       return i;
-    } 
-  }
-
-  return i; /* num of buffers posted */
-}
-                       
-int 
-post_RDMA_bufs(QP_info      *qp, 
-               void         *buf_array,
-               unsigned int  num_bufs,
-               unsigned int  buf_size,
-               VAPI_wr_id_t  start_id)
-{
-
-  CDEBUG(D_NET, "post_RDMA_bufs \n");
-  return YES;
-}
-
-
-
-//
-// LIB NAL
-// assign function pointers to theirs corresponding entries
-//
-
-nal_cb_t kibnal_lib = {
-        nal_data:       &kibnal_data,  /* NAL private data */
-        cb_send:        kibnal_send,
-        cb_send_pages:  NULL, // not implemented  
-        cb_recv:        kibnal_recv,
-        cb_recv_pages:  NULL, // not implemented 
-        cb_read:        kibnal_read,
-        cb_write:       kibnal_write,
-        cb_callback:    NULL, // not implemented 
-        cb_malloc:      kibnal_malloc,
-        cb_free:        kibnal_free,
-        cb_map:         NULL, // not implemented 
-        cb_unmap:       NULL, // not implemented 
-        cb_map_pages:   NULL, // not implemented 
-        cb_unmap_pages: NULL, // not implemented 
-        cb_printf:      kibnal_printf,
-        cb_cli:         kibnal_cli,
-        cb_sti:         kibnal_sti,
-        cb_dist:        kibnal_dist // no used at this moment 
-};
diff --git a/lustre/portals/knals/ibnal/ibnal_send_recv_self_testing.c b/lustre/portals/knals/ibnal/ibnal_send_recv_self_testing.c
deleted file mode 100644 (file)
index 82defdb..0000000
+++ /dev/null
@@ -1,116 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *  *
- *  * Based on ksocknal, qswnal, and gmnal
- *  *
- *  * Copyright (C) 2003 LANL
- *  *   Author: HB Chen <hbchen@lanl.gov>
- *  *   Los Alamos National Lab
- *  *
- *  *   Portals is free software; you can redistribute it and/or
- *  *   modify it under the terms of version 2 of the GNU General Public
- *  *   License as published by the Free Software Foundation.
- *  *
- *  *   Portals is distributed in the hope that it will be useful,
- *  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  *   GNU General Public License for more details.
- *  *
- *  *   You should have received a copy of the GNU General Public License
- *  *   along with Portals; if not, write to the Free Software
- *  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *  *
- *  */
-
-#include "ibnal.h"
-
-
-
-VAPI_ret_t ibnal_send_recv_self_testing()
-{
- VAPI_ret_t           vstat;
- VAPI_sr_desc_t       sr_desc;
- VAPI_sg_lst_entry_t  sr_sg;
- QP_info              *qp;
- VAPI_wr_id_t         send_id;
- int                  buf_id;
- char                 sbuf[KB_32];
- char                 rbuf[KB_32];
- int                  i;
- int                  buf_length = KB_32;
- VAPI_wc_desc_t       comp_desc;
- int                  num_send = 1;
- int                  loop_count = 0;
-
-
- printk("ibnal_send_recv_self_testing\n");
-
- memset(&sbuf, 'a', KB_32);
- memset(&rbuf, ' ', KB_32);
- send_id = 2222; 
- buf_id = 0;
-
- qp = &QP_list[0];
-
- sr_desc.opcode    = VAPI_SEND;
- sr_desc.comp_type = VAPI_SIGNALED;
-
- // scatter and gather info
- sr_sg.len  = KB_32;
- sr_sg.lkey = MSbuf_list[buf_id].mr.l_key; // use send MR
- sr_sg.addr = (VAPI_virt_addr_t)(MT_virt_addr_t) MSbuf_list[buf_id].buf_addr;
-
- // copy data to register send buffer
- memcpy(&sr_sg.addr, &buf, buf_length);
-
- sr_desc.sg_lst_p = &sr_sg;
- sr_desc.sg_lst_len = 1; // only 1 entry is used
- sr_desc.fence = TRUE;
- sr_desc.set_se = FALSE;
-
-
- // call VAPI_post_sr to send out this data
- vstat = VAPI_post_sr(qp->hca_hndl, qp->qp_hndl, &sr_desc);
-
- if (vstat != VAPI_OK) {
-   printk("VAPI_post_sr failed (%s).\n",VAPI_strerror(vstat));
- }
-
- printk("VAPI_post_sr success.\n");
-
- // poll for completion
-
- while( loop_count < 100 ){
-   vstat = VAPI_poll_cq(qp->hca_hndl, qp->cq_hndl, &comp_desc);
-   if( vstat == VAPI_OK ) {
-       if(comp_desc.opcode == VAPI_CQE_SQ_SEND_DATA ) {
-          /* SEND completion */
-         printk("received SQ completion\n");
-       }
-       else { 
-          if(comp_desc.opcode == VAPI_CQE_RQ_SEND_DATA ) {
-           /* RECEIVE completion */
-            printk("received RQ completion\n");
-            memcpy(&rbuf, (char *) MRbuf_list[buf_id].buf_addar, KB_32);
-           
-           int n;
-
-           n = memcmp($sbuf, &rbuf, KB_32);
-           printk("compare sbuf and rbuf  n = %d\n", n); 
-           
-          }
-                 else  {
-            printk("unexpected completion opcode %d \n", comp_desc.opcode);
-         }
-       }
-   }
-
-   loop_count++; 
-   schedule_timeout(500);
- }
-
- printk("end of ibnal_self_send_recv_testing\n");
-
-
-}
diff --git a/lustre/portals/knals/ibnal/uagent.c b/lustre/portals/knals/ibnal/uagent.c
deleted file mode 100644 (file)
index d7e939a..0000000
+++ /dev/null
@@ -1,391 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <pthread.h>
-
-
-#include <linux/shm.h>
-#include <linux/ipc.h>
-#include <linux/stat.h>
-#include <linux/types.h>
-
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <arpa/inet.h>
-#include <unistd.h>
-
-// Infiniband VAPI/EVAPI header files Mellanox MT23108 VAPI
-#include <vapi.h>
-#include <vapi_types.h>
-#include <vapi_common.h>
-#include <evapi.h>
-
-// Remote HCA Info information
- typedef struct Remote_HCA_Info {
-       unsigned long     opcode;
-       unsigned long     length;
-       IB_lid_t          dlid[256];
-       VAPI_qp_num_t     rqp_num[256];
-       VAPI_rkey_t       rkey;   // for remote RDAM request
-       unsigned long     vaddr1; // virtual address fisrt 4 bytes
-       unsigned long     vaddr2; // virtual address second 4 bytes
-       u_int32_t         size;   // size of RDMA memory buffer
-       char              dest_ip[256]; //destination server IP address 
- } Remote_HCA_Info;
-
-#define SHARED_SEGMENT_SIZE  0x10000 // 16KB shared memory between U and K
-
-// some internals opcodes for IB operations used in IBNAL
-#define SEND_QP_INFO          0X00000001
-#define RECV_QP_INFO          0X00000010
-#define DEFAULT_SOCKET_PORT   11211 
-#define LISTEN_QUEUE_SIZE     2048 
-#define DEST_IP                      "10.128.105.26"
-
-// server_thread
-// + wait for an incoming connection from remote node 
-// + receive remote HCA's data 
-//
-//
-//
-//
-// 
-void *server_thread(void *vargp)
-{
-  Remote_HCA_Info   *hca_data;
-  Remote_HCA_Info   hca_data_buffer;
-  
-  int    serverfd;
-  int    infd;
-  struct hostent  *hp;
-  struct sockaddr_in serveraddr;
-  struct sockaddr_in clientaddr;
-  int    sin_size=sizeof(struct sockaddr_in);
-  int   bytes_recv;
-  int    i;
-
-
-  hca_data = (Remote_HCA_Info *) vargp;
-  
-  if((serverfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
-    printf("server_thread couldnot create a socket \n");
-    pthread_exit((void *) 0);
-  }
-  printf("server_thread create a socket \n");
-
-  bzero((char *) &serveraddr, sizeof(serveraddr));
-
-  serveraddr.sin_family = AF_INET;
-  serveraddr.sin_addr.s_addr = htons(INADDR_ANY);
-  serveraddr.sin_port = htons((unsigned short) DEFAULT_SOCKET_PORT);
-  
-  if(bind(serverfd,(struct sockaddr *)&serveraddr,sizeof(struct sockaddr)) < 0) {
-    printf("server_thread couldnot bind to a socket \n");
-    pthread_exit((void *) 0);
-  }
-
-  printf("server_thread bind to a socket \n");
-
-  if(listen(serverfd, LISTEN_QUEUE_SIZE) < 0) {
-    printf("server_thread couldnot listen to a socket \n");
-    pthread_exit((void *) 0);
-  }
-
-  printf("server_thread listen to a socket \n");
-
-  //
-  // I only expect to receive one HCA data from a remote HCA 
-  //
-  printf("server_thread: Waiting for a connection\n");
-  infd= accept(serverfd,(struct sockaddr*)&clientaddr,&sin_size);
-  printf("server_thread: Got an incoming connection");
-
-  /* receive data from socket into buffer */
-  bytes_recv = recv(infd,
-                    &hca_data_buffer,  
-                    sizeof(Remote_HCA_Info),
-                   0);
-
-  if(bytes_recv > 0) {
-/*       
-      printf("server_thread receive data\n");
-      printf("opcode is 0x%X\n", hca_data_buffer.opcode);
-      printf("length is 0x%X\n", hca_data_buffer.length);
-
-      for(i=0; i < 256; i++) {
-        printf("dlid %d is 0x%X\n", i, hca_data_buffer.dlid[i]);
-        printf("rqp_num %d is 0x%X\n", hca_data_buffer.rqp_num[i]);
-      }
-
-      printf("rkey is 0x%X\n", hca_data_buffer.rkey);
-      printf("vaddr1 is 0x%X\n", hca_data_buffer.vaddr1);
-      printf("vaddr2 is 0x%X\n", hca_data_buffer.vaddr2);
-      printf("size is 0x%X\n", hca_data_buffer.size);
-      printf("After conversion hton \n");
-      printf("opcode is 0x%X\n", htonl(hca_data_buffer.opcode));
-      printf("length is 0x%X\n", htonl(hca_data_buffer.length));
-
-      for(i=0; i < 256; i++) {
-        printf("dlid %d is 0x%X\n", htons(hca_data_buffer.dlid[i]));
-        printf("rqp_num %d is 0x%X\n", htonl(hca_data_buffer.rqp_num[i]));
-      }
-
-      printf("rkey is 0x%X\n", htonl(hca_data_buffer.rkey));
-      printf("vaddr1 is 0x%X\n", htonl(hca_data_buffer.vaddr1));
-      printf("vaddr2 is 0x%X\n", htonl(hca_data_buffer.vaddr2));
-      printf("size is 0x%X\n", htonl(hca_data_buffer.size));
-*/     
-
-      hca_data->opcode  = ntohl(hca_data_buffer.opcode); // long 
-      hca_data->length  = ntohl(hca_data_buffer.length); // long
-
-      for(i=0; i < 256; i++) {
-        hca_data->dlid[i]    = ntohs(hca_data_buffer.dlid[i]);   // u_int16
-        hca_data->rqp_num[i] = ntohl(hca_data_buffer.rqp_num[i]);// u_int32
-      }
-
-      hca_data->rkey    = ntohl(hca_data_buffer.rkey);   // u_int32
-      hca_data->vaddr1  = ntohl(hca_data_buffer.vaddr1); // first word u_int32
-      hca_data->vaddr2  = ntohl(hca_data_buffer.vaddr2); // second word u_int32
-      hca_data->size    = ntohl(hca_data_buffer.size);   // u_int32
-    }
-    else {
-      printf("server_thread receive ERROR bytes_recv = %d\n", bytes_recv);
-    }
-
-    close(infd);
-    close(serverfd);
-
-  printf("server_thread EXIT \n");
-      
-  pthread_exit((void *) 0);
-
-}
-
-//
-// client_thread 
-// + connect to a remote server_thread
-// + send local HCA's data to remote server_thread
-//
-void *client_thread(void *vargp)
-{
-
-  Remote_HCA_Info   *hca_data;
-  Remote_HCA_Info   hca_data_buffer;
-
-  int    clientfd;
-  struct hostent  *hp;
-  struct sockaddr_in clientaddr;
-  int    bytes_send;
-  int    i;
-  
-  hca_data = (Remote_HCA_Info *) vargp;
-
-  if((clientfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
-    printf("client_thread couldnot create a socket \n");
-    pthread_exit((void *) 0);
-  }
-  printf("client_thread create a socket \n");
-  
-  bzero((char *) &clientaddr, sizeof(clientaddr));
-
-  clientaddr.sin_family = AF_INET;
-  clientaddr.sin_addr.s_addr = inet_addr(hca_data->dest_ip);
-  printf("client_thread get server Ip address = %s\n", hca_data->dest_ip);
-  clientaddr.sin_port = htons((unsigned short) DEFAULT_SOCKET_PORT);
-  memset(&(clientaddr.sin_zero), '\0', 8);
-
-  connect(clientfd, (struct sockaddr *) &clientaddr, sizeof(struct sockaddr));
-
-  printf("client_thread connect to  server Ip address = %s\n", hca_data->dest_ip);
-
-  hca_data_buffer.opcode  = htonl(hca_data->opcode); // long 
-  hca_data_buffer.length  = htonl(hca_data->length); // long
-
-  for(i=0; i < 256; i++) {
-    hca_data_buffer.dlid[i]    = htons(hca_data->dlid[i]);   // u_int16
-    hca_data_buffer.rqp_num[i] = htonl(hca_data->rqp_num[i]);// u_int32
-  }
-
-  hca_data_buffer.rkey    = htonl(hca_data->rkey);   // u_int32
-  hca_data_buffer.vaddr1  = htonl(hca_data->vaddr1); // first word u_int32
-  hca_data_buffer.vaddr2  = htonl(hca_data->vaddr2); // second word u_int32
-  hca_data_buffer.size    = htonl(hca_data->size);   // u_int32
-  bytes_send = send(clientfd, & hca_data_buffer, sizeof(Remote_HCA_Info), 0); 
-  
-  if(bytes_send == sizeof(Remote_HCA_Info)) {
-    printf("client_thread: send successfully \n");
-  }
-  else {
-    printf("client_thread: send failed \n");
-  }
-
-  printf("client_thread EXIT \n");
-
-  pthread_exit((void *) 0);
-}
-
-
-//
-//  main 
-//  + create a shared-memory between this main()/user address and
-//    a kernel thread/kernel address space associated with inbal 
-//    kernel module 
-//  + access local HCA's data through this shared memory 
-//
-//  + create a server_thread for receiving remote HCA's data
-//  + create a client_thread for sending out local HCA's data
-//  + after receiving remote HCA's data update this shared memory
-//
-int  main(int argc , char *argv[])
-{
-  int              segment_id;
-  struct shmid_ds  shmbuffer;
-  int              segment_size;
-  const int        shared_segment_size = sizeof(Remote_HCA_Info);
-  key_t            key = 999;
-  unsigned long    raddr;
-  Remote_HCA_Info  *shared_memory;
-  Remote_HCA_Info  exchange_hca_data;
-  Remote_HCA_Info  remote_hca_data;
-  int i; 
-
-  /* pthread */
-  pthread_t          sid;
-  pthread_t          cid;
-  pthread_attr_t     attr; 
-  int                rc, status;
-
-  char dest_ip[256];
-
-  if(argc != 2) {
-         printf("USAGE:   uagent   server_ip_address\n");
-         printf("argc = %d \n", argc);
-         exit(1);
-  }
-
-  strcpy(&exchange_hca_data.dest_ip[0], argv[1]);
-  printf("the destinational server IP address = %s\n", 
-                                      &exchange_hca_data.dest_ip); 
-
-  segment_id =  shmget(key, shared_segment_size, IPC_CREAT | 0666);
-
-  printf("sys_shmget is done segment_id = %d\n", segment_id);
-
-  shared_memory = (Remote_HCA_Info *) shmat(segment_id, 0, 0);
-
-  if(shared_memory == (char *) -1) {
-    printf("Shared memory attach failed shared_memory=%p\n",shared_memory);
-    exit(0);
-  }
-
-  printf("shared menory attached at address %p\n", shared_memory);
-
-  while (1) {
-    if(shared_memory->opcode ==  SEND_QP_INFO) {
-      printf("Local HCA data received from kernel thread\n");
-      break;
-    }
-    usleep(1000);
-    continue;
-  }
-
-  printf("Local HCA data received from kernel thread\n");
-
-  // save local HCA's data in exchange_hca_data
-  //
-  exchange_hca_data.opcode  = shared_memory->opcode;
-  exchange_hca_data.length  = shared_memory->length;
-
-  for(i=0; i < 256; i++) {
-    exchange_hca_data.dlid[i]    = shared_memory->dlid[i];
-    exchange_hca_data.rqp_num[i] = shared_memory->rqp_num[i];
-  }
-
-  exchange_hca_data.rkey    = shared_memory->rkey;
-  exchange_hca_data.vaddr1  = shared_memory->vaddr1;
-  exchange_hca_data.vaddr2  = shared_memory->vaddr2;
-  exchange_hca_data.size    = shared_memory->size;
-
-  /* Initialize and set thread detached attribute */
-  pthread_attr_init(&attr);
-  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-
-  /* create a server thread for procsssing incoming remote node socket data */
-  // 
-  pthread_create(&sid, 
-                 &attr, 
-                 server_thread,
-                 (Remote_HCA_Info *) &remote_hca_data);
-
-  printf("Main: created a server thread \n");
-
-  sleep(10);
-  
-  /* create a clint thread to send out local HCA data to remote node */
-  pthread_create(&cid, 
-                 &attr, 
-                 client_thread,
-                 (Remote_HCA_Info *) &exchange_hca_data);
-
-  printf("Main: created a client  thread \n");
-
-  /* synchronization between server_thread and client_thread */
-  pthread_attr_destroy(&attr);
-
-  rc = pthread_join(sid, (void **) &status);
-  if(rc) {
-    printf("Error: return code from pthread_join() is %d\n", rc);
-    exit(-1);
-  }
-
-  printf("completed join with thread %d status = %d\n", sid, status);
-
-  rc = pthread_join(cid, (void **) &status);
-  if(rc) {
-    printf("Error: return code from pthread_join() is %d\n", rc);
-    exit(-1);
-  }
-  printf("completed join with thread %d status = %d\n", cid, status);
-
-  // update shared memory with remote HCA's data 
-
-  shared_memory->opcode = RECV_QP_INFO;
-  shared_memory->length = remote_hca_data.length;
-  for(i=0; i < 256; i++) {
-    shared_memory->dlid[i]   = remote_hca_data.dlid[i];
-    shared_memory->rqp_num[i]= remote_hca_data.rqp_num[i];
-  }
-  shared_memory->rkey   = remote_hca_data.rkey;
-  shared_memory->vaddr1 = remote_hca_data.vaddr1;
-  shared_memory->vaddr2 = remote_hca_data.vaddr2;
-  shared_memory->size   = remote_hca_data.size;
-
-  sleep(5);
-
-  shared_memory->opcode = RECV_QP_INFO;
-  shared_memory->length = remote_hca_data.length;
-  for(i=0; i < 256; i++) {
-    shared_memory->dlid[i]   = remote_hca_data.dlid[i];
-    shared_memory->rqp_num[i]= remote_hca_data.rqp_num[i];
-  }
-  
-  shared_memory->rkey   = remote_hca_data.rkey;
-  shared_memory->vaddr1 = remote_hca_data.vaddr1;
-  shared_memory->vaddr2 = remote_hca_data.vaddr2;
-  shared_memory->size   = remote_hca_data.size;
-
-  sleep(10);
-  
-//  shmdt(shared_memory);
-   
-  printf("uagent is DONE \n");
-  
-
-  exit(0);
-
-}
-
similarity index 100%
rename from lustre/portals/knals/ibnal/.cvsignore
rename to lustre/portals/knals/iibnal/.cvsignore
index 48b17e9..5ed596b 100644 (file)
@@ -1,10 +1,10 @@
 .deps
 Makefile
+.*.cmd
 autoMakefile.in
 autoMakefile
 *.ko
 *.mod.c
 .*.flags
-.*.cmd
 .tmp_versions
 .depend
diff --git a/lustre/portals/knals/iibnal/Makefile.in b/lustre/portals/knals/iibnal/Makefile.in
new file mode 100644 (file)
index 0000000..e7934e2
--- /dev/null
@@ -0,0 +1,6 @@
+MODULES := kiibnal
+kiibnal-objs := iibnal.o iibnal_cb.o
+
+EXTRA_POST_CFLAGS := @IIBCPPFLAGS@
+
+@INCLUDE_RULES@
diff --git a/lustre/portals/knals/iibnal/Makefile.mk b/lustre/portals/knals/iibnal/Makefile.mk
new file mode 100644 (file)
index 0000000..0459a20
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include $(src)/../../Kernelenv
+
+obj-y += kiibnal.o
+kiibnal-objs := iibnal.o iibnal_cb.o
+
diff --git a/lustre/portals/knals/iibnal/autoMakefile.am b/lustre/portals/knals/iibnal/autoMakefile.am
new file mode 100644 (file)
index 0000000..251df66
--- /dev/null
@@ -0,0 +1,15 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+if MODULES
+if !CRAY_PORTALS
+if BUILD_IIBNAL
+modulenet_DATA = kiibnal$(KMODEXT)
+endif
+endif
+endif
+
+MOSTLYCLEANFILES = *.o *.ko *.mod.c
+DIST_SOURCES = $(kiibnal-objs:%.o=%.c) iibnal.h
diff --git a/lustre/portals/knals/iibnal/iibnal.c b/lustre/portals/knals/iibnal/iibnal.c
new file mode 100644 (file)
index 0000000..09908c9
--- /dev/null
@@ -0,0 +1,1713 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "iibnal.h"
+
+nal_t                   kibnal_api;
+ptl_handle_ni_t         kibnal_ni;
+kib_tunables_t          kibnal_tunables;
+
+kib_data_t              kibnal_data = {
+        .kib_service_id = IBNAL_SERVICE_NUMBER,
+};
+
+#ifdef CONFIG_SYSCTL
+#define IBNAL_SYSCTL             202
+
+#define IBNAL_SYSCTL_TIMEOUT     1
+
+static ctl_table kibnal_ctl_table[] = {
+        {IBNAL_SYSCTL_TIMEOUT, "timeout", 
+         &kibnal_tunables.kib_io_timeout, sizeof (int),
+         0644, NULL, &proc_dointvec},
+        { 0 }
+};
+
+static ctl_table kibnal_top_ctl_table[] = {
+        {IBNAL_SYSCTL, "iibnal", NULL, 0, 0555, kibnal_ctl_table},
+        { 0 }
+};
+#endif
+
+#ifdef unused
+void
+print_service(IB_SERVICE_RECORD *service, char *tag, int rc)
+{
+        char name[32];
+
+        if (service == NULL) 
+        {
+                CWARN("tag       : %s\n"
+                      "status    : %d (NULL)\n", tag, rc);
+                return;
+        }
+        strncpy (name, service->ServiceName, sizeof(name)-1);
+        name[sizeof(name)-1] = 0;
+        
+        CWARN("tag       : %s\n"
+              "status    : %d\n"
+              "service id: "LPX64"\n"
+              "name      : %s\n"
+              "NID       : "LPX64"\n", tag, rc,
+              service->RID.ServiceID, name,
+              *kibnal_service_nid_field(service));
+}
+#endif
+
+static void
+kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
+                              FSTATUS frc, uint32 madrc)
+{
+        *(FSTATUS *)arg = frc;
+        up (&kibnal_data.kib_nid_signal);
+}
+
+#if IBNAL_CHECK_ADVERT
+static void
+kibnal_service_query_done (void *arg, QUERY *qry, 
+                           QUERY_RESULT_VALUES *qry_result)
+{
+        FSTATUS frc = qry_result->Status;
+
+        if (frc != FSUCCESS &&
+            qry_result->ResultDataSize == 0)
+                frc = FERROR;
+        
+        *(FSTATUS *)arg = frc;
+        up (&kibnal_data.kib_nid_signal);
+}
+
+static void
+kibnal_check_advert (void)
+{
+        QUERY                  *qry;
+        IB_SERVICE_RECORD      *svc;
+        FSTATUS                 frc;
+        FSTATUS                 frc2;
+
+        PORTAL_ALLOC(qry, sizeof(*qry));
+        if (qry == NULL)
+                return;
+
+        memset (qry, 0, sizeof(*qry));
+        qry->InputType = InputTypeServiceRecord;
+        qry->OutputType = OutputTypeServiceRecord;
+        qry->InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
+        svc = &qry->InputValue.ServiceRecordValue.ServiceRecord;
+        kibnal_set_service_keys(svc, kibnal_data.kib_nid);
+
+        frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
+                                                    kibnal_data.kib_port_guid,
+                                                    qry,
+                                                    kibnal_service_query_done,
+                                                    NULL, &frc2);
+        if (frc != FSUCCESS && frc != FPENDING) {
+                CERROR ("Immediate error %d checking SM service\n", frc);
+        } else {
+                down (&kibnal_data.kib_nid_signal);
+                frc = frc2;
+
+                if (frc != 0)
+                        CERROR ("Error %d checking SM service\n", rc);
+        }
+
+        return (rc);
+}
+#endif
+
+static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
+{
+        IB_SERVICE_RECORD     *svc;
+
+        memset (fod, 0, sizeof(*fod));
+        fod->Type = type;
+
+        svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+        svc->RID.ServiceID = kibnal_data.kib_service_id;
+        svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid;
+        svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX;
+        svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey;
+        svc->ServiceLease = 0xffffffff;
+
+        kibnal_set_service_keys(svc, kibnal_data.kib_nid);
+}
+
+static int
+kibnal_advertise (void)
+{
+        FABRIC_OPERATION_DATA *fod;
+        IB_SERVICE_RECORD     *svc;
+        FSTATUS                frc;
+        FSTATUS                frc2;
+
+        LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+
+        PORTAL_ALLOC(fod, sizeof(*fod));
+        if (fod == NULL)
+                return (-ENOMEM);
+
+        fill_fod(fod, FabOpSetServiceRecord);
+        svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+
+        CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n", 
+               svc->RID.ServiceID, 
+               svc->ServiceName, *kibnal_service_nid_field(svc));
+
+        frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
+                                            kibnal_data.kib_port_guid,
+                                            fod, kibnal_service_setunset_done, 
+                                            NULL, &frc2);
+
+        if (frc != FSUCCESS && frc != FPENDING) {
+                CERROR ("Immediate error %d advertising NID "LPX64"\n",
+                        frc, kibnal_data.kib_nid);
+                goto out;
+        }
+
+        down (&kibnal_data.kib_nid_signal);
+
+        frc = frc2;
+        if (frc != FSUCCESS)
+                CERROR ("Error %d advertising BUD "LPX64"\n",
+                        frc, kibnal_data.kib_nid);
+out:
+        PORTAL_FREE(fod, sizeof(*fod));
+        return (frc == FSUCCESS) ? 0 : -EINVAL;
+}
+
+static void
+kibnal_unadvertise (int expect_success)
+{
+        FABRIC_OPERATION_DATA *fod;
+        IB_SERVICE_RECORD     *svc;
+        FSTATUS                frc;
+        FSTATUS                frc2;
+
+        LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+
+        PORTAL_ALLOC(fod, sizeof(*fod));
+        if (fod == NULL)
+                return;
+
+        fill_fod(fod, FabOpDeleteServiceRecord);
+        svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+
+        CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n",
+               svc->ServiceName, *kibnal_service_nid_field(svc));
+        
+        frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
+                                            kibnal_data.kib_port_guid,
+                                            fod, kibnal_service_setunset_done, 
+                                            NULL, &frc2);
+
+        if (frc != FSUCCESS && frc != FPENDING) {
+                CERROR ("Immediate error %d unadvertising NID "LPX64"\n",
+                        frc, kibnal_data.kib_nid);
+                goto out;
+        }
+
+        down (&kibnal_data.kib_nid_signal);
+
+        if ((frc2 == FSUCCESS) == !!expect_success)
+                goto out;
+
+        if (expect_success)
+                CERROR("Error %d unadvertising NID "LPX64"\n",
+                       frc2, kibnal_data.kib_nid);
+        else
+                CWARN("Removed conflicting NID "LPX64"\n",
+                      kibnal_data.kib_nid);
+ out:
+        PORTAL_FREE(fod, sizeof(*fod));
+}
+
+static int
+kibnal_set_mynid(ptl_nid_t nid)
+{
+        struct timeval tv;
+        lib_ni_t      *ni = &kibnal_lib.libnal_ni;
+        int            rc;
+        FSTATUS        frc;
+
+        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
+               nid, ni->ni_pid.nid);
+
+        do_gettimeofday(&tv);
+
+        down (&kibnal_data.kib_nid_mutex);
+
+        if (nid == kibnal_data.kib_nid) {
+                /* no change of NID */
+                up (&kibnal_data.kib_nid_mutex);
+                return (0);
+        }
+
+        CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
+               kibnal_data.kib_nid, nid);
+        
+        if (kibnal_data.kib_nid != PTL_NID_ANY) {
+
+                kibnal_unadvertise (1);
+
+                frc = iibt_cm_cancel(kibnal_data.kib_cep);
+                if (frc != FSUCCESS && frc != FPENDING)
+                        CERROR ("Error %d stopping listener\n", frc);
+
+                frc = iibt_cm_destroy_cep(kibnal_data.kib_cep);
+                if (frc != FSUCCESS)
+                        CERROR ("Error %d destroying CEP\n", frc);
+
+                kibnal_data.kib_cep = NULL;
+        }
+        
+        kibnal_data.kib_nid = ni->ni_pid.nid = nid;
+        kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+        
+        /* Delete all existing peers and their connections after new
+         * NID/incarnation set to ensure no old connections in our brave
+         * new world. */
+        kibnal_del_peer (PTL_NID_ANY, 0);
+
+        if (kibnal_data.kib_nid == PTL_NID_ANY) {
+                /* No new NID to install */
+                up (&kibnal_data.kib_nid_mutex);
+                return (0);
+        }
+
+        /* remove any previous advert (crashed node etc) */
+        kibnal_unadvertise(0);
+
+        kibnal_data.kib_cep = iibt_cm_create_cep(CM_RC_TYPE);
+        if (kibnal_data.kib_cep == NULL) {
+                CERROR ("Can't create CEP\n");
+                rc = -ENOMEM;
+        } else {
+                CM_LISTEN_INFO info;
+                memset (&info, 0, sizeof(info));
+                info.ListenAddr.EndPt.SID = kibnal_data.kib_service_id;
+
+                frc = iibt_cm_listen(kibnal_data.kib_cep, &info,
+                                     kibnal_listen_callback, NULL);
+                if (frc != FSUCCESS && frc != FPENDING) {
+                        CERROR ("iibt_cm_listen error: %d\n", frc);
+                        rc = -EINVAL;
+                } else {
+                        rc = 0;
+                }
+        }
+        
+        if (rc == 0) {
+                rc = kibnal_advertise();
+                if (rc == 0) {
+#if IBNAL_CHECK_ADVERT
+                        kibnal_check_advert();
+#endif
+                        up (&kibnal_data.kib_nid_mutex);
+                        return (0);
+                }
+                
+                iibt_cm_cancel (kibnal_data.kib_cep);
+                iibt_cm_destroy_cep (kibnal_data.kib_cep);
+                /* remove any peers that sprung up while I failed to
+                 * advertise myself */
+                kibnal_del_peer (PTL_NID_ANY, 0);
+        }
+
+        kibnal_data.kib_nid = PTL_NID_ANY;
+        up (&kibnal_data.kib_nid_mutex);
+        return (rc);
+}
+
+kib_peer_t *
+kibnal_create_peer (ptl_nid_t nid)
+{
+        kib_peer_t *peer;
+
+        LASSERT (nid != PTL_NID_ANY);
+
+        PORTAL_ALLOC (peer, sizeof (*peer));
+        if (peer == NULL)
+                return (NULL);
+
+        memset(peer, 0, sizeof(*peer));         /* zero flags etc */
+
+        peer->ibp_nid = nid;
+        atomic_set (&peer->ibp_refcount, 1);    /* 1 ref for caller */
+
+        INIT_LIST_HEAD (&peer->ibp_list);       /* not in the peer table yet */
+        INIT_LIST_HEAD (&peer->ibp_conns);
+        INIT_LIST_HEAD (&peer->ibp_tx_queue);
+
+        peer->ibp_reconnect_time = jiffies;
+        peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+
+        atomic_inc (&kibnal_data.kib_npeers);
+        return (peer);
+}
+
+void
+kibnal_destroy_peer (kib_peer_t *peer)
+{
+
+        LASSERT (atomic_read (&peer->ibp_refcount) == 0);
+        LASSERT (peer->ibp_persistence == 0);
+        LASSERT (!kibnal_peer_active(peer));
+        LASSERT (peer->ibp_connecting == 0);
+        LASSERT (list_empty (&peer->ibp_conns));
+        LASSERT (list_empty (&peer->ibp_tx_queue));
+
+        PORTAL_FREE (peer, sizeof (*peer));
+
+        /* NB a peer's connections keep a reference on their peer until
+         * they are destroyed, so we can be assured that _all_ state to do
+         * with this peer has been cleaned up when its refcount drops to
+         * zero. */
+        atomic_dec (&kibnal_data.kib_npeers);
+}
+
+/* the caller is responsible for accounting for the additional reference
+ * that this creates */
+kib_peer_t *
+kibnal_find_peer_locked (ptl_nid_t nid)
+{
+        struct list_head *peer_list = kibnal_nid2peerlist (nid);
+        struct list_head *tmp;
+        kib_peer_t       *peer;
+
+        list_for_each (tmp, peer_list) {
+
+                peer = list_entry (tmp, kib_peer_t, ibp_list);
+
+                LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
+                         peer->ibp_connecting != 0 || /* creating conns */
+                         !list_empty (&peer->ibp_conns));  /* active conn */
+
+                if (peer->ibp_nid != nid)
+                        continue;
+
+                CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
+                       peer, nid, atomic_read (&peer->ibp_refcount));
+                return (peer);
+        }
+        return (NULL);
+}
+
+kib_peer_t *
+kibnal_get_peer (ptl_nid_t nid)
+{
+        kib_peer_t     *peer;
+
+        read_lock (&kibnal_data.kib_global_lock);
+        peer = kibnal_find_peer_locked (nid);
+        if (peer != NULL)                       /* +1 ref for caller? */
+                kib_peer_addref(peer);
+        read_unlock (&kibnal_data.kib_global_lock);
+
+        return (peer);
+}
+
+void
+kibnal_unlink_peer_locked (kib_peer_t *peer)
+{
+        LASSERT (peer->ibp_persistence == 0);
+        LASSERT (list_empty(&peer->ibp_conns));
+
+        LASSERT (kibnal_peer_active(peer));
+        list_del_init (&peer->ibp_list);
+        /* lose peerlist's ref */
+        kib_peer_decref(peer);
+}
+
+static int
+kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
+{
+        kib_peer_t        *peer;
+        struct list_head  *ptmp;
+        int                i;
+
+        read_lock (&kibnal_data.kib_global_lock);
+
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+
+                list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
+
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
+                        LASSERT (peer->ibp_persistence != 0 ||
+                                 peer->ibp_connecting != 0 ||
+                                 !list_empty (&peer->ibp_conns));
+
+                        if (index-- > 0)
+                                continue;
+
+                        *nidp = peer->ibp_nid;
+                        *persistencep = peer->ibp_persistence;
+
+                        read_unlock (&kibnal_data.kib_global_lock);
+                        return (0);
+                }
+        }
+
+        read_unlock (&kibnal_data.kib_global_lock);
+        return (-ENOENT);
+}
+
+static int
+kibnal_add_persistent_peer (ptl_nid_t nid)
+{
+        unsigned long      flags;
+        kib_peer_t        *peer;
+        kib_peer_t        *peer2;
+        
+        if (nid == PTL_NID_ANY)
+                return (-EINVAL);
+
+        peer = kibnal_create_peer (nid);
+        if (peer == NULL)
+                return (-ENOMEM);
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        peer2 = kibnal_find_peer_locked (nid);
+        if (peer2 != NULL) {
+                kib_peer_decref (peer);
+                peer = peer2;
+        } else {
+                /* peer table takes existing ref on peer */
+                list_add_tail (&peer->ibp_list,
+                               kibnal_nid2peerlist (nid));
+        }
+
+        peer->ibp_persistence++;
+        
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+        return (0);
+}
+
+static void
+kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
+{
+        struct list_head *ctmp;
+        struct list_head *cnxt;
+        kib_conn_t       *conn;
+
+        if (!single_share)
+                peer->ibp_persistence = 0;
+        else if (peer->ibp_persistence > 0)
+                peer->ibp_persistence--;
+
+        if (peer->ibp_persistence != 0)
+                return;
+
+        list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+                conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+                kibnal_close_conn_locked (conn, 0);
+        }
+
+        /* NB peer unlinks itself when last conn is closed */
+}
+
+/* Delete peer(s) matching 'nid' (every peer when nid == PTL_NID_ANY) from
+ * the peer hash table, closing their connections via
+ * kibnal_del_peer_locked().  'single_share' drops one persistence share and
+ * stops after the first match instead of removing peers outright.
+ * Returns 0 if anything matched, -ENOENT otherwise. */
+int
+kibnal_del_peer (ptl_nid_t nid, int single_share)
+{
+        unsigned long      flags;
+        struct list_head  *ptmp;
+        struct list_head  *pnxt;
+        kib_peer_t        *peer;
+        int                lo;
+        int                hi;
+        int                i;
+        int                rc = -ENOENT;
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        /* a specific nid only needs its own hash chain scanned */
+        if (nid != PTL_NID_ANY)
+                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
+        else {
+                lo = 0;
+                hi = kibnal_data.kib_peer_hash_size - 1;
+        }
+
+        for (i = lo; i <= hi; i++) {
+                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
+                        /* a peer in the table must have some reason to exist */
+                        LASSERT (peer->ibp_persistence != 0 ||
+                                 peer->ibp_connecting != 0 ||
+                                 !list_empty (&peer->ibp_conns));
+
+                        if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
+                                continue;
+
+                        kibnal_del_peer_locked (peer, single_share);
+                        rc = 0;         /* matched something */
+
+                        /* only drop one share, then stop */
+                        if (single_share)
+                                goto out;
+                }
+        }
+ out:
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+        return (rc);
+}
+
+/* Walk the whole peer table and return the 'index'th connection, holding an
+ * extra ref for the caller (released with kibnal_put_conn()).  Returns NULL
+ * when there are fewer than index+1 connections. */
+static kib_conn_t *
+kibnal_get_conn_by_idx (int index)
+{
+        kib_peer_t        *peer;
+        struct list_head  *ptmp;
+        kib_conn_t        *conn;
+        struct list_head  *ctmp;
+        int                i;
+
+        /* NOTE(review): other table walks in this file take kib_global_lock
+         * with irqsave; confirm a plain read_lock is safe here */
+        read_lock (&kibnal_data.kib_global_lock);
+
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+                list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
+
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
+                        /* a peer in the table must have some reason to exist */
+                        LASSERT (peer->ibp_persistence > 0 ||
+                                 peer->ibp_connecting != 0 ||
+                                 !list_empty (&peer->ibp_conns));
+
+                        list_for_each (ctmp, &peer->ibp_conns) {
+                                /* skip conns until we reach 'index' */
+                                if (index-- > 0)
+                                        continue;
+
+                                conn = list_entry (ctmp, kib_conn_t, ibc_list);
+                                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                                       atomic_read (&conn->ibc_refcount));
+                                /* +1 ref for the caller */
+                                atomic_inc (&conn->ibc_refcount);
+                                read_unlock (&kibnal_data.kib_global_lock);
+                                return (conn);
+                        }
+                }
+        }
+
+        read_unlock (&kibnal_data.kib_global_lock);
+        return (NULL);
+}
+
+/* Allocate and initialise a new connection: rx message buffers laid out over
+ * premapped pages, and a queue pair created then moved to the INIT state.
+ * Returns the conn with 1 ref for the caller, or NULL on failure (partial
+ * state is torn down via kibnal_destroy_conn()). */
+kib_conn_t *
+kibnal_create_conn (void)
+{
+        kib_conn_t  *conn;
+        int          i;
+        __u64        vaddr = 0;
+        __u64        vaddr_base;
+        int          page_offset;
+        int          ipage;
+        int          rc;
+        FSTATUS      frc;
+        union {
+                IB_QP_ATTRIBUTES_CREATE    qp_create;
+                IB_QP_ATTRIBUTES_MODIFY    qp_attr;
+        } params;
+
+        PORTAL_ALLOC (conn, sizeof (*conn));
+        if (conn == NULL) {
+                CERROR ("Can't allocate connection\n");
+                return (NULL);
+        }
+
+        /* zero flags, NULL pointers etc... */
+        memset (conn, 0, sizeof (*conn));
+
+        INIT_LIST_HEAD (&conn->ibc_tx_queue);
+        INIT_LIST_HEAD (&conn->ibc_active_txs);
+        spin_lock_init (&conn->ibc_lock);
+
+        atomic_inc (&kibnal_data.kib_nconns);
+        /* well not really, but I call destroy() on failure, which decrements */
+
+        PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
+        if (conn->ibc_rxs == NULL)
+                goto failed;
+        memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
+
+        rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1);
+        if (rc != 0)
+                goto failed;
+
+        vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
+
+        /* carve the premapped pages into IBNAL_RX_MSGS rx descriptors */
+        for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
+                struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
+                kib_rx_t   *rx = &conn->ibc_rxs[i];
+
+                rx->rx_conn = conn;
+                rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) +
+                             page_offset);
+
+                if (kibnal_whole_mem())
+                        rx->rx_vaddr = kibnal_page2phys(page) +
+                                       page_offset +
+                                       kibnal_data.kib_md.md_addr;
+                else
+                        rx->rx_vaddr = vaddr;
+
+                vaddr += IBNAL_MSG_SIZE;
+                LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
+
+                page_offset += IBNAL_MSG_SIZE;
+                LASSERT (page_offset <= PAGE_SIZE);
+
+                if (page_offset == PAGE_SIZE) {
+                        page_offset = 0;
+                        ipage++;
+                        LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
+                }
+        }
+
+        params.qp_create = (IB_QP_ATTRIBUTES_CREATE) {
+                .Type                    = QPTypeReliableConnected,
+                .SendQDepth              = IBNAL_TX_MAX_SG * 
+                                           IBNAL_MSG_QUEUE_SIZE,
+                .RecvQDepth              = IBNAL_MSG_QUEUE_SIZE,
+                .SendDSListDepth         = 1,
+                .RecvDSListDepth         = 1,
+                .SendCQHandle            = kibnal_data.kib_cq,
+                .RecvCQHandle            = kibnal_data.kib_cq,
+                .PDHandle                = kibnal_data.kib_pd,
+                .SendSignaledCompletions = TRUE,
+        };
+        frc = iibt_qp_create(kibnal_data.kib_hca, &params.qp_create, NULL,
+                             &conn->ibc_qp, &conn->ibc_qp_attrs);
+        /* BUG FIX: this used to test 'rc', which still held the successful
+         * return of kibnal_alloc_pages() above, so QP creation failures were
+         * silently ignored */
+        if (frc != FSUCCESS) {
+                CERROR ("Failed to create queue pair: %d\n", frc);
+                goto failed;
+        }
+
+        /* Mark QP created */
+        conn->ibc_state = IBNAL_CONN_INIT_QP;
+
+        params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+                .RequestState             = QPStateInit,
+                .Attrs                    = (IB_QP_ATTR_PORTGUID |
+                                             IB_QP_ATTR_PKEYINDEX |
+                                             IB_QP_ATTR_ACCESSCONTROL),
+                .PortGUID                 = kibnal_data.kib_port_guid,
+                .PkeyIndex                = 0,
+                .AccessControl = {
+                        .s = {
+                                .RdmaWrite = 1,
+                                .RdmaRead  = 1,
+                        },
+                },
+        };
+        rc = iibt_qp_modify(conn->ibc_qp, &params.qp_attr, NULL);
+        if (rc != 0) {
+                CERROR ("Failed to modify queue pair: %d\n", rc);
+                goto failed;
+        }
+
+        /* 1 ref for caller */
+        atomic_set (&conn->ibc_refcount, 1);
+        return (conn);
+
+ failed:
+        kibnal_destroy_conn (conn);
+        return (NULL);
+}
+
+/* Final teardown of a conn whose last ref is gone: destroy the QP and CEP,
+ * free rx buffers, drop the peer ref, and wake shutdown waiters if this was
+ * the last conn during shutdown. */
+void
+kibnal_destroy_conn (kib_conn_t *conn)
+{
+        int    rc;
+        FSTATUS frc;
+        
+        CDEBUG (D_NET, "connection %p\n", conn);
+
+        /* nobody may still be using or queueing on this conn */
+        LASSERT (atomic_read (&conn->ibc_refcount) == 0);
+        LASSERT (list_empty(&conn->ibc_tx_queue));
+        LASSERT (list_empty(&conn->ibc_active_txs));
+        LASSERT (conn->ibc_nsends_posted == 0);
+        LASSERT (conn->ibc_connreq == NULL);
+
+        /* undo whatever stage of setup the conn reached */
+        switch (conn->ibc_state) {
+        case IBNAL_CONN_DISCONNECTED:
+                /* called after connection sequence initiated */
+                /* fall through */
+
+        case IBNAL_CONN_INIT_QP:
+                /* _destroy includes an implicit Reset of the QP which 
+                 * discards posted work */
+                rc = iibt_qp_destroy(conn->ibc_qp);
+                if (rc != 0)
+                        CERROR("Can't destroy QP: %d\n", rc);
+                /* fall through */
+                
+        case IBNAL_CONN_INIT_NOTHING:
+                break;
+
+        default:
+                LASSERT (0);
+        }
+
+        if (conn->ibc_cep != NULL) {
+                frc = iibt_cm_destroy_cep(conn->ibc_cep);
+                if (frc != 0)
+                        CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep, 
+                               frc);
+        }
+
+        if (conn->ibc_rx_pages != NULL) 
+                kibnal_free_pages(conn->ibc_rx_pages);
+        
+        if (conn->ibc_rxs != NULL)
+                PORTAL_FREE(conn->ibc_rxs, 
+                            IBNAL_RX_MSGS * sizeof(kib_rx_t));
+
+        if (conn->ibc_peer != NULL)
+                kib_peer_decref(conn->ibc_peer);
+
+        PORTAL_FREE(conn, sizeof (*conn));
+
+        /* balances the atomic_inc in kibnal_create_conn() */
+        atomic_dec(&kibnal_data.kib_nconns);
+        
+        if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
+            kibnal_data.kib_shutdown) {
+                /* I just nuked the last connection on shutdown; wake up
+                 * everyone so they can exit. */
+                wake_up_all(&kibnal_data.kib_sched_waitq);
+                wake_up_all(&kibnal_data.kib_connd_waitq);
+        }
+}
+
+/* Release one ref on 'conn'.  When the last ref goes, the conn (which must
+ * already be disconnected) is queued for the connd thread to destroy. */
+void
+kibnal_put_conn (kib_conn_t *conn)
+{
+        unsigned long flags;
+
+        CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n",
+                conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                atomic_read (&conn->ibc_refcount));
+
+        LASSERT (atomic_read (&conn->ibc_refcount) > 0);
+
+        if (atomic_dec_and_test (&conn->ibc_refcount)) {
+                /* must disconnect before dropping the final ref */
+                LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED);
+
+                /* hand the conn to connd for final destruction */
+                spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+                list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+                wake_up (&kibnal_data.kib_connd_waitq);
+                spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+        }
+}
+
+/* Close every connection 'peer' currently has open, for reason 'why'.
+ * Caller holds kib_global_lock for writing.  Returns the number closed. */
+static int
+kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
+{
+        struct list_head *pos;
+        struct list_head *next;
+        int               nclosed = 0;
+
+        list_for_each_safe (pos, next, &peer->ibp_conns) {
+                kibnal_close_conn_locked (list_entry (pos, kib_conn_t,
+                                                      ibc_list), why);
+                nclosed++;
+        }
+
+        return (nclosed);
+}
+
+/* Close every conn of 'peer' that does not belong to 'incarnation' (i.e.
+ * conns left over from a previous boot of the peer).  Caller holds
+ * kib_global_lock for writing.  Returns the number closed. */
+int
+kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
+{
+        struct list_head *pos;
+        struct list_head *next;
+        kib_conn_t       *c;
+        int               nstale = 0;
+
+        list_for_each_safe (pos, next, &peer->ibp_conns) {
+                c = list_entry (pos, kib_conn_t, ibc_list);
+
+                /* conns from the current incarnation stay up */
+                if (c->ibc_incarnation == incarnation)
+                        continue;
+
+                CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
+                       peer->ibp_nid, c->ibc_incarnation, incarnation);
+
+                nstale++;
+                kibnal_close_conn_locked (c, -ESTALE);
+        }
+
+        return (nstale);
+}
+
+/* Close all conns to 'nid' (or every conn when nid == PTL_NID_ANY).
+ * Returns 0 on success; -ENOENT when a specific nid matched no conns
+ * (wildcards always succeed). */
+static int
+kibnal_close_matching_conns (ptl_nid_t nid)
+{
+        unsigned long       flags;
+        kib_peer_t         *peer;
+        struct list_head   *ptmp;
+        struct list_head   *pnxt;
+        int                 lo;
+        int                 hi;
+        int                 i;
+        int                 count = 0;
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        /* a specific nid only needs its own hash chain scanned */
+        if (nid != PTL_NID_ANY)
+                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
+        else {
+                lo = 0;
+                hi = kibnal_data.kib_peer_hash_size - 1;
+        }
+
+        for (i = lo; i <= hi; i++) {
+                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
+
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
+                        /* a peer in the table must have some reason to exist */
+                        LASSERT (peer->ibp_persistence != 0 ||
+                                 peer->ibp_connecting != 0 ||
+                                 !list_empty (&peer->ibp_conns));
+
+                        if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
+                                continue;
+
+                        count += kibnal_close_peer_conns_locked (peer, 0);
+                }
+        }
+
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+        /* wildcards always succeed */
+        if (nid == PTL_NID_ANY)
+                return (0);
+        
+        return (count == 0 ? -ENOENT : 0);
+}
+
+/* libcfs NAL command dispatcher: peer/conn inspection and control requests
+ * arriving through portals_cfg.  Commands with no case here fall through
+ * and return the initial -EINVAL. */
+static int
+kibnal_cmd(struct portals_cfg *pcfg, void * private)
+{
+        int rc = -EINVAL;
+        ENTRY;
+
+        LASSERT (pcfg != NULL);
+
+        switch(pcfg->pcfg_command) {
+        case NAL_CMD_GET_PEER: {
+                ptl_nid_t   nid = 0;
+                int         share_count = 0;
+
+                /* look up the pcfg_count'th peer; report nid + share count */
+                rc = kibnal_get_peer_info(pcfg->pcfg_count,
+                                          &nid, &share_count);
+                pcfg->pcfg_nid   = nid;
+                pcfg->pcfg_size  = 0;
+                pcfg->pcfg_id    = 0;
+                pcfg->pcfg_misc  = 0;
+                pcfg->pcfg_count = 0;
+                pcfg->pcfg_wait  = share_count;
+                break;
+        }
+        case NAL_CMD_ADD_PEER: {
+                rc = kibnal_add_persistent_peer (pcfg->pcfg_nid);
+                break;
+        }
+        case NAL_CMD_DEL_PEER: {
+                rc = kibnal_del_peer (pcfg->pcfg_nid, 
+                                       /* flags == single_share */
+                                       pcfg->pcfg_flags != 0);
+                break;
+        }
+        case NAL_CMD_GET_CONN: {
+                /* takes a ref on the conn; dropped below after copying out */
+                kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
+
+                if (conn == NULL)
+                        rc = -ENOENT;
+                else {
+                        rc = 0;
+                        pcfg->pcfg_nid   = conn->ibc_peer->ibp_nid;
+                        pcfg->pcfg_id    = 0;
+                        pcfg->pcfg_misc  = 0;
+                        pcfg->pcfg_flags = 0;
+                        kibnal_put_conn (conn);
+                }
+                break;
+        }
+        case NAL_CMD_CLOSE_CONNECTION: {
+                rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
+                break;
+        }
+        case NAL_CMD_REGISTER_MYNID: {
+                /* refuse the wildcard nid as an identity */
+                if (pcfg->pcfg_nid == PTL_NID_ANY)
+                        rc = -EINVAL;
+                else
+                        rc = kibnal_set_mynid (pcfg->pcfg_nid);
+                break;
+        }
+        }
+
+        RETURN(rc);
+}
+
+/* Release a page buffer created by kibnal_alloc_pages(): deregister it from
+ * the HCA if it was mapped, free each page, then free the descriptor. */
+void
+kibnal_free_pages (kib_pages_t *p)
+{
+        int   npages = p->ibp_npages;
+        int   rc;
+        int   i;
+
+        /* undo the HCA registration done at allocation time */
+        if (p->ibp_mapped) {
+                rc = iibt_deregister_memory(p->ibp_handle);
+                if (rc != 0)
+                        CERROR ("Deregister error: %d\n", rc);
+        }
+
+        /* entries may be NULL if allocation failed part-way through */
+        for (i = 0; i < npages; i++) {
+                if (p->ibp_pages[i] == NULL)
+                        continue;
+                __free_page(p->ibp_pages[i]);
+        }
+
+        PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
+}
+
+/* Allocate 'npages' kernel pages into a new kib_pages_t returned in *pp.
+ * Unless the NAL maps all of memory (kibnal_whole_mem()), the pages are
+ * also registered with the HCA so ibp_vaddr/ibp_lkey/ibp_rkey are valid
+ * and ibp_mapped is set.  Returns 0 or -ENOMEM; on failure everything
+ * allocated so far is released. */
+int
+kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
+{
+        kib_pages_t                *p;
+        __u64                      *phys_pages;
+        int                         i;
+        FSTATUS                     frc;
+        IB_ACCESS_CONTROL           access;
+
+        /* NOTE(review): 'allow_write' is accepted but unused here; the
+         * access flags below are fixed -- confirm intentional */
+        memset(&access, 0, sizeof(access));
+        access.s.MWBindable = 1;
+        access.s.LocalWrite = 1;
+        access.s.RdmaRead = 1;
+        access.s.RdmaWrite = 1;
+
+        PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
+        if (p == NULL) {
+                CERROR ("Can't allocate buffer %d\n", npages);
+                return (-ENOMEM);
+        }
+
+        memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
+        p->ibp_npages = npages;
+        
+        for (i = 0; i < npages; i++) {
+                p->ibp_pages[i] = alloc_page (GFP_KERNEL);
+                if (p->ibp_pages[i] == NULL) {
+                        CERROR ("Can't allocate page %d of %d\n", i, npages);
+                        kibnal_free_pages(p);
+                        return (-ENOMEM);
+                }
+        }
+
+        /* whole-mem configs use the global mapping; no per-buffer
+         * registration needed */
+        if (kibnal_whole_mem())
+                goto out;
+
+        PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
+        if (phys_pages == NULL) {
+                CERROR ("Can't allocate physarray for %d pages\n", npages);
+                /* kibnal_free_pages() releases the pages allocated above */
+                kibnal_free_pages(p);
+                return (-ENOMEM);
+        }
+
+        /* if we were using the _contig_ registration variant we would have
+         * an array of PhysAddr/Length pairs, but the discontiguous variant
+         * just takes the PhysAddr */
+        for (i = 0; i < npages; i++)
+                phys_pages[i] = kibnal_page2phys(p->ibp_pages[i]);
+
+        frc = iibt_register_physical_memory(kibnal_data.kib_hca,
+                                            0,          /* requested vaddr */
+                                            phys_pages, npages,
+                                            0,          /* offset */
+                                            kibnal_data.kib_pd,
+                                            access,
+                                            &p->ibp_handle, &p->ibp_vaddr,
+                                            &p->ibp_lkey, &p->ibp_rkey);
+        
+        /* the physarray is only needed for the registration call */
+        PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
+        
+        if (frc != FSUCCESS) {
+                CERROR ("Error %d mapping %d pages\n", frc, npages);
+                kibnal_free_pages(p);
+                return (-ENOMEM);
+        }
+
+        CDEBUG(D_NET, "registered %d pages; handle: %p vaddr "LPX64" "
+                      "lkey %x rkey %x\n", npages, p->ibp_handle,
+                      p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
+        
+        /* remember to deregister in kibnal_free_pages() */
+        p->ibp_mapped = 1;
+out:
+        *pp = p;
+        return (0);
+}
+
+/* Allocate the tx message pages and carve them into IBNAL_TX_MSGS tx
+ * descriptors parked on the idle lists; descriptors at index >= IBNAL_NTX
+ * go on the non-blocking list.  Returns 0 or the kibnal_alloc_pages()
+ * error. */
+static int
+kibnal_setup_tx_descs (void)
+{
+        int           ipage = 0;
+        int           page_offset = 0;
+        __u64         vaddr;
+        __u64         vaddr_base;
+        struct page  *page;
+        kib_tx_t     *tx;
+        int           i;
+        int           rc;
+
+        /* pre-mapped messages are not bigger than 1 page */
+        LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
+
+        /* No fancy arithmetic when we do the buffer calculations */
+        LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
+
+        rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, 
+                                0);
+        if (rc != 0)
+                return (rc);
+
+        /* ignored for the whole_mem case */
+        vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
+
+        for (i = 0; i < IBNAL_TX_MSGS; i++) {
+                page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
+                tx = &kibnal_data.kib_tx_descs[i];
+
+                memset (tx, 0, sizeof(*tx));    /* zero flags etc */
+                
+                /* point the descriptor at its slice of the current page */
+                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
+                                            page_offset);
+
+                if (kibnal_whole_mem()) 
+                        tx->tx_vaddr = kibnal_page2phys(page) + 
+                                       page_offset + 
+                                       kibnal_data.kib_md.md_addr;
+                else
+                        tx->tx_vaddr = vaddr;
+
+                /* descriptors past IBNAL_NTX are reserved for non-blocking use */
+                tx->tx_isnblk = (i >= IBNAL_NTX);
+                tx->tx_mapped = KIB_TX_UNMAPPED;
+
+                CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", 
+                       i, tx, tx->tx_msg, tx->tx_vaddr);
+
+                if (tx->tx_isnblk)
+                        list_add (&tx->tx_list, 
+                                  &kibnal_data.kib_idle_nblk_txs);
+                else
+                        list_add (&tx->tx_list, 
+                                  &kibnal_data.kib_idle_txs);
+
+                vaddr += IBNAL_MSG_SIZE;
+                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
+
+                /* advance to the next page when this one is used up */
+                page_offset += IBNAL_MSG_SIZE;
+                LASSERT (page_offset <= PAGE_SIZE);
+
+                if (page_offset == PAGE_SIZE) {
+                        page_offset = 0;
+                        ipage++;
+                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
+                }
+        }
+        
+        return (0);
+}
+
+/* Tear down the NAL.  kib_init records how far startup got; the switch
+ * falls through from the current state down to IBNAL_INIT_NOTHING, undoing
+ * each stage in reverse order of setup. */
+static void
+kibnal_api_shutdown (nal_t *nal)
+{
+        int   i;
+        int   rc;
+
+        /* not the last ref: just drop the module use count */
+        if (nal->nal_refct != 0) {
+                /* This module got the first ref */
+                PORTAL_MODULE_UNUSE;
+                return;
+        }
+
+        CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+
+        LASSERT(nal == &kibnal_api);
+
+        switch (kibnal_data.kib_init) {
+        default:
+                CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
+                LBUG();
+
+        case IBNAL_INIT_ALL:
+                /* stop calls to nal_cmd */
+                libcfs_nal_cmd_unregister(IIBNAL);
+                /* No new peers */
+
+                /* resetting my NID to unadvertises me, removes my
+                 * listener and nukes all current peers */
+                kibnal_set_mynid (PTL_NID_ANY);
+
+                /* Wait for all peer state to clean up (crazy) */
+                i = 2;
+                while (atomic_read (&kibnal_data.kib_npeers) != 0) {
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                               "waiting for %d peers to disconnect (can take a few seconds)\n",
+                               atomic_read (&kibnal_data.kib_npeers));
+                        /* NOTE(review): this wait is TASK_UNINTERRUPTIBLE but
+                         * the thread wait below is TASK_INTERRUPTIBLE --
+                         * confirm the difference is intentional */
+                        set_current_state (TASK_UNINTERRUPTIBLE);
+                        schedule_timeout (HZ);
+                }
+                /* fall through */
+
+        case IBNAL_INIT_CQ:
+                rc = iibt_cq_destroy(kibnal_data.kib_cq);
+                if (rc != 0)
+                        CERROR ("Destroy CQ error: %d\n", rc);
+                /* fall through */
+
+        case IBNAL_INIT_TXD:
+                kibnal_free_pages (kibnal_data.kib_tx_pages);
+                /* fall through */
+
+        case IBNAL_INIT_MR:
+                if (kibnal_data.kib_md.md_handle != NULL) {
+                        rc = iibt_deregister_memory(kibnal_data.kib_md.md_handle);
+                        if (rc != FSUCCESS)
+                                CERROR ("Deregister memory: %d\n", rc);
+                }
+                /* fall through */
+
+#if IBNAL_FMR
+        case IBNAL_INIT_FMR:
+                rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
+                if (rc != 0)
+                        CERROR ("Destroy FMR pool error: %d\n", rc);
+                /* fall through */
+#endif
+        case IBNAL_INIT_PD:
+                rc = iibt_pd_free(kibnal_data.kib_pd);
+                if (rc != 0)
+                        CERROR ("Destroy PD error: %d\n", rc);
+                /* fall through */
+
+        case IBNAL_INIT_SD:
+                rc = iibt_sd_deregister(kibnal_data.kib_sd);
+                if (rc != 0)
+                        CERROR ("Deregister SD error: %d\n", rc);
+                /* fall through */
+
+        case IBNAL_INIT_PORT:
+                /* XXX ??? */
+                /* fall through */
+
+        case IBNAL_INIT_PORTATTRS:
+                PORTAL_FREE(kibnal_data.kib_hca_attrs.PortAttributesList,
+                            kibnal_data.kib_hca_attrs.PortAttributesListSize);
+                /* fall through */
+
+        case IBNAL_INIT_HCA:
+                rc = iibt_close_hca(kibnal_data.kib_hca);
+                if (rc != 0)
+                        CERROR ("Close HCA  error: %d\n", rc);
+                /* fall through */
+
+        case IBNAL_INIT_LIB:
+                lib_fini(&kibnal_lib);
+                /* fall through */
+
+        case IBNAL_INIT_DATA:
+                /* Module refcount only gets to zero when all peers
+                 * have been closed so all lists must be empty */
+                LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
+                LASSERT (kibnal_data.kib_peers != NULL);
+                for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+                        LASSERT (list_empty (&kibnal_data.kib_peers[i]));
+                }
+                LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
+                LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
+                LASSERT (list_empty (&kibnal_data.kib_sched_txq));
+                LASSERT (list_empty (&kibnal_data.kib_connd_conns));
+                LASSERT (list_empty (&kibnal_data.kib_connd_peers));
+
+                /* flag threads to terminate; wake and wait for them to die */
+                kibnal_data.kib_shutdown = 1;
+                wake_up_all (&kibnal_data.kib_sched_waitq);
+                wake_up_all (&kibnal_data.kib_connd_waitq);
+
+                i = 2;
+                while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                               "Waiting for %d threads to terminate\n",
+                               atomic_read (&kibnal_data.kib_nthreads));
+                        set_current_state (TASK_INTERRUPTIBLE);
+                        schedule_timeout (HZ);
+                }
+                /* fall through */
+                
+        case IBNAL_INIT_NOTHING:
+                break;
+        }
+
+        /* these are freed unconditionally; they may exist even if startup
+         * failed before kib_init advanced */
+        if (kibnal_data.kib_tx_descs != NULL)
+                PORTAL_FREE (kibnal_data.kib_tx_descs,
+                             IBNAL_TX_MSGS * sizeof(kib_tx_t));
+
+        if (kibnal_data.kib_peers != NULL)
+                PORTAL_FREE (kibnal_data.kib_peers,
+                             sizeof (struct list_head) * 
+                             kibnal_data.kib_peer_hash_size);
+
+        CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+        printk(KERN_INFO "Lustre: Infinicon IB NAL unloaded (final mem %d)\n",
+               atomic_read(&portal_kmemory));
+
+        kibnal_data.kib_init = IBNAL_INIT_NOTHING;
+}
+
+/* Round 'val' up to the next multiple of 'power' (which must be a power of
+ * two).  Arguments are fully parenthesised so that expression arguments
+ * (e.g. 'a | b') expand correctly. */
+#define roundup_power(val, power) \
+        ( ((__u64)(val) + (__u64)((power) - 1)) & ~((__u64)((power) - 1)) )
+
+/* Upper bound on physical memory, in bytes, rounded up to a 128MB boundary;
+ * returns 0 for first-generation cards we don't support registering all of
+ * memory on.  This isn't very portable or sturdy in the face of funny
+ * mem/bus configs. */
+static __u64 max_phys_mem(IB_CA_ATTRIBUTES *ca_attr)
+{
+        struct sysinfo si;
+        __u64 ret;
+
+        /* XXX we don't bother with first-gen cards */
+        if (ca_attr->VendorId == 0xd0b7 && ca_attr->DeviceId == 0x3101)
+                return 0ULL;
+
+        /* take the larger of total RAM and max_mapnr, scaled to bytes */
+        si_meminfo(&si);
+        ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit;
+        return roundup_power(ret, 128 * 1024 * 1024);
+}
+#undef roundup_power
+
+static int
+kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
+                     ptl_ni_limits_t *requested_limits,
+                     ptl_ni_limits_t *actual_limits)
+{
+        ptl_process_id_t    process_id;
+        int                 pkmem = atomic_read(&portal_kmemory);
+        IB_PORT_ATTRIBUTES *pattr;
+        FSTATUS             frc;
+        int                 rc;
+        int                 n;
+        int                 i;
+
+        LASSERT (nal == &kibnal_api);
+
+        if (nal->nal_refct != 0) {
+                if (actual_limits != NULL)
+                        *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
+                /* This module got the first ref */
+                PORTAL_MODULE_USE;
+                return (PTL_OK);
+        }
+
+        LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
+
+        frc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2, 
+                                       &kibnal_data.kib_interfaces);
+        if (frc != FSUCCESS) {
+                CERROR("IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2) = %d\n",
+                        frc);
+                return -ENOSYS;
+        }
+
+        init_MUTEX (&kibnal_data.kib_nid_mutex);
+        init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal);
+        kibnal_data.kib_nid = PTL_NID_ANY;
+
+        rwlock_init(&kibnal_data.kib_global_lock);
+
+        kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
+        PORTAL_ALLOC (kibnal_data.kib_peers,
+                      sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
+        if (kibnal_data.kib_peers == NULL) {
+                goto failed;
+        }
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
+                INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
+
+        spin_lock_init (&kibnal_data.kib_connd_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
+        INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
+        init_waitqueue_head (&kibnal_data.kib_connd_waitq);
+
+        spin_lock_init (&kibnal_data.kib_sched_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
+        INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
+        init_waitqueue_head (&kibnal_data.kib_sched_waitq);
+
+        spin_lock_init (&kibnal_data.kib_tx_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
+        INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
+        init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
+
+        PORTAL_ALLOC (kibnal_data.kib_tx_descs,
+                      IBNAL_TX_MSGS * sizeof(kib_tx_t));
+        if (kibnal_data.kib_tx_descs == NULL) {
+                CERROR ("Can't allocate tx descs\n");
+                goto failed;
+        }
+
+        /* lists/ptrs/locks initialised */
+        kibnal_data.kib_init = IBNAL_INIT_DATA;
+        /*****************************************************/
+
+        process_id.pid = 0;
+        process_id.nid = kibnal_data.kib_nid;
+        
+        rc = lib_init(&kibnal_lib, nal, process_id,
+                      requested_limits, actual_limits);
+        if (rc != PTL_OK) {
+                CERROR("lib_init failed: error %d\n", rc);
+                goto failed;
+        }
+
+        /* lib interface initialised */
+        kibnal_data.kib_init = IBNAL_INIT_LIB;
+        /*****************************************************/
+
+        for (i = 0; i < IBNAL_N_SCHED; i++) {
+                rc = kibnal_thread_start (kibnal_scheduler, (void *)i);
+                if (rc != 0) {
+                        CERROR("Can't spawn iibnal scheduler[%d]: %d\n",
+                               i, rc);
+                        goto failed;
+                }
+        }
+
+        rc = kibnal_thread_start (kibnal_connd, NULL);
+        if (rc != 0) {
+                CERROR ("Can't spawn iibnal connd: %d\n", rc);
+                goto failed;
+        }
+
+        n = sizeof(kibnal_data.kib_hca_guids) /
+            sizeof(kibnal_data.kib_hca_guids[0]);
+        frc = iibt_get_hca_guids(&n, kibnal_data.kib_hca_guids);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't get channel adapter guids: %d\n", frc);
+                goto failed;
+        }
+        if (n == 0) {
+                CERROR ("No channel adapters found\n");
+                goto failed;
+        }
+
+        /* Infinicon has per-HCA rather than per CQ completion handlers */
+        frc = iibt_open_hca(kibnal_data.kib_hca_guids[0],
+                            kibnal_ca_callback,
+                            kibnal_ca_async_callback,
+                            &kibnal_data.kib_hca,
+                            &kibnal_data.kib_hca);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't open CA[0]: %d\n", frc);
+                goto failed;
+        }
+        
+        /* Channel Adapter opened */
+        kibnal_data.kib_init = IBNAL_INIT_HCA;
+        /*****************************************************/
+
+        kibnal_data.kib_hca_attrs.PortAttributesList = NULL;
+        kibnal_data.kib_hca_attrs.PortAttributesListSize = 0;
+        frc = iibt_query_hca(kibnal_data.kib_hca,
+                             &kibnal_data.kib_hca_attrs, NULL);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't size port attrs: %d\n", frc);
+                goto failed;
+        }
+        
+        PORTAL_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList,
+                     kibnal_data.kib_hca_attrs.PortAttributesListSize);
+        if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL)
+                goto failed;
+
+        /* Port attrs allocated */
+        kibnal_data.kib_init = IBNAL_INIT_PORTATTRS;
+        /*****************************************************/
+        
+        frc = iibt_query_hca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs,
+                             NULL);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't get port attrs for CA 0: %d\n", frc);
+                goto failed;
+        }
+
+        for (i = 0, pattr = kibnal_data.kib_hca_attrs.PortAttributesList;
+             pattr != NULL;
+             i++, pattr = pattr->Next) {
+                switch (pattr->PortState) {
+                default:
+                        CERROR("Unexpected port[%d] state %d\n",
+                               i, pattr->PortState);
+                        continue;
+                case PortStateDown:
+                        CDEBUG(D_NET, "port[%d] Down\n", i);
+                        continue;
+                case PortStateInit:
+                        CDEBUG(D_NET, "port[%d] Init\n", i);
+                        continue;
+                case PortStateArmed:
+                        CDEBUG(D_NET, "port[%d] Armed\n", i);
+                        continue;
+                        
+                case PortStateActive:
+                        CDEBUG(D_NET, "port[%d] Active\n", i);
+                        kibnal_data.kib_port = i;
+                        kibnal_data.kib_port_guid = pattr->GUID;
+                        kibnal_data.kib_port_pkey = pattr->PkeyTable[0];
+                        break;
+                }
+                break;
+        }
+
+        if (pattr == NULL) {
+                CERROR ("Can't find an active port\n");
+                goto failed;
+        }
+
+        CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid);
+        
+        /* Active port found */
+        kibnal_data.kib_init = IBNAL_INIT_PORT;
+        /*****************************************************/
+
+        frc = iibt_sd_register(&kibnal_data.kib_sd, NULL);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't register with SD: %d\n", frc);
+                goto failed;
+        }
+        
+        /* Registered with SD OK */
+        kibnal_data.kib_init = IBNAL_INIT_SD;
+        /*****************************************************/
+
+        frc = iibt_pd_allocate(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't create PD: %d\n", frc);
+                goto failed;
+        }
+        
+        /* flag PD initialised */
+        kibnal_data.kib_init = IBNAL_INIT_PD;
+        /*****************************************************/
+
+#if IBNAL_FMR
+        {
+                const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
+                struct ib_fmr_pool_param params = {
+                        .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
+                        .access            = (IB_ACCESS_LOCAL_WRITE |
+                                              IB_ACCESS_REMOTE_WRITE |
+                                              IB_ACCESS_REMOTE_READ),
+                        .pool_size         = pool_size,
+                        .dirty_watermark   = (pool_size * 3)/4,
+                        .flush_function    = NULL,
+                        .flush_arg         = NULL,
+                        .cache             = 1,
+                };
+                rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params,
+                                        &kibnal_data.kib_fmr_pool);
+                if (rc != 0) {
+                        CERROR ("Can't create FMR pool size %d: %d\n", 
+                                pool_size, rc);
+                        goto failed;
+                }
+        }
+
+        /* flag FMR pool initialised */
+        kibnal_data.kib_init = IBNAL_INIT_FMR;
+#endif
+        /*****************************************************/
+        if (IBNAL_WHOLE_MEM) {
+                IB_MR_PHYS_BUFFER phys;
+                IB_ACCESS_CONTROL access;
+                kib_md_t *md = &kibnal_data.kib_md;
+
+                memset(&access, 0, sizeof(access));
+                access.s.MWBindable = 1;
+                access.s.LocalWrite = 1;
+                access.s.RdmaRead = 1;
+                access.s.RdmaWrite = 1;
+
+                phys.PhysAddr = 0;
+                phys.Length = max_phys_mem(&kibnal_data.kib_hca_attrs);
+                if (phys.Length == 0) {
+                        CERROR ("couldn't determine the end of phys mem\n");
+                        goto failed;
+                }
+       
+                rc = iibt_register_contig_physical_memory(kibnal_data.kib_hca,
+                                                          0,
+                                                          &phys, 1,
+                                                          0,
+                                                          kibnal_data.kib_pd,
+                                                          access,
+                                                          &md->md_handle,
+                                                          &md->md_addr,
+                                                          &md->md_lkey,
+                                                          &md->md_rkey);
+                if (rc != FSUCCESS) {
+                        CERROR("registering physical memory failed: %d\n", 
+                               rc);
+                        CERROR("falling back to registration per-rdma\n");
+                        md->md_handle = NULL;
+                } else {
+                        CDEBUG(D_NET, "registered "LPU64" bytes of mem\n",
+                               phys.Length);
+                        kibnal_data.kib_init = IBNAL_INIT_MR;
+                }
+        }
+
+        /*****************************************************/
+
+        rc = kibnal_setup_tx_descs();
+        if (rc != 0) {
+                CERROR ("Can't register tx descs: %d\n", rc);
+                goto failed;
+        }
+        
+        /* flag TX descs initialised */
+        kibnal_data.kib_init = IBNAL_INIT_TXD;
+        /*****************************************************/
+        
+        {
+                uint32 nentries;
+
+                frc = iibt_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
+                                     &kibnal_data.kib_cq, &kibnal_data.kib_cq,
+                                     &nentries);
+                if (frc != FSUCCESS) {
+                        CERROR ("Can't create RX CQ: %d\n", frc);
+                        goto failed;
+                }
+
+                /* flag CQ initialised */
+                kibnal_data.kib_init = IBNAL_INIT_CQ;
+
+                if (nentries < IBNAL_CQ_ENTRIES) {
+                        CERROR ("CQ only has %d entries, need %d\n", 
+                                nentries, IBNAL_CQ_ENTRIES);
+                        goto failed;
+                }
+
+                rc = iibt_cq_rearm(kibnal_data.kib_cq, CQEventSelNextWC);
+                if (rc != 0) {
+                        CERROR ("Failed to re-arm completion queue: %d\n", rc);
+                        goto failed;
+                }
+        }
+        
+        /*****************************************************/
+
+        rc = libcfs_nal_cmd_register(IIBNAL, &kibnal_cmd, NULL);
+        if (rc != 0) {
+                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
+                goto failed;
+        }
+
+        /* flag everything initialised */
+        kibnal_data.kib_init = IBNAL_INIT_ALL;
+        /*****************************************************/
+
+        printk(KERN_INFO "Lustre: Infinicon IB NAL loaded "
+               "(initial mem %d)\n", pkmem);
+
+        return (PTL_OK);
+
+ failed:
+        kibnal_api_shutdown (&kibnal_api);    
+        return (PTL_FAIL);
+}
+
+void __exit
+kibnal_module_fini (void)
+{
+#ifdef CONFIG_SYSCTL
+        if (kibnal_tunables.kib_sysctl != NULL)
+                unregister_sysctl_table (kibnal_tunables.kib_sysctl);
+#endif
+        PtlNIFini(kibnal_ni);
+
+        ptl_unregister_nal(IIBNAL);
+}
+
+int __init
+kibnal_module_init (void)
+{
+        int    rc;
+
+        if (sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN) {
+                CERROR("sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN\n");
+                return -EINVAL;
+        }
+
+        /* the following must be sizeof(int) for proc_dointvec() */
+        if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) {
+                CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n");
+                return -EINVAL;
+        }
+
+        kibnal_api.nal_ni_init = kibnal_api_startup;
+        kibnal_api.nal_ni_fini = kibnal_api_shutdown;
+
+        /* Initialise dynamic tunables to defaults once only */
+        kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
+
+        rc = ptl_register_nal(IIBNAL, &kibnal_api);
+        if (rc != PTL_OK) {
+                CERROR("Can't register IIBNAL: %d\n", rc);
+                return (-ENOMEM);               /* or something... */
+        }
+
+        /* Pure gateways want the NAL started up at module load time... */
+        rc = PtlNIInit(IIBNAL, 0, NULL, NULL, &kibnal_ni);
+        if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
+                ptl_unregister_nal(IIBNAL);
+                return (-ENODEV);
+        }
+        
+#ifdef CONFIG_SYSCTL
+        /* Press on regardless even if registering sysctl doesn't work */
+        kibnal_tunables.kib_sysctl = 
+                register_sysctl_table (kibnal_top_ctl_table, 0);
+#endif
+        return (0);
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Kernel Infinicon IB NAL v0.01");
+MODULE_LICENSE("GPL");
+
+module_init(kibnal_module_init);
+module_exit(kibnal_module_fini);
+
diff --git a/lustre/portals/knals/iibnal/iibnal.h b/lustre/portals/knals/iibnal/iibnal.h
new file mode 100644 (file)
index 0000000..0a25a9a
--- /dev/null
@@ -0,0 +1,892 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#define DEBUG_SUBSYSTEM S_IBNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+#include <portals/nal.h>
+
+#include <linux/iba/ibt.h>
+
+#define GCC_VERSION (__GNUC__ * 10000 \
+                + __GNUC_MINOR__ * 100 \
+                + __GNUC_PATCHLEVEL__)
+
+/* Test for GCC > 3.2.2 */
+#if GCC_VERSION <= 30202
+/* GCC 3.2.2, and presumably several versions before it, will
+ * miscompile this driver. See
+ * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9853. */
+#error Invalid GCC version. Must use GCC >= 3.2.3
+#endif
+
+#define IBNAL_SERVICE_NAME   "iibnal"
+#define IBNAL_SERVICE_NUMBER 0x11b9a1
+
+#ifdef CONFIG_SMP
+# define IBNAL_N_SCHED      num_online_cpus()   /* # schedulers */
+#else
+# define IBNAL_N_SCHED      1                   /* # schedulers */
+#endif
+
+#define IBNAL_MIN_RECONNECT_INTERVAL HZ         /* first failed connection retry... */
+#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ)    /* ...exponentially increasing to this */
+
+#define IBNAL_MSG_SIZE       (4<<10)            /* max size of queued messages (inc hdr) */
+
+#define IBNAL_MSG_QUEUE_SIZE   8                /* # messages/RDMAs in-flight */
+#define IBNAL_CREDIT_HIGHWATER 7                /* when to eagerly return credits */
+/* 7 indicates infinite retry attempts, Infinicon recommended 5 */
+#define IBNAL_RETRY            5                /* # times to retry */
+#define IBNAL_RNR_RETRY        5                /* RNR (receiver-not-ready) retry count */
+#define IBNAL_CM_RETRY         5                /* # times to retry connection */
+#define IBNAL_FLOW_CONTROL     1
+#define IBNAL_ACK_TIMEOUT       20              /* supposedly 4 secs */
+
+#define IBNAL_NTX             64                /* # tx descs */
+/* this had to be dropped down so that we only register < 255 pages per
+ * region.  this will change if we register all memory. */
+#define IBNAL_NTX_NBLK        128               /* # reserved tx descs */
+
+#define IBNAL_PEER_HASH_SIZE  101               /* # peer lists */
+
+#define IBNAL_RESCHED         100               /* # scheduler loops before reschedule */
+
+#define IBNAL_CONCURRENT_PEERS 1000             /* # nodes all talking at once to me */
+
+/* default vals for runtime tunables */
+#define IBNAL_IO_TIMEOUT      50                /* default comms timeout (seconds) */
+
+/************************/
+/* derived constants... */
+
+/* TX messages (shared by all connections) */
+#define IBNAL_TX_MSGS       (IBNAL_NTX + IBNAL_NTX_NBLK)
+#define IBNAL_TX_MSG_BYTES  (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_TX_MSG_PAGES  ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+
+#define IBNAL_TX_MAX_SG (PTL_MD_MAX_IOV + 1)
+
+/* RX messages (per connection) */
+#define IBNAL_RX_MSGS       IBNAL_MSG_QUEUE_SIZE
+#define IBNAL_RX_MSG_BYTES  (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_RX_MSG_PAGES  ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+
+
+/* we may have up to 2 completions per transmit +
+   1 completion per receive, per connection */
+#define IBNAL_CQ_ENTRIES  ((2*IBNAL_TX_MSGS) +                          \
+                           (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS))
+
+#define IBNAL_RDMA_BASE  0x0eeb0000
+#define IBNAL_FMR        0
+#define IBNAL_WHOLE_MEM  1
+#define IBNAL_CKSUM      0
+//#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_PROCESS
+#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_INTERRUPT
+
+/* XXX I have no idea. */
+#define IBNAL_STARTING_PSN 1
+
+typedef struct 
+{
+        int               kib_io_timeout;       /* comms timeout (seconds) */
+        struct ctl_table_header *kib_sysctl;    /* sysctl interface */
+} kib_tunables_t;
+
+/* some of these have specific types in the stack that just map back
+ * to the uFOO types, like IB_{L,R}_KEY. */
+typedef struct
+{
+        int               ibp_npages;           /* # pages */
+        int               ibp_mapped;           /* mapped? */
+        __u64             ibp_vaddr;            /* mapped region vaddr */
+        __u32             ibp_lkey;             /* mapped region lkey */
+        __u32             ibp_rkey;             /* mapped region rkey */
+        IB_HANDLE         ibp_handle;           /* mapped region handle */
+        struct page      *ibp_pages[0];
+} kib_pages_t;
+
+typedef struct
+{
+        IB_HANDLE         md_handle;
+        __u32             md_lkey;
+        __u32             md_rkey;
+        __u64             md_addr;
+} kib_md_t __attribute__((packed));
+        
+typedef struct 
+{
+        int               kib_init;             /* initialisation state */
+        __u64             kib_incarnation;      /* which one am I */
+        int               kib_shutdown;         /* shut down? */
+        atomic_t          kib_nthreads;         /* # live threads */
+
+        __u64             kib_service_id;       /* service number I listen on */
+        __u64             kib_port_guid;        /* my GUID (lo 64 of GID)*/
+        __u16             kib_port_pkey;        /* my pkey, whatever that is */
+        ptl_nid_t         kib_nid;              /* my NID */
+        struct semaphore  kib_nid_mutex;        /* serialise NID ops */
+        struct semaphore  kib_nid_signal;       /* signal completion */
+        IB_HANDLE         kib_cep;              /* connection end point */
+
+        rwlock_t          kib_global_lock;      /* stabilize peer/conn ops */
+
+        struct list_head *kib_peers;            /* hash table of all my known peers */
+        int               kib_peer_hash_size;   /* size of kib_peers */
+        atomic_t          kib_npeers;           /* # peers extant */
+        atomic_t          kib_nconns;           /* # connections extant */
+
+        struct list_head  kib_connd_conns;      /* connections to progress */
+        struct list_head  kib_connd_peers;      /* peers waiting for a connection */
+        wait_queue_head_t kib_connd_waitq;      /* connection daemons sleep here */
+        unsigned long     kib_connd_waketime;   /* when connd will wake */
+        spinlock_t        kib_connd_lock;       /* serialise */
+
+        wait_queue_head_t kib_sched_waitq;      /* schedulers sleep here */
+        struct list_head  kib_sched_txq;        /* tx requiring attention */
+        struct list_head  kib_sched_rxq;        /* rx requiring attention */
+        spinlock_t        kib_sched_lock;       /* serialise */
+        
+        struct kib_tx    *kib_tx_descs;         /* all the tx descriptors */
+        kib_pages_t      *kib_tx_pages;         /* premapped tx msg pages */
+
+        struct list_head  kib_idle_txs;         /* idle tx descriptors */
+        struct list_head  kib_idle_nblk_txs;    /* idle reserved tx descriptors */
+        wait_queue_head_t kib_idle_tx_waitq;    /* block here for tx descriptor */
+        __u64             kib_next_tx_cookie;   /* RDMA completion cookie */
+        spinlock_t        kib_tx_lock;          /* serialise */
+        
+        IB_HANDLE         kib_hca;              /* The HCA */
+        int               kib_port;             /* port on the device */
+        IB_HANDLE         kib_pd;               /* protection domain */
+        IB_HANDLE         kib_sd;               /* SD handle */
+        IB_HANDLE         kib_cq;               /* completion queue */
+        kib_md_t          kib_md;               /* full-mem registration */
+
+        void             *kib_listen_handle;    /* where I listen for connections */
+
+        IBT_INTERFACE_UNION kib_interfaces;     /* The Infinicon IBT interface */
+
+        uint64              kib_hca_guids[8];   /* all the HCA guids */
+        IB_CA_ATTRIBUTES    kib_hca_attrs;      /* where to get HCA attrs */
+        FABRIC_OPERATION_DATA kib_fabopdata;    /* (un)advertise service record */
+} kib_data_t;
+
+#define IBNAL_INIT_NOTHING         0
+#define IBNAL_INIT_DATA            1
+#define IBNAL_INIT_LIB             2
+#define IBNAL_INIT_HCA             3
+#define IBNAL_INIT_PORTATTRS       4
+#define IBNAL_INIT_PORT            5
+#define IBNAL_INIT_SD              6
+#define IBNAL_INIT_PD              7
+#define IBNAL_INIT_FMR             8
+#define IBNAL_INIT_MR              9
+#define IBNAL_INIT_TXD             10 
+#define IBNAL_INIT_CQ              11 
+#define IBNAL_INIT_ALL             12 
+
+/************************************************************************
+ * Wire message structs.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ * CAVEAT EMPTOR: other structs communicated between nodes (e.g. MAD
+ * private data and SM service info), is LE on the wire.
+ */
+
+/* also kib_md_t above */
+
+typedef struct
+{
+        __u32                 rd_key;           /* remote key */
+        __u32                 rd_nob;           /* # of bytes */
+        __u64                 rd_addr;          /* remote io vaddr */
+} kib_rdma_desc_t __attribute__((packed));
+
+typedef struct
+{
+        ptl_hdr_t         ibim_hdr;             /* portals header */
+        char              ibim_payload[0];      /* piggy-backed payload */
+} kib_immediate_msg_t __attribute__((packed));
+
+/* these arrays serve two purposes during rdma.  they are built on the passive
+ * side and sent to the active side as remote arguments.  On the active side
+ * the descs are used as a data structure on the way to local gather items. 
+ * the different roles result in split local/remote meaning of desc->rd_key */
+typedef struct
+{
+        ptl_hdr_t         ibrm_hdr;             /* portals header */
+        __u64             ibrm_cookie;          /* opaque completion cookie */
+        __u32             ibrm_num_descs;       /* how many descs */
+        kib_rdma_desc_t   ibrm_desc[0];         /* where to suck/blow */
+} kib_rdma_msg_t __attribute__((packed));
+
+#define kib_rdma_msg_len(num_descs) \
+        offsetof(kib_msg_t, ibm_u.rdma.ibrm_desc[num_descs])
+
+typedef struct
+{
+        __u64             ibcm_cookie;          /* opaque completion cookie */
+        __u32             ibcm_status;          /* completion status */
+} kib_completion_msg_t __attribute__((packed));
+
+typedef struct
+{
+        __u32              ibm_magic;           /* I'm an openibnal message */
+        __u16              ibm_version;         /* this is my version number */
+        __u8               ibm_type;            /* msg type */
+        __u8               ibm_credits;         /* returned credits */
+#if IBNAL_CKSUM
+        __u32              ibm_nob;
+        __u32              ibm_cksum;
+#endif
+        union {
+                kib_immediate_msg_t   immediate;
+                kib_rdma_msg_t        rdma;
+                kib_completion_msg_t  completion;
+        } ibm_u __attribute__((packed));
+} kib_msg_t __attribute__((packed));
+
+#define IBNAL_MSG_MAGIC       0x0be91b91        /* unique magic */
+#define IBNAL_MSG_VERSION              1        /* current protocol version */
+
+#define IBNAL_MSG_NOOP              0xd0        /* nothing (just credits) */
+#define IBNAL_MSG_IMMEDIATE         0xd1        /* portals hdr + payload */
+#define IBNAL_MSG_PUT_RDMA          0xd2        /* portals PUT hdr + source rdma desc */
+#define IBNAL_MSG_PUT_DONE          0xd3        /* signal PUT rdma completion */
+#define IBNAL_MSG_GET_RDMA          0xd4        /* portals GET hdr + sink rdma desc */
+#define IBNAL_MSG_GET_DONE          0xd5        /* signal GET rdma completion */
+
+/***********************************************************************/
+
+typedef struct kib_rx                           /* receive message */
+{
+        struct list_head          rx_list;      /* queue for attention */
+        struct kib_conn          *rx_conn;      /* owning conn */
+        int                       rx_rdma;      /* RDMA completion posted? */
+        int                       rx_posted;    /* posted? */
+        __u64                     rx_vaddr;     /* pre-mapped buffer (hca vaddr) */
+        kib_msg_t                *rx_msg;       /* pre-mapped buffer (host vaddr) */
+        IB_WORK_REQ               rx_wrq;
+        IB_LOCAL_DATASEGMENT      rx_gl;        /* and it's memory */
+} kib_rx_t;
+
+typedef struct kib_tx                           /* transmit message */
+{
+        struct list_head          tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
+        int                       tx_isnblk;    /* I'm reserved for non-blocking sends */
+        struct kib_conn          *tx_conn;      /* owning conn */
+        int                       tx_mapped;    /* mapped for RDMA? */
+        int                       tx_sending;   /* # tx callbacks outstanding */
+        int                       tx_status;    /* completion status */
+        unsigned long             tx_deadline;  /* completion deadline */
+        int                       tx_passive_rdma; /* peer sucks/blows */
+        int                       tx_passive_rdma_wait; /* waiting for peer to complete */
+        __u64                     tx_passive_rdma_cookie; /* completion cookie */
+        lib_msg_t                *tx_libmsg[2]; /* lib msgs to finalize on completion */
+        kib_md_t                  tx_md;        /* RDMA mapping (active/passive) */
+        __u64                     tx_vaddr;     /* pre-mapped buffer (hca vaddr) */
+        kib_msg_t                *tx_msg;       /* pre-mapped buffer (host vaddr) */
+        int                       tx_nsp;       /* # send work items */
+        IB_WORK_REQ               tx_wrq[IBNAL_TX_MAX_SG];    /* send work items... */
+        IB_LOCAL_DATASEGMENT      tx_gl[IBNAL_TX_MAX_SG];     /* ...and their memory */
+} kib_tx_t;
+
+#define KIB_TX_UNMAPPED       0
+#define KIB_TX_MAPPED         1
+#define KIB_TX_MAPPED_FMR     2
+
+typedef struct kib_wire_connreq
+{
+        __u32        wcr_magic;                 /* I'm an openibnal connreq */
+        __u16        wcr_version;               /* this is my version number */
+        __u16        wcr_queue_depth;           /* this is my receive queue size */
+        __u64        wcr_nid;                   /* peer's NID */
+        __u64        wcr_incarnation;           /* peer's incarnation */
+} kib_wire_connreq_t;
+
+typedef struct kib_gid
+{
+        __u64   hi, lo;
+} kib_gid_t;
+
+typedef struct kib_connreq
+{
+        /* connection-in-progress */
+        struct kib_conn                    *cr_conn;
+        kib_wire_connreq_t                  cr_wcr;
+        __u64                               cr_tid;
+        IB_SERVICE_RECORD                   cr_service;
+        kib_gid_t                           cr_gid;
+        IB_PATH_RECORD                      cr_path;
+        CM_REQUEST_INFO                     cr_cmreq;
+        CM_CONN_INFO                        cr_discarded;
+        CM_REJECT_INFO                      cr_rej_info;
+} kib_connreq_t;
+
+typedef struct kib_conn
+{ 
+        struct kib_peer    *ibc_peer;           /* owning peer */
+        struct list_head    ibc_list;           /* stash on peer's conn list */
+        __u64               ibc_incarnation;    /* which instance of the peer */
+        atomic_t            ibc_refcount;       /* # users */
+        int                 ibc_state;          /* what's happening */
+        atomic_t            ibc_nob;            /* # bytes buffered */
+        int                 ibc_nsends_posted;  /* # uncompleted sends */
+        int                 ibc_credits;        /* # credits I have */
+        int                 ibc_outstanding_credits; /* # credits to return */
+        int                 ibc_rcvd_disconnect;/* received discon request */
+        int                 ibc_sent_disconnect;/* sent discon request */
+        struct list_head    ibc_tx_queue;       /* send queue */
+        struct list_head    ibc_active_txs;     /* active tx awaiting completion */
+        spinlock_t          ibc_lock;           /* serialise */
+        kib_rx_t           *ibc_rxs;            /* the rx descs */
+        kib_pages_t        *ibc_rx_pages;       /* premapped rx msg pages */
+        IB_HANDLE           ibc_qp;             /* queue pair */
+        IB_HANDLE           ibc_cep;            /* connection ID? */
+        IB_QP_ATTRIBUTES_QUERY ibc_qp_attrs;    /* QP attrs */
+        kib_connreq_t      *ibc_connreq;        /* connection request state */
+} kib_conn_t;
+
+#define IBNAL_CONN_INIT_NOTHING      0          /* initial state */
+#define IBNAL_CONN_INIT_QP           1          /* ibc_qp set up */
+#define IBNAL_CONN_CONNECTING        2          /* started to connect */
+#define IBNAL_CONN_ESTABLISHED       3          /* connection established */
+#define IBNAL_CONN_SEND_DREQ         4          /* to send disconnect req */
+#define IBNAL_CONN_DREQ              5          /* sent disconnect req */
+#define IBNAL_CONN_DREP              6          /* sent disconnect rep */
+#define IBNAL_CONN_DISCONNECTED      7          /* no more QP or CM traffic */
+
+#define KIB_ASSERT_CONN_STATE(conn, state) do {                         \
+        LASSERTF((conn)->ibc_state == state, "%d\n", conn->ibc_state);  \
+} while (0)
+
+#define KIB_ASSERT_CONN_STATE_RANGE(conn, low, high) do {               \
+        LASSERTF(low <= high, "%d %d\n", low, high);                    \
+        LASSERTF((conn)->ibc_state >= low && (conn)->ibc_state <= high, \
+                 "%d\n", conn->ibc_state);                              \
+} while (0)
+
+typedef struct kib_peer
+{
+        struct list_head    ibp_list;           /* stash on global peer list */
+        struct list_head    ibp_connd_list;     /* schedule on kib_connd_peers */
+        ptl_nid_t           ibp_nid;            /* who's on the other end(s) */
+        atomic_t            ibp_refcount;       /* # users */
+        int                 ibp_persistence;    /* "known" peer refs */
+        struct list_head    ibp_conns;          /* all active connections */
+        struct list_head    ibp_tx_queue;       /* msgs waiting for a conn */
+        int                 ibp_connecting;     /* connecting+accepting */
+        unsigned long       ibp_reconnect_time; /* when reconnect may be attempted */
+        unsigned long       ibp_reconnect_interval; /* exponential backoff */
+} kib_peer_t;
+
+
+extern lib_nal_t       kibnal_lib;
+extern kib_data_t      kibnal_data;
+extern kib_tunables_t  kibnal_tunables;
+
+/******************************************************************************/
+/* Infinicon IBT interface wrappers */
+#define IIBT_IF (kibnal_data.kib_interfaces.ver2)
+
+static inline FSTATUS
+iibt_get_hca_guids(uint32 *hca_count, EUI64 *hca_guid_list)
+{
+        return IIBT_IF.GetCaGuids(hca_count, hca_guid_list);
+}
+
+static inline FSTATUS
+iibt_open_hca(EUI64                    hca_guid, 
+             IB_COMPLETION_CALLBACK   completion_callback,
+             IB_ASYNC_EVENT_CALLBACK  async_event_callback,
+             void                    *arg,
+             IB_HANDLE               *handle)
+{
+        return IIBT_IF.Vpi.OpenCA(hca_guid, completion_callback,
+                                  async_event_callback, arg, handle);
+}
+
+static inline FSTATUS
+iibt_query_hca(IB_HANDLE hca_handle, IB_CA_ATTRIBUTES *attrs, void **argp)
+{
+        return IIBT_IF.Vpi.QueryCA(hca_handle, attrs, argp);
+}
+
+static inline FSTATUS
+iibt_close_hca(IB_HANDLE hca_handle)
+{
+        return IIBT_IF.Vpi.CloseCA(hca_handle);
+}
+
+/* Protection-domain and memory-registration wrappers (verbs interface) */
+static inline FSTATUS
+iibt_pd_allocate(IB_HANDLE hca_handle, __u32 max_avs, IB_HANDLE *pd_handle)
+{
+        return IIBT_IF.Vpi.AllocatePD(hca_handle, max_avs, pd_handle);
+}
+
+static inline FSTATUS
+iibt_pd_free(IB_HANDLE pd_handle)
+{
+        return IIBT_IF.Vpi.FreePD(pd_handle);
+}
+
+/* Register a list of physical buffers; returns the mapped I/O virtual
+ * address plus local/remote keys for use in work requests */
+static inline FSTATUS
+iibt_register_physical_memory(IB_HANDLE hca_handle, 
+                              IB_VIRT_ADDR requested_io_va,
+                              void *phys_buffers, uint64 nphys_buffers,
+                              uint32 io_va_offset, IB_HANDLE pd_handle,
+                              IB_ACCESS_CONTROL access,
+                              IB_HANDLE *mem_handle, 
+                              IB_VIRT_ADDR *actual_io_va,
+                              IB_L_KEY *lkey, IB_R_KEY *rkey)
+{
+        return IIBT_IF.Vpi.RegisterPhysMemRegion(hca_handle, requested_io_va,
+                                                 phys_buffers, nphys_buffers,
+                                                 io_va_offset, pd_handle, 
+                                                 access,
+                                                 mem_handle, actual_io_va,
+                                                 lkey, rkey);
+}
+
+/* As above, but for IB_MR_PHYS_BUFFER descriptors of contiguous regions */
+static inline FSTATUS
+iibt_register_contig_physical_memory(IB_HANDLE hca_handle, 
+                                     IB_VIRT_ADDR requested_io_va,
+                                     IB_MR_PHYS_BUFFER *phys_buffers, 
+                                     uint64 nphys_buffers,
+                                     uint32 io_va_offset, IB_HANDLE pd_handle,
+                                     IB_ACCESS_CONTROL access,
+                                     IB_HANDLE *mem_handle, 
+                                     IB_VIRT_ADDR *actual_io_va,
+                                     IB_L_KEY *lkey, IB_R_KEY *rkey)
+{
+        return IIBT_IF.Vpi.RegisterContigPhysMemRegion(hca_handle, 
+                                                       requested_io_va,
+                                                       phys_buffers, 
+                                                       nphys_buffers,
+                                                       io_va_offset, pd_handle, 
+                                                       access,
+                                                       mem_handle, actual_io_va,
+                                                       lkey, rkey);
+}
+
+/* Register a virtually-contiguous buffer */
+static inline FSTATUS
+iibt_register_memory(IB_HANDLE hca_handle, 
+                     void *virt_addr, unsigned int length,
+                     IB_HANDLE pd_handle,
+                     IB_ACCESS_CONTROL access,
+                     IB_HANDLE *mem_handle, 
+                     IB_L_KEY *lkey, IB_R_KEY *rkey)
+{
+        return IIBT_IF.Vpi.RegisterMemRegion(hca_handle, 
+                                             virt_addr, length,
+                                             pd_handle, 
+                                             access,
+                                             mem_handle,
+                                             lkey, rkey);
+}
+
+static inline FSTATUS
+iibt_deregister_memory(IB_HANDLE mem_handle)
+{
+        return IIBT_IF.Vpi.DeregisterMemRegion(mem_handle);
+}
+
+/* Completion-queue, queue-pair and work-request posting wrappers */
+static inline FSTATUS
+iibt_cq_create(IB_HANDLE hca_handle, uint32 requested_size,
+              void *arg, IB_HANDLE *cq_handle, uint32 *actual_size)
+{
+        return IIBT_IF.Vpi.CreateCQ(hca_handle, requested_size,
+                                   arg, cq_handle, actual_size);
+}
+
+static inline FSTATUS
+iibt_cq_poll(IB_HANDLE cq_handle, IB_WORK_COMPLETION *wc)
+{
+        return IIBT_IF.Vpi.PollCQ(cq_handle, wc);
+}
+
+/* Re-arm the CQ to raise its completion callback on the selected events */
+static inline FSTATUS
+iibt_cq_rearm(IB_HANDLE cq_handle, IB_CQ_EVENT_SELECT select)
+{
+        return IIBT_IF.Vpi.RearmCQ(cq_handle, select);
+}
+
+static inline FSTATUS
+iibt_cq_destroy(IB_HANDLE cq_handle)
+{
+        return IIBT_IF.Vpi.DestroyCQ(cq_handle);
+}
+
+/* NOTE(review): the out-parameter is named 'cq_handle' but it receives the
+ * QP handle created by CreateQP -- presumably a misnomer; confirm. */
+static inline FSTATUS
+iibt_qp_create(IB_HANDLE hca_handle, IB_QP_ATTRIBUTES_CREATE *create_attr,
+              void *arg, IB_HANDLE *cq_handle, 
+              IB_QP_ATTRIBUTES_QUERY *query_attr)
+{
+        return IIBT_IF.Vpi.CreateQP(hca_handle, create_attr, arg, cq_handle, 
+                                    query_attr);
+}
+
+static inline FSTATUS
+iibt_qp_query(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_QUERY *query_attr,
+              void **arg_ptr)
+{
+        return IIBT_IF.Vpi.QueryQP(qp_handle, query_attr, arg_ptr);
+}
+
+static inline FSTATUS
+iibt_qp_modify(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_MODIFY *modify_attr,
+               IB_QP_ATTRIBUTES_QUERY *query_attr)
+{
+        return IIBT_IF.Vpi.ModifyQP(qp_handle, modify_attr, query_attr);
+}
+
+static inline FSTATUS
+iibt_qp_destroy(IB_HANDLE qp_handle)
+{
+        return IIBT_IF.Vpi.DestroyQP(qp_handle);
+}
+
+static inline FSTATUS
+iibt_postrecv(IB_HANDLE qp_handle, IB_WORK_REQ *work_req)
+{
+        return IIBT_IF.Vpi.PostRecv(qp_handle, work_req);
+}
+
+static inline FSTATUS
+iibt_postsend(IB_HANDLE qp_handle, IB_WORK_REQ *work_req)
+{
+        return IIBT_IF.Vpi.PostSend(qp_handle, work_req);
+}
+
+/* Subnet driver (Sdi) wrappers: fabric service-record operations/queries */
+static inline FSTATUS
+iibt_sd_register(IB_HANDLE *sd_handle, CLIENT_CONTROL_PARAMETERS *p)
+{
+        return IIBT_IF.Sdi.Register(sd_handle, p);
+}
+
+static inline FSTATUS
+iibt_sd_deregister(IB_HANDLE sd_handle)
+{
+        return IIBT_IF.Sdi.Deregister(sd_handle);
+}
+
+/* Issue a fabric operation (e.g. service record add/remove) on a port;
+ * 'callback' is invoked with 'arg' when the operation completes */
+static inline FSTATUS
+iibt_sd_port_fabric_operation(IB_HANDLE sd_handle, EUI64 port_guid,
+                              FABRIC_OPERATION_DATA *fod,
+                              PFABRIC_OPERATION_CALLBACK callback,
+                              COMMAND_CONTROL_PARAMETERS *p, void *arg)
+{
+        return IIBT_IF.Sdi.PortFabricOperation(sd_handle, port_guid,
+                                               fod, callback, p, arg);
+}
+
+/* Asynchronous fabric query; 'callback' is invoked with 'arg' on completion */
+static inline FSTATUS
+iibt_sd_query_port_fabric_information(IB_HANDLE sd_handle, EUI64 port_guid,
+                                      QUERY *qry,
+                                      PQUERY_CALLBACK callback,
+                                      COMMAND_CONTROL_PARAMETERS *p, void *arg)
+{
+        return IIBT_IF.Sdi.QueryPortFabricInformation(sd_handle, port_guid,
+                                                      qry, callback, p, arg);
+}
+
+/* Connection manager (Cmi) wrappers: CEP lifecycle and the
+ * connect/listen/accept/reject/disconnect handshake */
+static inline IB_HANDLE
+iibt_cm_create_cep(CM_CEP_TYPE type)
+{
+        return IIBT_IF.Cmi.CmCreateCEP(type);
+}
+
+static inline FSTATUS
+iibt_cm_modify_cep(IB_HANDLE cep, uint32 attr, char* value, uint32 len,
+                   uint32 offset)
+{
+        return IIBT_IF.Cmi.CmModifyCEP(cep, attr, value, len, offset);
+}
+
+static inline FSTATUS
+iibt_cm_destroy_cep(IB_HANDLE cep_handle)
+{
+        return IIBT_IF.Cmi.CmDestroyCEP(cep_handle);
+}
+
+static inline FSTATUS
+iibt_cm_listen(IB_HANDLE cep, CM_LISTEN_INFO *info,
+               PFN_CM_CALLBACK callback, void *arg)
+{
+        return IIBT_IF.Cmi.CmListen(cep, info, callback, arg);
+}
+
+static inline FSTATUS
+iibt_cm_cancel(IB_HANDLE cep)
+{
+        return IIBT_IF.Cmi.CmCancel(cep);
+}
+
+/* Accept an incoming connection request; *new_cep is the connected endpoint */
+static inline FSTATUS
+iibt_cm_accept(IB_HANDLE cep, 
+               CM_CONN_INFO *send_info, CM_CONN_INFO *recv_info,
+               PFN_CM_CALLBACK callback, void *arg,
+               IB_HANDLE *new_cep)
+{
+        return IIBT_IF.Cmi.CmAccept(cep,
+                                    send_info, recv_info,
+                                    callback, arg, new_cep);
+}
+
+static inline FSTATUS
+iibt_cm_reject(IB_HANDLE cep, CM_REJECT_INFO *rej)
+{
+        return IIBT_IF.Cmi.CmReject(cep, rej);
+}
+
+static inline FSTATUS
+iibt_cm_disconnect(IB_HANDLE cep, CM_DREQUEST_INFO *req,
+                   CM_DREPLY_INFO *reply)
+{
+        return IIBT_IF.Cmi.CmDisconnect(cep, req, reply);
+}
+
+static inline FSTATUS
+iibt_cm_connect (IB_HANDLE cep, CM_REQUEST_INFO *req,
+                 PFN_CM_CALLBACK callback, void *arg)
+{
+        return IIBT_IF.Cmi.CmConnect (cep, req, callback, arg);
+}
+
+/* True iff this (reliable-connected send) work request asked for a
+ * signalled completion, i.e. will generate a CQ entry */
+static inline int wrq_signals_completion(IB_WORK_REQ *wrq)
+{
+        return wrq->Req.SendRC.Options.s.SignaledCompletion == 1;
+}
+
+
+/******************************************************************************/
+
+/* Peer reference counting.  These are macros that purposely avoid local
+ * variables so they don't increase stack consumption in their callers.
+ * kib_peer_decref() destroys the peer when the last reference is dropped. */
+
+#define kib_peer_addref(peer) do {                                      \
+        LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n",          \
+                 atomic_read(&peer->ibp_refcount));                     \
+        CDEBUG(D_NET, "++peer[%p] -> "LPX64" (%d)\n",                   \
+               peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \
+        atomic_inc(&peer->ibp_refcount);                                \
+} while (0)
+
+#define kib_peer_decref(peer) do {                                      \
+        LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n",          \
+                 atomic_read(&peer->ibp_refcount));                     \
+        CDEBUG(D_NET, "--peer[%p] -> "LPX64" (%d)\n",                   \
+               peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \
+        if (atomic_dec_and_test (&peer->ibp_refcount)) {                \
+                CDEBUG (D_NET, "destroying peer "LPX64" %p\n",          \
+                        peer->ibp_nid, peer);                           \
+                kibnal_destroy_peer (peer);                             \
+        }                                                               \
+} while (0)
+
+/******************************************************************************/
+
+/* Map a NID to its bucket in the peer hash table */
+static inline struct list_head *
+kibnal_nid2peerlist (ptl_nid_t nid) 
+{
+        unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size;
+        
+        return (&kibnal_data.kib_peers [hash]);
+}
+
+static inline int
+kibnal_peer_active(kib_peer_t *peer)
+{
+        /* Am I in the peer hash table? */
+        return (!list_empty(&peer->ibp_list));
+}
+
+/* Add tx to conn's send queue and start its I/O timeout.  Caller presumably
+ * holds ibc_lock (the "_locked" suffix) -- confirm at call sites. */
+static inline void
+kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
+{
+        /* CAVEAT EMPTOR: tx takes caller's ref on conn */
+
+        LASSERT (tx->tx_nsp > 0);               /* work items set up */
+        LASSERT (tx->tx_conn == NULL);          /* only set here */
+
+        tx->tx_conn = conn;
+        tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
+        list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+}
+
+/* Service-record component mask: match on the service name plus the first
+ * eight ServiceData8 bytes (which carry the little-endian NID, below) */
+#define KIBNAL_SERVICE_KEY_MASK  (IB_SERVICE_RECORD_COMP_SERVICENAME |          \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_1 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_2 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_3 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_4 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_5 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_6 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_7 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_8)
+
+/* Location of the 64-bit NID within a service record's data bytes */
+static inline __u64*
+kibnal_service_nid_field(IB_SERVICE_RECORD *srv)
+{
+        /* must be consistent with KIBNAL_SERVICE_KEY_MASK */
+        return (__u64 *)srv->ServiceData8;
+}
+
+
+/* Fill in the fields of 'srv' covered by KIBNAL_SERVICE_KEY_MASK:
+ * the well-known service name and the target NID (little-endian) */
+static inline void
+kibnal_set_service_keys(IB_SERVICE_RECORD *srv, ptl_nid_t nid)
+{
+        LASSERT (strlen(IBNAL_SERVICE_NAME) < sizeof(srv->ServiceName));
+        memset (srv->ServiceName, 0, sizeof(srv->ServiceName));
+        strcpy (srv->ServiceName, IBNAL_SERVICE_NAME);
+
+        *kibnal_service_nid_field(srv) = cpu_to_le64(nid);
+}
+
+/* Disabled debug helper: queries a connection's QP and prints its RDMA
+ * read/write capability flags.  Compiled out; references the older
+ * ib_qp_query/TS_IB_* API rather than the IIBT wrappers above. */
+#if 0
+static inline void
+kibnal_show_rdma_attr (kib_conn_t *conn)
+{
+        struct ib_qp_attribute qp_attr;
+        int                    rc;
+        
+        memset (&qp_attr, 0, sizeof(qp_attr));
+        rc = ib_qp_query(conn->ibc_qp, &qp_attr);
+        if (rc != 0) {
+                CERROR ("Can't get qp attrs: %d\n", rc);
+                return;
+        }
+        
+        CWARN ("RDMA CAPABILITY: write %s read %s\n",
+               (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ?
+               (qp_attr.enable_rdma_write ? "enabled" : "disabled") : "invalid",
+               (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ?
+               (qp_attr.enable_rdma_read ? "enabled" : "disabled") : "invalid");
+}
+#endif
+
+/* Physical address of a page, x86 only.  Assumes a single contiguous
+ * mem_map (no discontigmem/highmem handling) -- other arches fail to
+ * build via the #error below. */
+#if CONFIG_X86
+static inline __u64
+kibnal_page2phys (struct page *p)
+{
+        __u64 page_number = p - mem_map;
+        
+        return (page_number << PAGE_SHIFT);
+}
+#else
+# error "no page->phys"
+#endif
+
+/* CAVEAT EMPTOR:
+ * We rely on tx/rx descriptor alignment to allow us to use the lowest bit
+ * of the work request id as a flag to determine if the completion is for a
+ * transmit or a receive.  It seems that the CQ entry's 'op' field
+ * isn't always set correctly on completions that occur after QP teardown. */
+
+/* Encode a descriptor pointer plus rx/tx flag into a work request id.
+ * Relies on descriptor alignment: the low bit of the pointer must be 0. */
+static inline __u64
+kibnal_ptr2wreqid (void *ptr, int isrx)
+{
+        unsigned long lptr = (unsigned long)ptr;
+
+        LASSERT ((lptr & 1) == 0);
+        return (__u64)(lptr | (isrx ? 1 : 0));
+}
+
+/* Recover the descriptor pointer by masking off the rx/tx flag bit */
+static inline void *
+kibnal_wreqid2ptr (__u64 wreqid)
+{
+        return (void *)(((unsigned long)wreqid) & ~1UL);
+}
+
+/* Low bit set => the completion is for a receive descriptor */
+static inline int
+kibnal_wreqid_is_rx (__u64 wreqid)
+{
+        return (wreqid & 1) != 0;
+}
+
+/* Non-NULL md_handle => a global memory descriptor is registered
+ * (presumably covering all of memory -- confirm against setup code) */
+static inline int
+kibnal_whole_mem(void)
+{
+        return kibnal_data.kib_md.md_handle != NULL;
+}
+
+/* Prototypes implemented in the iibnal .c files.
+ * NOTE(review): kibnal_destroy_conn is declared twice in this list --
+ * harmless but redundant. */
+extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid);
+extern void kibnal_destroy_peer (kib_peer_t *peer);
+extern int kibnal_del_peer (ptl_nid_t nid, int single_share);
+extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid);
+extern void kibnal_unlink_peer_locked (kib_peer_t *peer);
+extern int  kibnal_close_stale_conns_locked (kib_peer_t *peer, 
+                                              __u64 incarnation);
+extern kib_conn_t *kibnal_create_conn (void);
+extern void kibnal_put_conn (kib_conn_t *conn);
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+void kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg);
+
+extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access);
+extern void kibnal_free_pages (kib_pages_t *p);
+
+extern void kibnal_check_sends (kib_conn_t *conn);
+extern void kibnal_close_conn_locked (kib_conn_t *conn, int error);
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+extern int  kibnal_thread_start (int (*fn)(void *arg), void *arg);
+extern int  kibnal_scheduler(void *arg);
+extern int  kibnal_connd (void *arg);
+extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob);
+extern void kibnal_close_conn (kib_conn_t *conn, int why);
+extern void kibnal_start_active_rdma (int type, int status, 
+                                      kib_rx_t *rx, lib_msg_t *libmsg, 
+                                      unsigned int niov, 
+                                      struct iovec *iov, ptl_kiov_t *kiov,
+                                      size_t offset, size_t nob);
+
+void kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev);
+void kibnal_ca_callback (void *ca_arg, void *cq_arg);
diff --git a/lustre/portals/knals/iibnal/iibnal_cb.c b/lustre/portals/knals/iibnal/iibnal_cb.c
new file mode 100644 (file)
index 0000000..a827ba5
--- /dev/null
@@ -0,0 +1,3018 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "iibnal.h"
+
+/*
+ *  LIB functions follow
+ *
+ */
+/* Hand a completed tx to the scheduler thread for finalisation.  Used when
+ * kibnal_tx_done() runs in IRQ context and can't deregister memory itself. */
+static void
+kibnal_schedule_tx_done (kib_tx_t *tx)
+{
+        unsigned long flags;
+
+        spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);
+
+        list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
+        wake_up (&kibnal_data.kib_sched_waitq);
+
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+}
+
+/* Finalise a tx descriptor: unmap its memory, complete its libmsgs with
+ * PTL_OK/PTL_FAIL according to tx_status, drop its conn ref, then reset it
+ * and return it to the appropriate idle pool.  May defer itself to the
+ * scheduler thread (kibnal_schedule_tx_done) when called in IRQ context. */
+static void
+kibnal_tx_done (kib_tx_t *tx)
+{
+        ptl_err_t        ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
+        unsigned long    flags;
+        int              i;
+        FSTATUS          frc;
+
+        LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting callback */
+        LASSERT (!tx->tx_passive_rdma_wait);    /* mustn't be awaiting RDMA */
+
+        switch (tx->tx_mapped) {
+        default:
+                LBUG();
+
+        case KIB_TX_UNMAPPED:
+                break;
+
+        case KIB_TX_MAPPED:
+                if (in_interrupt()) {
+                        /* can't deregister memory in IRQ context... */
+                        kibnal_schedule_tx_done(tx);
+                        return;
+                }
+                frc = iibt_deregister_memory(tx->tx_md.md_handle);
+                LASSERT (frc == FSUCCESS);
+                tx->tx_mapped = KIB_TX_UNMAPPED;
+                break;
+
+#if IBNAL_FMR
+        case KIB_TX_MAPPED_FMR:
+                if (in_interrupt() && tx->tx_status != 0) {
+                        /* can't flush FMRs in IRQ context... */
+                        kibnal_schedule_tx_done(tx);
+                        return;
+                }              
+
+                /* NOTE(review): 'rc' is not declared anywhere in this
+                 * function -- this branch cannot compile if IBNAL_FMR
+                 * is ever enabled. */
+                rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr);
+                LASSERT (rc == 0);
+
+                if (tx->tx_status != 0)
+                        ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
+                tx->tx_mapped = KIB_TX_UNMAPPED;
+                break;
+#endif
+        }
+
+        for (i = 0; i < 2; i++) {
+                /* tx may have up to 2 libmsgs to finalise */
+                if (tx->tx_libmsg[i] == NULL)
+                        continue;
+
+                lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
+                tx->tx_libmsg[i] = NULL;
+        }
+        
+        if (tx->tx_conn != NULL) {
+                kibnal_put_conn (tx->tx_conn);
+                tx->tx_conn = NULL;
+        }
+
+        /* reset for reuse before returning to an idle pool */
+        tx->tx_nsp = 0;
+        tx->tx_passive_rdma = 0;
+        tx->tx_status = 0;
+
+        spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+
+        if (tx->tx_isnblk) {
+                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
+        } else {
+                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
+                wake_up (&kibnal_data.kib_idle_tx_waitq);
+        }
+
+        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+}
+
+/* Take a tx descriptor from the idle pool.  If 'may_block', sleeps until a
+ * normal descriptor is free (or shutdown); otherwise may dip into the
+ * reserved non-blocking pool and can return NULL when that is exhausted.
+ * Also stamps the tx with a fresh passive-RDMA cookie. */
+static kib_tx_t *
+kibnal_get_idle_tx (int may_block) 
+{
+        unsigned long  flags;
+        kib_tx_t      *tx = NULL;
+        ENTRY;
+        
+        for (;;) {
+                spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+
+                /* "normal" descriptor is free */
+                if (!list_empty (&kibnal_data.kib_idle_txs)) {
+                        tx = list_entry (kibnal_data.kib_idle_txs.next,
+                                         kib_tx_t, tx_list);
+                        break;
+                }
+
+                if (!may_block) {
+                        /* may dip into reserve pool */
+                        if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
+                                CERROR ("reserved tx desc pool exhausted\n");
+                                break;
+                        }
+
+                        tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
+                                         kib_tx_t, tx_list);
+                        break;
+                }
+
+                /* block for idle tx */
+                spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+
+                wait_event (kibnal_data.kib_idle_tx_waitq,
+                            !list_empty (&kibnal_data.kib_idle_txs) ||
+                            kibnal_data.kib_shutdown);
+        }
+
+        if (tx != NULL) {
+                list_del (&tx->tx_list);
+
+                /* Allocate a new passive RDMA completion cookie.  It might
+                 * not be needed, but we've got a lock right now and we're
+                 * unlikely to wrap... */
+                tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
+
+                /* descriptor must have been fully reset by kibnal_tx_done() */
+                LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+                LASSERT (tx->tx_nsp == 0);
+                LASSERT (tx->tx_sending == 0);
+                LASSERT (tx->tx_status == 0);
+                LASSERT (tx->tx_conn == NULL);
+                LASSERT (!tx->tx_passive_rdma);
+                LASSERT (!tx->tx_passive_rdma_wait);
+                LASSERT (tx->tx_libmsg[0] == NULL);
+                LASSERT (tx->tx_libmsg[1] == NULL);
+        }
+
+        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+        
+        RETURN(tx);
+}
+
+/* Portals "distance" callback: 0 for the local NID, 1 for everything else */
+static int
+kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* I would guess that if kibnal_get_peer (nid) == NULL,
+           and we're not routing, then 'nid' is very distant :) */
+        if ( nal->libnal_ni.ni_pid.nid == nid ) {
+                *dist = 0;
+        } else {
+                *dist = 1;
+        }
+
+        return 0;
+}
+
+/* A peer's DONE message told us it finished RDMA-ing to/from one of our
+ * buffers: find the tx on conn's active list waiting for 'cookie', record
+ * 'status', and finalise it if no send completions are still outstanding.
+ * Logs an error if no waiting tx matches (late/duplicate completion). */
+static void
+kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
+{
+        struct list_head *ttmp;
+        unsigned long     flags;
+        int               idle;
+
+        spin_lock_irqsave (&conn->ibc_lock, flags);
+
+        list_for_each (ttmp, &conn->ibc_active_txs) {
+                kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);
+
+                LASSERT (tx->tx_passive_rdma ||
+                         !tx->tx_passive_rdma_wait);
+
+                LASSERT (tx->tx_passive_rdma_wait ||
+                         tx->tx_sending != 0);
+
+                if (!tx->tx_passive_rdma_wait ||
+                    tx->tx_passive_rdma_cookie != cookie)
+                        continue;
+
+                CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);
+
+                tx->tx_status = status;
+                tx->tx_passive_rdma_wait = 0;
+                idle = (tx->tx_sending == 0);
+
+                if (idle)
+                        list_del (&tx->tx_list);
+
+                spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+                /* I could be racing with tx callbacks.  It's whoever
+                 * _makes_ tx idle that frees it */
+                if (idle)
+                        kibnal_tx_done (tx);
+                return;
+        }
+                
+        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+        CERROR ("Unmatched (late?) RDMA completion "LPX64" from "LPX64"\n",
+                cookie, conn->ibc_peer->ibp_nid);
+}
+
+/* Local key for a receive buffer: the global descriptor's lkey when whole
+ * memory is registered, else the lkey of the page set itself */
+static __u32
+kibnal_lkey(kib_pages_t *ibp)
+{
+        if (kibnal_whole_mem())
+                return kibnal_data.kib_md.md_lkey;
+
+        return ibp->ibp_lkey;
+}
+
+/* (Re)post a receive buffer on its connection's QP.  If 'do_credits', also
+ * advertise a returned flow-control credit and kick the send path.  On
+ * failure the conn is closed (if still established) and rx's conn ref is
+ * dropped. */
+static void
+kibnal_post_rx (kib_rx_t *rx, int do_credits)
+{
+        kib_conn_t   *conn = rx->rx_conn;
+        int           rc = 0;
+        unsigned long flags;
+        FSTATUS       frc;
+        ENTRY;
+
+        rx->rx_gl = (IB_LOCAL_DATASEGMENT) {
+                .Address = rx->rx_vaddr,
+                .Length  = IBNAL_MSG_SIZE,
+                .Lkey    = kibnal_lkey(conn->ibc_rx_pages),
+        };
+
+        /* low bit of WorkReqId set: completion belongs to a receive */
+        rx->rx_wrq = (IB_WORK_REQ) {
+                .Operation              = WROpRecv,
+                .DSListDepth            = 1,
+                .MessageLen             = IBNAL_MSG_SIZE,
+                .WorkReqId              = kibnal_ptr2wreqid(rx, 1),
+                .DSList                 = &rx->rx_gl,
+        };
+
+        KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED,
+                                    IBNAL_CONN_DREP);
+        LASSERT (!rx->rx_posted);
+        rx->rx_posted = 1;
+        mb();
+
+        if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
+                rc = -ECONNABORTED;
+        else {
+                frc = iibt_postrecv(conn->ibc_qp, &rx->rx_wrq);
+                if (frc != FSUCCESS) {
+                        CDEBUG(D_NET, "post failed %d\n", frc);
+                        rc = -EINVAL;
+                }
+                /* NOTE(review): logged even when the post just failed */
+                CDEBUG(D_NET, "posted rx %p\n", &rx->rx_wrq);
+        }
+
+        if (rc == 0) {
+                if (do_credits) {
+                        spin_lock_irqsave(&conn->ibc_lock, flags);
+                        conn->ibc_outstanding_credits++;
+                        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+                        kibnal_check_sends(conn);
+                }
+                EXIT;
+                return;
+        }
+
+        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+                CERROR ("Error posting receive -> "LPX64": %d\n",
+                        conn->ibc_peer->ibp_nid, rc);
+                kibnal_close_conn (rx->rx_conn, rc);
+        } else {
+                CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n",
+                        conn->ibc_peer->ibp_nid, rc);
+        }
+
+        /* Drop rx's ref */
+        kibnal_put_conn (conn);
+        EXIT;
+}
+
+#if IBNAL_CKSUM
+/* Simple rotate-and-add checksum over 'nob' bytes (debug aid, not CRC) */
+static inline __u32 kibnal_cksum (void *ptr, int nob)
+{
+        char  *c  = ptr;
+        __u32  sum = 0;
+
+        while (nob-- > 0)
+                sum = ((sum << 1) | (sum >> 31)) + *c++;
+        
+        return (sum);
+}
+#endif
+
+/* Debug hex dump of a buffer.
+ * NOTE(review): the unconditional 'return' below disables this function;
+ * everything after it is dead code, left in so it can be re-enabled by
+ * deleting that one line. */
+static void hexdump(char *string, void *ptr, int len)
+{
+        unsigned char *c = ptr;
+        int i;
+
+        return;
+
+        if (len < 0 || len > 2048)  {
+                printk("XXX what the hell? %d\n",len);
+                return;
+        }
+
+        printk("%d bytes of '%s' from 0x%p\n", len, string, ptr);
+
+        /* 2 hex chars per byte, space every 2 bytes, newline every 16 */
+        for (i = 0; i < len;) {
+                printk("%02x",*(c++));
+                i++;
+                if (!(i & 15)) {
+                        printk("\n");
+                } else if (!(i&1)) {
+                        printk(" ");
+                }
+        }
+
+        if(len & 15) {
+                printk("\n");
+        }
+}
+
+/* CQ completion handler for receives.  Validates the incoming kib_msg_t
+ * (magic, version, length, optional checksum), applies any returned flow-
+ * control credits, handles NOOP and PUT/GET_DONE messages inline, and hands
+ * IMMEDIATE/RDMA messages to the scheduler thread for kibnal_rx().  Any
+ * validation failure closes the connection. */
+static void
+kibnal_rx_callback (IB_WORK_COMPLETION *wc)
+{
+        kib_rx_t     *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->WorkReqId);
+        kib_msg_t    *msg = rx->rx_msg;
+        kib_conn_t   *conn = rx->rx_conn;
+        int           nob = wc->Length;
+        const int     base_nob = offsetof(kib_msg_t, ibm_u);
+        int           credits;
+        int           flipped;
+        unsigned long flags;
+        __u32         i;
+#if IBNAL_CKSUM
+        __u32         msg_cksum;
+        __u32         computed_cksum;
+#endif
+
+        /* we set the QP to erroring after we've finished disconnecting, 
+         * maybe we should do so sooner. */
+        KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, 
+                                    IBNAL_CONN_DISCONNECTED);
+
+        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+        LASSERT (rx->rx_posted);
+        rx->rx_posted = 0;
+        mb();
+
+        /* receives complete with error in any case after we've started
+         * disconnecting */
+        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
+                goto failed;
+
+        if (wc->Status != WRStatusSuccess) {
+                CERROR("Rx from "LPX64" failed: %d\n", 
+                       conn->ibc_peer->ibp_nid, wc->Status);
+                goto failed;
+        }
+
+        if (nob < base_nob) {
+                CERROR ("Short rx from "LPX64": %d < expected %d\n",
+                        conn->ibc_peer->ibp_nid, nob, base_nob);
+                goto failed;
+        }
+
+        hexdump("rx", rx->rx_msg, sizeof(kib_msg_t));
+
+        /* Receiver does any byte flipping if necessary... */
+
+        if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
+                flipped = 0;
+        } else {
+                if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
+                        CERROR ("Unrecognised magic: %08x from "LPX64"\n", 
+                                msg->ibm_magic, conn->ibc_peer->ibp_nid);
+                        goto failed;
+                }
+                flipped = 1;
+                __swab16s (&msg->ibm_version);
+                LASSERT (sizeof(msg->ibm_type) == 1);
+                LASSERT (sizeof(msg->ibm_credits) == 1);
+        }
+
+        if (msg->ibm_version != IBNAL_MSG_VERSION) {
+                CERROR ("Incompatible msg version %d (%d expected)\n",
+                        msg->ibm_version, IBNAL_MSG_VERSION);
+                goto failed;
+        }
+
+#if IBNAL_CKSUM
+        if (nob != msg->ibm_nob) {
+                CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob);
+                goto failed;
+        }
+
+        /* checksum was computed with ibm_cksum zeroed; recompute likewise */
+        msg_cksum = le32_to_cpu(msg->ibm_cksum);
+        msg->ibm_cksum = 0;
+        computed_cksum = kibnal_cksum (msg, nob);
+        
+        if (msg_cksum != computed_cksum) {
+                CERROR ("Checksum failure %d: (%d expected)\n",
+                        computed_cksum, msg_cksum);
+//                goto failed;
+        }
+        CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob);
+#endif
+
+        /* Have I received credits that will let me send? */
+        credits = msg->ibm_credits;
+        if (credits != 0) {
+                spin_lock_irqsave(&conn->ibc_lock, flags);
+                conn->ibc_credits += credits;
+                spin_unlock_irqrestore(&conn->ibc_lock, flags);
+                
+                kibnal_check_sends(conn);
+        }
+
+        switch (msg->ibm_type) {
+        case IBNAL_MSG_NOOP:
+                /* credit-return only: just repost the buffer */
+                kibnal_post_rx (rx, 1);
+                return;
+
+        case IBNAL_MSG_IMMEDIATE:
+                if (nob < base_nob + sizeof (kib_immediate_msg_t)) {
+                        CERROR ("Short IMMEDIATE from "LPX64": %d\n",
+                                conn->ibc_peer->ibp_nid, nob);
+                        goto failed;
+                }
+                break;
+                
+        case IBNAL_MSG_PUT_RDMA:
+        case IBNAL_MSG_GET_RDMA:
+                if (nob < base_nob + sizeof (kib_rdma_msg_t)) {
+                        CERROR ("Short RDMA msg from "LPX64": %d\n",
+                                conn->ibc_peer->ibp_nid, nob);
+                        goto failed;
+                }
+                /* NOTE(review): __swab32() returns the swapped value without
+                 * modifying its argument -- compare the in-place __swab32s()
+                 * used for ibcm_status below.  As written, neither
+                 * ibrm_num_descs here nor rd_key/rd_nob/rd_addr in the loop
+                 * is actually byte-swapped for flipped peers; these look
+                 * like they should be __swab32s(&...)/__swab64s(&...). */
+                if (flipped) 
+                        __swab32(msg->ibm_u.rdma.ibrm_num_descs);
+
+                CDEBUG(D_NET, "%d RDMA: cookie "LPX64":\n",
+                       msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie);
+
+                /* bound num_descs before trusting it to size the loop below */
+                if ((msg->ibm_u.rdma.ibrm_num_descs > PTL_MD_MAX_IOV) ||
+                    (kib_rdma_msg_len(msg->ibm_u.rdma.ibrm_num_descs) > 
+                     min(nob, IBNAL_MSG_SIZE))) {
+                        CERROR ("num_descs %d too large\n", 
+                                msg->ibm_u.rdma.ibrm_num_descs);
+                        goto failed;
+                }
+
+                for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) {
+                        kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i];
+
+                        if (flipped) {
+                                __swab32(desc->rd_key);
+                                __swab32(desc->rd_nob);
+                                __swab64(desc->rd_addr);
+                        }
+
+                        CDEBUG(D_NET, "  key %x, " "addr "LPX64", nob %u\n",
+                               desc->rd_key, desc->rd_addr, desc->rd_nob);
+                }
+                break;
+                        
+        case IBNAL_MSG_PUT_DONE:
+        case IBNAL_MSG_GET_DONE:
+                if (nob < base_nob + sizeof (kib_completion_msg_t)) {
+                        CERROR ("Short COMPLETION msg from "LPX64": %d\n",
+                                conn->ibc_peer->ibp_nid, nob);
+                        goto failed;
+                }
+                if (flipped)
+                        __swab32s(&msg->ibm_u.completion.ibcm_status);
+                
+                CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
+                       msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
+                       msg->ibm_u.completion.ibcm_status);
+
+                kibnal_complete_passive_rdma (conn, 
+                                              msg->ibm_u.completion.ibcm_cookie,
+                                              msg->ibm_u.completion.ibcm_status);
+                kibnal_post_rx (rx, 1);
+                return;
+                        
+        default:
+                CERROR ("Can't parse type from "LPX64": %d\n",
+                        conn->ibc_peer->ibp_nid, msg->ibm_type);
+                goto failed;
+        }
+
+        /* schedule for kibnal_rx() in thread context */
+        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+        
+        list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
+        wake_up (&kibnal_data.kib_sched_waitq);
+        
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+        return;
+        
+ failed:
+        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+        kibnal_close_conn(conn, -ECONNABORTED);
+
+        /* Don't re-post rx & drop its ref on conn */
+        kibnal_put_conn(conn);
+}
+
+/* Thread-context second-stage receive handler (scheduled from the CQ
+ * callback).  Hands the embedded portals header to lib_parse() and, for
+ * RDMA request types that lib_parse() did not complete, sends a failed
+ * completion so the peer's operation doesn't block for the full timeout.
+ * Always re-posts the receive buffer before returning. */
+void
+kibnal_rx (kib_rx_t *rx)
+{
+        kib_msg_t   *msg = rx->rx_msg;
+
+        /* Clear flag so I can detect if I've sent an RDMA completion */
+        rx->rx_rdma = 0;
+
+        switch (msg->ibm_type) {
+        case IBNAL_MSG_GET_RDMA:
+                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
+                /* If the incoming get was matched, I'll have initiated the
+                 * RDMA and the completion message... */
+                if (rx->rx_rdma)
+                        break;
+
+                /* Otherwise, I'll send a failed completion now to prevent
+                 * the peer's GET blocking for the full timeout. */
+                CERROR ("Completing unmatched RDMA GET from "LPX64"\n",
+                        rx->rx_conn->ibc_peer->ibp_nid);
+                kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO,
+                                          rx, NULL, 0, NULL, NULL, 0, 0);
+                break;
+                
+        case IBNAL_MSG_PUT_RDMA:
+                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
+                if (rx->rx_rdma)
+                        break;
+                /* This is most unusual, since even if lib_parse() didn't
+                 * match anything, it should have asked us to read (and
+                 * discard) the payload.  The portals header must be
+                 * inconsistent with this message type, so it's the
+                 * sender's fault for sending garbage and she can time
+                 * herself out... */
+                CERROR ("Uncompleted RDMA PUT from "LPX64"\n",
+                        rx->rx_conn->ibc_peer->ibp_nid);
+                break;
+
+        case IBNAL_MSG_IMMEDIATE:
+                /* Payload is inline; no RDMA completion can have started */
+                lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
+                LASSERT (!rx->rx_rdma);
+                break;
+                
+        default:
+                /* message type was validated by the receive callback */
+                LBUG();
+                break;
+        }
+
+        kibnal_post_rx (rx, 1);
+}
+
+/* Translate an arbitrary kernel virtual address into its struct page.
+ * Recognises vmalloc addresses and (when CONFIG_HIGHMEM is configured)
+ * kmap addresses; anything else is assumed to be in the direct mapping.
+ * Returns NULL if the resulting page is not valid. */
+static struct page *
+kibnal_kvaddr_to_page (unsigned long vaddr)
+{
+        struct page *page;
+
+        if (vaddr >= VMALLOC_START &&
+            vaddr < VMALLOC_END)
+                page = vmalloc_to_page ((void *)vaddr);
+#ifdef CONFIG_HIGHMEM
+        /* CONFIG_* macros are defined-or-absent: test with #ifdef, not
+         * #if, which warns under -Wundef when HIGHMEM is not configured */
+        else if (vaddr >= PKMAP_BASE &&
+                 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
+                page = vmalloc_to_page ((void *)vaddr);
+        /* in 2.4 ^ just walks the page tables */
+#endif
+        else
+                page = virt_to_page (vaddr);
+
+        if (!VALID_PAGE (page))
+                page = NULL;
+
+        return page;
+}
+
+/* Append one RDMA fragment descriptor covering 'len' bytes of 'page' at
+ * 'page_offset' to the rdma message being built in tx.  'active' selects
+ * the local key (we will initiate the RDMA) vs the remote key (the peer
+ * will).  Asserts there is room for another descriptor. */
+static void
+kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset,
+                 unsigned long len, int active)
+{
+        kib_rdma_msg_t *ibrm = &tx->tx_msg->ibm_u.rdma;
+        kib_rdma_desc_t *desc;
+
+        LASSERTF(ibrm->ibrm_num_descs < PTL_MD_MAX_IOV, "%u\n", 
+                 ibrm->ibrm_num_descs);
+
+        desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs];
+        if (active)
+                desc->rd_key = kibnal_data.kib_md.md_lkey;
+        else
+                desc->rd_key = kibnal_data.kib_md.md_rkey;
+        desc->rd_nob = len; /*PAGE_SIZE - kiov->kiov_offset; */
+        /* NOTE(review): rd_addr adds kib_md.md_addr to the physical page
+         * address -- presumably the whole-memory registration's base
+         * offset; confirm against the registration setup. */
+        desc->rd_addr = kibnal_page2phys(page) + page_offset +
+                        kibnal_data.kib_md.md_addr;
+
+        ibrm->ibrm_num_descs++;
+}
+
+/* Build RDMA descriptors covering the contiguous kernel-virtual range
+ * [vaddr, vaddr + nob), one descriptor per underlying physical page.
+ * Returns 0 on success or -EFAULT if any address has no valid page. */
+static int
+kibnal_map_rdma_iov(kib_tx_t *tx, unsigned long vaddr, int nob, int active)
+{
+        struct page *page;
+        int page_offset, len;
+
+        while (nob > 0) {
+                page = kibnal_kvaddr_to_page(vaddr);
+                if (page == NULL)
+                        return -EFAULT;
+
+                /* clamp each descriptor at its page boundary */
+                page_offset = vaddr & (PAGE_SIZE - 1);
+                len = min(nob, (int)PAGE_SIZE - page_offset);
+                
+                kibnal_fill_ibrm(tx, page, page_offset, len, active);
+                nob -= len;
+                vaddr += len;
+        }
+        return 0;
+}
+
+/* Map the iov range selected by [offset, offset+nob) for RDMA.  Only a
+ * single contiguous vaddr fragment can be mapped (-EMSGSIZE otherwise).
+ * In whole-memory mode, per-page RDMA descriptors are built from the
+ * pre-registered region; otherwise the range is registered with the HCA
+ * and the handle/keys are stored in tx->tx_md.  Returns 0 or -ve errno. */
+static int
+kibnal_map_iov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
+                 int niov, struct iovec *iov, int offset, int nob, int active)
+                 
+{
+        void   *vaddr;
+        FSTATUS frc;
+
+        LASSERT (nob > 0);
+        LASSERT (niov > 0);
+        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+
+        /* skip wholly-consumed leading fragments */
+        while (offset >= iov->iov_len) {
+                offset -= iov->iov_len;
+                niov--;
+                iov++;
+                LASSERT (niov > 0);
+        }
+
+        if (nob > iov->iov_len - offset) {
+                CERROR ("Can't map multiple vaddr fragments\n");
+                return (-EMSGSIZE);
+        }
+
+        /* our large contiguous iov could be backed by multiple physical
+         * pages. */
+        if (kibnal_whole_mem()) {
+                int rc;
+                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
+                rc = kibnal_map_rdma_iov(tx, (unsigned long)iov->iov_base + 
+                                         offset, nob, active);
+                if (rc != 0) {
+                        CERROR ("Can't map iov: %d\n", rc);
+                        return rc;
+                }
+                return 0;
+        }
+
+        vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
+        tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
+
+        frc = iibt_register_memory(kibnal_data.kib_hca, vaddr, nob,
+                                   kibnal_data.kib_pd, access,
+                                   &tx->tx_md.md_handle, &tx->tx_md.md_lkey,
+                                   &tx->tx_md.md_rkey);
+        if (frc != 0) {
+                CERROR ("Can't map vaddr %p: %d\n", vaddr, frc);
+                return -EINVAL;
+        }
+
+        /* mark mapped so the registration is released on tx completion */
+        tx->tx_mapped = KIB_TX_MAPPED;
+        return (0);
+}
+
+static int
+kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
+                  int nkiov, ptl_kiov_t *kiov,
+                  int offset, int nob, int active)
+{
+        __u64                      *phys = NULL;
+        int                         page_offset;
+        int                         nphys;
+        int                         resid;
+        int                         phys_size = 0;
+        FSTATUS                     frc;
+        int                         i, rc = 0;
+
+        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+
+        LASSERT (nob > 0);
+        LASSERT (nkiov > 0);
+        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+
+        while (offset >= kiov->kiov_len) {
+                offset -= kiov->kiov_len;
+                nkiov--;
+                kiov++;
+                LASSERT (nkiov > 0);
+        }
+
+        page_offset = kiov->kiov_offset + offset;
+        nphys = 1;
+
+        if (!kibnal_whole_mem()) {
+                phys_size = nkiov * sizeof (*phys);
+                PORTAL_ALLOC(phys, phys_size);
+                if (phys == NULL) {
+                        CERROR ("Can't allocate tmp phys\n");
+                        return (-ENOMEM);
+                }
+
+                phys[0] = kibnal_page2phys(kiov->kiov_page);
+        } else {
+                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
+                kibnal_fill_ibrm(tx, kiov->kiov_page, kiov->kiov_offset, 
+                                 kiov->kiov_len, active);
+        }
+
+        resid = nob - (kiov->kiov_len - offset);
+
+        while (resid > 0) {
+                kiov++;
+                nkiov--;
+                LASSERT (nkiov > 0);
+
+                if (kiov->kiov_offset != 0 ||
+                    ((resid > PAGE_SIZE) && 
+                     kiov->kiov_len < PAGE_SIZE)) {
+                        /* Can't have gaps */
+                        CERROR ("Can't make payload contiguous in I/O VM:"
+                                "page %d, offset %d, len %d \n", nphys, 
+                                kiov->kiov_offset, kiov->kiov_len);
+
+                        for (i = -nphys; i < nkiov; i++) 
+                        {
+                                CERROR("kiov[%d] %p +%d for %d\n",
+                                       i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len);
+                        }
+                        
+                        rc = -EINVAL;
+                        goto out;
+                }
+
+                if (nphys == PTL_MD_MAX_IOV) {
+                        CERROR ("payload too big (%d)\n", nphys);
+                        rc = -EMSGSIZE;
+                        goto out;
+                }
+
+                if (!kibnal_whole_mem()) {
+                        LASSERT (nphys * sizeof (*phys) < phys_size);
+                        phys[nphys] = kibnal_page2phys(kiov->kiov_page);
+                } else {
+                        if (kib_rdma_msg_len(nphys) > IBNAL_MSG_SIZE) {
+                                CERROR ("payload too big (%d)\n", nphys);
+                                rc = -EMSGSIZE;
+                                goto out;
+                        }
+                        kibnal_fill_ibrm(tx, kiov->kiov_page, 
+                                         kiov->kiov_offset, kiov->kiov_len,
+                                         active);
+                }
+
+                nphys ++;
+                resid -= PAGE_SIZE;
+        }
+
+        if (kibnal_whole_mem())
+                goto out;
+
+#if 0
+        CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset);
+        for (i = 0; i < nphys; i++)
+                CWARN ("   [%d] "LPX64"\n", i, phys[i]);
+#endif
+
+#if IBNAL_FMR
+#error "iibnal hasn't learned about FMR yet"
+        rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
+                                       phys, nphys,
+                                       &tx->tx_md.md_addr,
+                                       page_offset,
+                                       &tx->tx_md.md_handle.fmr,
+                                       &tx->tx_md.md_lkey,
+                                       &tx->tx_md.md_rkey);
+#else
+        frc = iibt_register_physical_memory(kibnal_data.kib_hca,
+                                            IBNAL_RDMA_BASE,
+                                            phys, nphys,
+                                            0,          /* offset */
+                                            kibnal_data.kib_pd,
+                                            access,
+                                            &tx->tx_md.md_handle,
+                                            &tx->tx_md.md_addr,
+                                            &tx->tx_md.md_lkey,
+                                            &tx->tx_md.md_rkey);
+#endif
+        if (frc == FSUCCESS) {
+                CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n",
+                       nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey);
+#if IBNAL_FMR
+                tx->tx_mapped = KIB_TX_MAPPED_FMR;
+#else
+                tx->tx_mapped = KIB_TX_MAPPED;
+#endif
+        } else {
+                CERROR ("Can't map phys: %d\n", rc);
+                rc = -EFAULT;
+        }
+
+ out:
+        if (phys != NULL)
+                PORTAL_FREE(phys, phys_size);
+        return (rc);
+}
+
+/* Return an established connection to 'peer', or NULL if none exists.
+ * Simply takes the first on the peer's list.  Caller must hold
+ * kib_global_lock (see callers in kibnal_launch_tx). */
+static kib_conn_t *
+kibnal_find_conn_locked (kib_peer_t *peer)
+{
+        struct list_head *tmp;
+
+        /* just return the first connection */
+        list_for_each (tmp, &peer->ibp_conns) {
+                return (list_entry(tmp, kib_conn_t, ibc_list));
+        }
+
+        return (NULL);
+}
+
+/* Push as much of conn's send queue out as flow control permits.  Each
+ * message carries the credits we are returning to the peer
+ * (ibc_outstanding_credits) and spends one of ours (ibc_credits); the
+ * last credit is reserved for returning credits so neither side can be
+ * starved.  When the queue is empty but returned credits have piled up
+ * past the high-water mark, a NOOP is sent purely to hand them back.
+ * Called whenever sends may have become possible; must NOT be called
+ * with ibc_lock held. */
+void
+kibnal_check_sends (kib_conn_t *conn)
+{
+        unsigned long   flags;
+        kib_tx_t       *tx;
+        int             rc;
+        int             i;
+        int             done;
+        int             nwork;
+        ENTRY;
+
+        spin_lock_irqsave (&conn->ibc_lock, flags);
+
+        LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
+
+        /* Nothing queued but many credits owed: queue a NOOP to return
+         * them.  The lock is dropped around the (possibly blocking-free)
+         * tx allocation, then retaken to queue it. */
+        if (list_empty(&conn->ibc_tx_queue) &&
+            conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
+                spin_unlock_irqrestore(&conn->ibc_lock, flags);
+                
+                tx = kibnal_get_idle_tx(0);     /* don't block */
+                if (tx != NULL)
+                        kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
+
+                spin_lock_irqsave(&conn->ibc_lock, flags);
+                
+                if (tx != NULL) {
+                        atomic_inc(&conn->ibc_refcount);
+                        kibnal_queue_tx_locked(tx, conn);
+                }
+        }
+
+        while (!list_empty (&conn->ibc_tx_queue)) {
+                tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
+
+                /* We rely on this for QP sizing */
+                LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= IBNAL_TX_MAX_SG);
+
+                LASSERT (conn->ibc_outstanding_credits >= 0);
+                LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
+                LASSERT (conn->ibc_credits >= 0);
+                LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
+
+                /* Not on ibc_rdma_queue */
+                LASSERT (!tx->tx_passive_rdma_wait);
+
+                if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
+                        GOTO(out, 0);
+
+                if (conn->ibc_credits == 0)     /* no credits */
+                        GOTO(out, 1);
+                
+                if (conn->ibc_credits == 1 &&   /* last credit reserved for */
+                    conn->ibc_outstanding_credits == 0) /* giving back credits */
+                        GOTO(out, 2);
+
+                list_del (&tx->tx_list);
+
+                /* A NOOP is pointless if real traffic is queued or there
+                 * aren't enough credits owed to justify returning them */
+                if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
+                    (!list_empty(&conn->ibc_tx_queue) ||
+                     conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
+                        /* redundant NOOP */
+                        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+                        kibnal_tx_done(tx);
+                        spin_lock_irqsave(&conn->ibc_lock, flags);
+                        continue;
+                }
+
+                tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits;
+                conn->ibc_outstanding_credits = 0;
+
+                conn->ibc_nsends_posted++;
+                conn->ibc_credits--;
+
+                /* we only get a tx completion for the final rdma op */ 
+                tx->tx_sending = min(tx->tx_nsp, 2);
+                tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
+                list_add (&tx->tx_list, &conn->ibc_active_txs);
+#if IBNAL_CKSUM
+                tx->tx_msg->ibm_cksum = 0;
+                tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob);
+                CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob);
+#endif
+                spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+                /* NB the gap between removing tx from the queue and sending it
+                 * allows message re-ordering to occur */
+
+                LASSERT (tx->tx_nsp > 0);
+
+                rc = -ECONNABORTED;
+                nwork = 0;
+                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+                        tx->tx_status = 0;
+                        /* Driver only accepts 1 item at a time */
+                        for (i = 0; i < tx->tx_nsp; i++) {
+                                hexdump("tx", tx->tx_msg, sizeof(kib_msg_t));
+                                rc = iibt_postsend(conn->ibc_qp, 
+                                                   &tx->tx_wrq[i]);
+                                if (rc != 0)
+                                        break;
+                                if (wrq_signals_completion(&tx->tx_wrq[i]))
+                                        nwork++;
+                                CDEBUG(D_NET, "posted tx wrq %p\n", 
+                                       &tx->tx_wrq[i]);
+                        }
+                }
+
+                spin_lock_irqsave (&conn->ibc_lock, flags);
+                if (rc != 0) {
+                        /* NB credits are transferred in the actual
+                         * message, which can only be the last work item */
+                        conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
+                        conn->ibc_credits++;
+                        conn->ibc_nsends_posted--;
+
+                        tx->tx_status = rc;
+                        tx->tx_passive_rdma_wait = 0;
+                        /* discount the completions that will never arrive
+                         * for the work items that failed to post */
+                        tx->tx_sending -= tx->tx_nsp - nwork;
+
+                        done = (tx->tx_sending == 0);
+                        if (done)
+                                list_del (&tx->tx_list);
+                        
+                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        
+                        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
+                                CERROR ("Error %d posting transmit to "LPX64"\n", 
+                                        rc, conn->ibc_peer->ibp_nid);
+                        else
+                                CDEBUG (D_NET, "Error %d posting transmit to "
+                                        LPX64"\n", rc, conn->ibc_peer->ibp_nid);
+
+                        kibnal_close_conn (conn, rc);
+
+                        if (done)
+                                kibnal_tx_done (tx);
+                        return;
+                }
+                
+        }
+
+        EXIT;
+out:
+        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+}
+
+/* CQ completion handler for a transmit work request.  Decrements
+ * tx_sending; whichever path makes the tx idle (no sends in flight and
+ * no passive RDMA outstanding) frees it -- which also drops the tx's ref
+ * on the conn -- so an extra conn ref is taken here to keep the conn
+ * alive for the rest of this function. */
+static void
+kibnal_tx_callback (IB_WORK_COMPLETION *wc)
+{
+        kib_tx_t     *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->WorkReqId);
+        kib_conn_t   *conn;
+        unsigned long flags;
+        int           idle;
+
+        conn = tx->tx_conn;
+        LASSERT (conn != NULL);
+        LASSERT (tx->tx_sending != 0);
+
+        spin_lock_irqsave(&conn->ibc_lock, flags);
+
+        CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx,
+               tx->tx_sending, tx->tx_nsp, wc->Status);
+
+        /* I could be racing with rdma completion.  Whoever makes 'tx' idle
+         * gets to free it, which also drops its ref on 'conn'.  If it's
+         * not me, then I take an extra ref on conn so it can't disappear
+         * under me. */
+
+        tx->tx_sending--;
+        idle = (tx->tx_sending == 0) &&         /* This is the final callback */
+               (!tx->tx_passive_rdma_wait);     /* Not waiting for RDMA completion */
+        if (idle)
+                list_del(&tx->tx_list);
+
+        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+               atomic_read (&conn->ibc_refcount));
+        atomic_inc (&conn->ibc_refcount);
+
+        if (tx->tx_sending == 0)
+                conn->ibc_nsends_posted--;
+
+        /* record the first failure; don't overwrite an earlier error */
+        if (wc->Status != WRStatusSuccess &&
+            tx->tx_status == 0)
+                tx->tx_status = -ECONNABORTED;
+                
+        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+        if (idle)
+                kibnal_tx_done (tx);
+
+        if (wc->Status != WRStatusSuccess) {
+                CERROR ("Tx completion to "LPX64" failed: %d\n", 
+                        conn->ibc_peer->ibp_nid, wc->Status);
+                kibnal_close_conn (conn, -ENETDOWN);
+        } else {
+                /* can I shovel some more sends out the door? */
+                kibnal_check_sends(conn);
+        }
+
+        kibnal_put_conn (conn);
+}
+
+/* Asynchronous (non-CQ) HCA event handler: currently just logs. */
+void 
+kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev)
+{
+        /* XXX flesh out.  this seems largely for async errors */
+        CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode);
+}
+
+/* Completion-queue callback: drain the CQ, dispatching each completion
+ * to the rx or tx handler by its work-request id, then re-arm and drain
+ * once more -- the second pass closes the race where a completion lands
+ * between the final poll and the re-arm. */
+void
+kibnal_ca_callback (void *ca_arg, void *cq_arg)
+{
+        IB_HANDLE cq = *(IB_HANDLE *)cq_arg;
+        IB_HANDLE ca = *(IB_HANDLE *)ca_arg;
+        IB_WORK_COMPLETION wc;
+        int armed = 0;
+
+        CDEBUG(D_NET, "ca %p cq %p\n", ca, cq);
+
+        for(;;) {
+                while (iibt_cq_poll(cq, &wc) == FSUCCESS) {
+                        if (kibnal_wreqid_is_rx(wc.WorkReqId))
+                                kibnal_rx_callback(&wc);
+                        else
+                                kibnal_tx_callback(&wc);
+                }
+                /* second pass done: the CQ is armed and drained */
+                if (armed)
+                        return;
+                if (iibt_cq_rearm(cq, CQEventSelNextWC) != FSUCCESS) {
+                        CERROR("rearm failed?\n");
+                        return;
+                }
+                armed = 1;
+        }
+}
+
+/* Append a send work request for a message of 'type' carrying 'body_nob'
+ * payload bytes to tx, filling in the common message header (magic,
+ * version, type).  The request is fenced when bundled after an RDMA read
+ * so a PUT_DONE can't overtake its data transfer. */
+void
+kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
+{
+        IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nsp];
+        IB_WORK_REQ         *wrq = &tx->tx_wrq[tx->tx_nsp];
+        int                       fence;
+        int                       nob = offsetof (kib_msg_t, ibm_u) + body_nob;
+
+        LASSERT (tx->tx_nsp >= 0 && 
+                 tx->tx_nsp < sizeof(tx->tx_wrq)/sizeof(tx->tx_wrq[0]));
+        LASSERT (nob <= IBNAL_MSG_SIZE);
+        
+        tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC;
+        tx->tx_msg->ibm_version = IBNAL_MSG_VERSION;
+        tx->tx_msg->ibm_type = type;
+#if IBNAL_CKSUM
+        tx->tx_msg->ibm_nob = nob;
+#endif
+        /* Fence the message if it's bundled with an RDMA read */
+        fence = (tx->tx_nsp > 0) &&
+                (type == IBNAL_MSG_PUT_DONE);
+
+        /* NOTE(review): the data segment covers the whole message buffer
+         * while MessageLen (nob) limits what is sent -- confirm this is
+         * what the iibt driver expects. */
+        *gl = (IB_LOCAL_DATASEGMENT) {
+                .Address = tx->tx_vaddr,
+                .Length  = IBNAL_MSG_SIZE,
+                .Lkey    = kibnal_lkey(kibnal_data.kib_tx_pages),
+        };
+
+        wrq->WorkReqId      = kibnal_ptr2wreqid(tx, 0);
+        wrq->Operation      = WROpSend;
+        wrq->DSList         = gl;
+        wrq->DSListDepth    = 1;
+        wrq->MessageLen     = nob;
+        wrq->Req.SendRC.ImmediateData  = 0;
+        wrq->Req.SendRC.Options.s.SolicitedEvent         = 1;
+        wrq->Req.SendRC.Options.s.SignaledCompletion     = 1;
+        wrq->Req.SendRC.Options.s.ImmediateData          = 0;
+        wrq->Req.SendRC.Options.s.Fence                  = fence;
+
+        tx->tx_nsp++;
+}
+
+/* Queue tx on conn's send queue (taking ibc_lock) and then kick the
+ * sender to push it out if credits allow. */
+static void
+kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+{
+        unsigned long         flags;
+
+        spin_lock_irqsave(&conn->ibc_lock, flags);
+
+        kibnal_queue_tx_locked (tx, conn);
+        
+        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+        
+        kibnal_check_sends(conn);
+}
+
+/* Launch tx towards 'nid'.  Fast path: find the peer and an established
+ * connection under the read lock and queue on it.  Otherwise retake the
+ * global lock for writing, re-check (the state may have changed while
+ * unlocked), and if there is still no connection hand the peer to the
+ * connd to connect -- unless reconnecting is still rate-limited -- then
+ * park the tx on the peer's queue until a connection appears.
+ * This function commits to completing the tx: every failure path
+ * finalizes it with an error status. */
+static void
+kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
+{
+        unsigned long    flags;
+        kib_peer_t      *peer;
+        kib_conn_t      *conn;
+        rwlock_t        *g_lock = &kibnal_data.kib_global_lock;
+
+        /* If I get here, I've committed to send, so I complete the tx with
+         * failure on any problems */
+        
+        LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
+        LASSERT (tx->tx_nsp > 0);               /* work items have been set up */
+
+        read_lock (g_lock);
+        
+        peer = kibnal_find_peer_locked (nid);
+        if (peer == NULL) {
+                read_unlock (g_lock);
+                tx->tx_status = -EHOSTUNREACH;
+                kibnal_tx_done (tx);
+                return;
+        }
+
+        conn = kibnal_find_conn_locked (peer);
+        if (conn != NULL) {
+                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                       atomic_read (&conn->ibc_refcount));
+                atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
+                read_unlock (g_lock);
+                
+                kibnal_queue_tx (tx, conn);
+                return;
+        }
+        
+        /* Making one or more connections; I'll need a write lock... */
+        read_unlock (g_lock);
+        write_lock_irqsave (g_lock, flags);
+
+        /* the lock was dropped: everything must be re-looked-up */
+        peer = kibnal_find_peer_locked (nid);
+        if (peer == NULL) {
+                write_unlock_irqrestore (g_lock, flags);
+                tx->tx_status = -EHOSTUNREACH;
+                kibnal_tx_done (tx);
+                return;
+        }
+
+        conn = kibnal_find_conn_locked (peer);
+        if (conn != NULL) {
+                /* Connection exists; queue message on it */
+                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                       atomic_read (&conn->ibc_refcount));
+                atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
+                write_unlock_irqrestore (g_lock, flags);
+                
+                kibnal_queue_tx (tx, conn);
+                return;
+        }
+
+        if (peer->ibp_connecting == 0) {
+                /* refuse to reconnect before the backoff interval expires */
+                if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
+                        write_unlock_irqrestore (g_lock, flags);
+                        tx->tx_status = -EHOSTUNREACH;
+                        kibnal_tx_done (tx);
+                        return;
+                }
+        
+                peer->ibp_connecting = 1;
+                kib_peer_addref(peer); /* extra ref for connd */
+        
+                spin_lock (&kibnal_data.kib_connd_lock);
+        
+                list_add_tail (&peer->ibp_connd_list,
+                               &kibnal_data.kib_connd_peers);
+                wake_up (&kibnal_data.kib_connd_waitq);
+        
+                spin_unlock (&kibnal_data.kib_connd_lock);
+        }
+        
+        /* A connection is being established; queue the message... */
+        list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
+
+        write_unlock_irqrestore (g_lock, flags);
+}
+
+/* Initiate a passive RDMA: map the memory descriptor of 'libmsg', then
+ * send a PUT/GET request message carrying the RDMA descriptors so the
+ * peer can perform the actual data transfer.  The tx (and libmsg, and
+ * for GETs the reply message) is finalized when the peer's completion
+ * message arrives.  Returns PTL_OK or PTL_FAIL. */
+static ptl_err_t
+kibnal_start_passive_rdma (int type, ptl_nid_t nid,
+                            lib_msg_t *libmsg, ptl_hdr_t *hdr)
+{
+        int         nob = libmsg->md->length;
+        kib_tx_t   *tx;
+        kib_msg_t  *ibmsg;
+        int         rc;
+        IB_ACCESS_CONTROL         access = {0,};
+        
+        LASSERT (type == IBNAL_MSG_PUT_RDMA || type == IBNAL_MSG_GET_RDMA);
+        LASSERT (nob > 0);
+        LASSERT (!in_interrupt());              /* Mapping could block */
+
+        /* the peer initiates the transfer, so allow remote access */
+        access.s.MWBindable = 1;
+        access.s.LocalWrite = 1;
+        access.s.RdmaRead = 1;
+        access.s.RdmaWrite = 1;
+
+        tx = kibnal_get_idle_tx (1);           /* May block; caller is an app thread */
+        LASSERT (tx != NULL);
+
+        /* map iov or kiov form of the MD; active == 0 (peer initiates) */
+        if ((libmsg->md->options & PTL_MD_KIOV) == 0) 
+                rc = kibnal_map_iov (tx, access,
+                                     libmsg->md->md_niov,
+                                     libmsg->md->md_iov.iov,
+                                     0, nob, 0);
+        else
+                rc = kibnal_map_kiov (tx, access,
+                                      libmsg->md->md_niov, 
+                                      libmsg->md->md_iov.kiov,
+                                      0, nob, 0);
+
+        if (rc != 0) {
+                CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc);
+                goto failed;
+        }
+        
+        if (type == IBNAL_MSG_GET_RDMA) {
+                /* reply gets finalized when tx completes */
+                tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, 
+                                                        nid, libmsg);
+                if (tx->tx_libmsg[1] == NULL) {
+                        CERROR ("Can't create reply for GET -> "LPX64"\n",
+                                nid);
+                        rc = -ENOMEM;
+                        goto failed;
+                }
+        }
+        
+        /* tx completes only when the peer's DONE message arrives */
+        tx->tx_passive_rdma = 1;
+
+        ibmsg = tx->tx_msg;
+
+        ibmsg->ibm_u.rdma.ibrm_hdr = *hdr;
+        ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
+        /* map_kiov already filled the rdma descs for the whole_mem case */
+        if (!kibnal_whole_mem()) {
+                ibmsg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_rkey;
+                ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
+                ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
+                ibmsg->ibm_u.rdma.ibrm_num_descs = 1;
+        }
+
+        kibnal_init_tx_msg (tx, type, 
+                            kib_rdma_msg_len(ibmsg->ibm_u.rdma.ibrm_num_descs));
+
+        CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
+               LPX64", nob %d\n",
+               tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey,
+               tx->tx_md.md_addr, nob);
+        
+        /* libmsg gets finalized when tx completes. */
+        tx->tx_libmsg[0] = libmsg;
+
+        kibnal_launch_tx(tx, nid);
+        return (PTL_OK);
+
+ failed:
+        tx->tx_status = rc;
+        kibnal_tx_done (tx);
+        return (PTL_FAIL);
+}
+
+/* Start the "active" side of an RDMA that the peer requested with an
+ * incoming IBNAL_MSG_GET_RDMA or IBNAL_MSG_PUT_RDMA: post RDMA
+ * write/read work requests against the peer's descriptors, followed by
+ * a completion message.  Runs in scheduler (thread) context only.
+ *
+ *   type     IBNAL_MSG_GET_DONE (RDMA write) or IBNAL_MSG_PUT_DONE (RDMA read)
+ *   status   0, or an error forcing completion with no data (nob must be 0)
+ *   rx       the receive that requested the RDMA
+ *   libmsg   finalized when the tx completes (or immediately on no-data)
+ *   niov/iov/kiov/offset/nob  describe the local buffer (all-iov or all-kiov)
+ */
+void
+kibnal_start_active_rdma (int type, int status,
+                           kib_rx_t *rx, lib_msg_t *libmsg, 
+                           unsigned int niov,
+                           struct iovec *iov, ptl_kiov_t *kiov,
+                           size_t offset, size_t nob)
+{
+        kib_msg_t    *rxmsg = rx->rx_msg;
+        kib_msg_t    *txmsg;
+        kib_tx_t     *tx;
+        IB_ACCESS_CONTROL access = {0,};
+        IB_WR_OP      rdma_op;
+        int           rc;
+        __u32         i;
+
+        CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n",
+               type, status, niov, offset, nob);
+
+        /* Called by scheduler */
+        LASSERT (!in_interrupt ());
+
+        /* Either all pages or all vaddrs */
+        LASSERT (!(kiov != NULL && iov != NULL));
+
+        /* No data if we're completing with failure */
+        LASSERT (status == 0 || nob == 0);
+
+        LASSERT (type == IBNAL_MSG_GET_DONE ||
+                 type == IBNAL_MSG_PUT_DONE);
+
+        /* Flag I'm completing the RDMA.  Even if I fail to send the
+         * completion message, I will have tried my best so further
+         * attempts shouldn't be tried. */
+        LASSERT (!rx->rx_rdma);
+        rx->rx_rdma = 1;
+
+        if (type == IBNAL_MSG_GET_DONE) {
+                rdma_op  = WROpRdmaWrite;
+                LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);
+        } else {
+                /* RDMA read lands in my local buffer */
+                access.s.LocalWrite = 1;
+                rdma_op  = WROpRdmaRead;
+                LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
+        }
+
+        tx = kibnal_get_idle_tx (0);           /* Mustn't block */
+        if (tx == NULL) {
+                CERROR ("tx descs exhausted on RDMA from "LPX64
+                        " completing locally with failure\n",
+                        rx->rx_conn->ibc_peer->ibp_nid);
+                lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE);
+                return;
+        }
+        LASSERT (tx->tx_nsp == 0);
+
+        if (nob == 0) 
+                GOTO(init_tx, 0);
+
+        /* We actually need to transfer some data (the transfer
+         * size could get truncated to zero when the incoming
+         * message is matched) */
+        if (kiov != NULL)
+                rc = kibnal_map_kiov (tx, access, niov, kiov, offset, nob, 1);
+        else
+                rc = kibnal_map_iov (tx, access, niov, iov, offset, nob, 1);
+        
+        if (rc != 0) {
+                CERROR ("Can't map RDMA -> "LPX64": %d\n", 
+                        rx->rx_conn->ibc_peer->ibp_nid, rc);
+                /* We'll skip the RDMA and complete with failure. */
+                status = rc;
+                nob = 0;
+                GOTO(init_tx, rc);
+        } 
+
+        if (!kibnal_whole_mem()) {
+                tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_lkey;
+                tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
+                tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
+                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1;
+        }
+
+        /* XXX ugh.  different page-sized hosts. */ 
+        if (tx->tx_msg->ibm_u.rdma.ibrm_num_descs !=
+            rxmsg->ibm_u.rdma.ibrm_num_descs) {
+                CERROR("tx descs (%u) != rx descs (%u)\n", 
+                       tx->tx_msg->ibm_u.rdma.ibrm_num_descs,
+                       rxmsg->ibm_u.rdma.ibrm_num_descs);
+                /* We'll skip the RDMA and complete with failure.  NB
+                 * the map above succeeded, so rc == 0 here; setting
+                 * 'status = rc' (as the previous error path does)
+                 * would falsely complete with PTL_OK below.  Use a
+                 * real error instead. */
+                status = -EPROTO;
+                nob = 0;
+                GOTO(init_tx, status);
+        }
+
+        /* map_kiov filled in the rdma descs which describe our side of the
+         * rdma transfer. */
+        /* ibrm_num_descs was verified in rx_callback */
+        for(i = 0; i < rxmsg->ibm_u.rdma.ibrm_num_descs; i++) {
+                kib_rdma_desc_t *ldesc, *rdesc; /* local, remote */
+                IB_LOCAL_DATASEGMENT *ds = &tx->tx_gl[i];
+                IB_WORK_REQ  *wrq = &tx->tx_wrq[i];
+
+                ldesc = &tx->tx_msg->ibm_u.rdma.ibrm_desc[i];
+                rdesc = &rxmsg->ibm_u.rdma.ibrm_desc[i];
+
+                ds->Address = ldesc->rd_addr;
+                ds->Length  = ldesc->rd_nob;
+                ds->Lkey    = ldesc->rd_key;
+
+                memset(wrq, 0, sizeof(*wrq));
+                wrq->WorkReqId      = kibnal_ptr2wreqid(tx, 0);
+                wrq->Operation      = rdma_op;
+                wrq->DSList         = ds;
+                wrq->DSListDepth    = 1;
+                wrq->MessageLen     = ds->Length;
+                wrq->Req.SendRC.ImmediateData  = 0;
+                wrq->Req.SendRC.Options.s.SolicitedEvent         = 0;
+                wrq->Req.SendRC.Options.s.SignaledCompletion     = 0;
+                wrq->Req.SendRC.Options.s.ImmediateData          = 0;
+                wrq->Req.SendRC.Options.s.Fence                  = 0;
+                wrq->Req.SendRC.RemoteDS.Address = rdesc->rd_addr;
+                wrq->Req.SendRC.RemoteDS.Rkey = rdesc->rd_key;
+
+                /* only the last rdma post triggers tx completion */
+                if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1)
+                        wrq->Req.SendRC.Options.s.SignaledCompletion = 1;
+
+                tx->tx_nsp++;
+        }
+
+init_tx:
+        /* Complete the tx with the peer's cookie and our final status */
+        txmsg = tx->tx_msg;
+
+        txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
+        txmsg->ibm_u.completion.ibcm_status = status;
+        
+        kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
+
+        if (status == 0 && nob != 0) {
+                LASSERT (tx->tx_nsp > 1);
+                /* RDMA: libmsg gets finalized when the tx completes.  This
+                 * is after the completion message has been sent, which in
+                 * turn is after the RDMA has finished. */
+                tx->tx_libmsg[0] = libmsg;
+        } else {
+                LASSERT (tx->tx_nsp == 1);
+                /* No RDMA: local completion happens now! */
+                CDEBUG(D_WARNING,"No data: immediate completion\n");
+                lib_finalize (&kibnal_lib, NULL, libmsg,
+                              status == 0 ? PTL_OK : PTL_FAIL);
+        }
+
+        /* +1 ref for this tx... */
+        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+               rx->rx_conn, rx->rx_conn->ibc_state, 
+               rx->rx_conn->ibc_peer->ibp_nid,
+               atomic_read (&rx->rx_conn->ibc_refcount));
+        atomic_inc (&rx->rx_conn->ibc_refcount);
+        /* ...and queue it up */
+        kibnal_queue_tx(tx, rx->rx_conn);
+}
+
+/* Common send path for the send/send_pages entry points: decide from
+ * the message type and payload size whether the payload goes inline in
+ * an IBNAL_MSG_IMMEDIATE message or must negotiate an RDMA, then grab
+ * a tx descriptor, copy any inline payload, and launch the tx.
+ * Returns a ptl_err_t to the portals library. */
+static ptl_err_t
+kibnal_sendmsg(lib_nal_t    *nal, 
+                void         *private,
+                lib_msg_t    *libmsg,
+                ptl_hdr_t    *hdr, 
+                int           type, 
+                ptl_nid_t     nid, 
+                ptl_pid_t     pid,
+                unsigned int  payload_niov, 
+                struct iovec *payload_iov, 
+                ptl_kiov_t   *payload_kiov,
+                size_t        payload_offset,
+                size_t        payload_nob)
+{
+        kib_msg_t  *ibmsg;
+        kib_tx_t   *tx;
+        int         nob;
+
+        /* NB 'private' is different depending on what we're sending.... */
+
+        CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64
+               " pid %d\n", payload_nob, payload_niov, nid , pid);
+
+        LASSERT (payload_nob == 0 || payload_niov > 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+        /* Thread context if we're sending payload */
+        LASSERT (!in_interrupt() || payload_niov == 0);
+        /* payload is either all vaddrs or all pages */
+        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+
+        switch (type) {
+        default:
+                LBUG();
+                return (PTL_FAIL);
+                
+        case PTL_MSG_REPLY: {
+                /* reply's 'private' is the incoming receive */
+                kib_rx_t *rx = private;
+
+                /* RDMA reply expected? */
+                if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
+                        /* peer asked for an RDMA GET: push the reply
+                         * payload straight into its buffers */
+                        kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
+                                                 rx, libmsg, payload_niov, 
+                                                 payload_iov, payload_kiov,
+                                                 payload_offset, payload_nob);
+                        return (PTL_OK);
+                }
+                
+                /* Incoming message consistent with immediate reply? */
+                if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
+                        /* NOTE(review): "opbm" in this message looks
+                         * like a typo for "msg" */
+                        CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n",
+                                nid, rx->rx_msg->ibm_type);
+                        return (PTL_FAIL);
+                }
+
+                /* Will it fit in a message? */
+                /* NOTE(review): this check uses >= where the GET/PUT
+                 * cases below use >; confirm whether the asymmetry is
+                 * intentional */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+                if (nob >= IBNAL_MSG_SIZE) {
+                        CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n", 
+                               nid, payload_nob);
+                        return (PTL_FAIL);
+                }
+                break;
+        }
+
+        case PTL_MSG_GET:
+                /* might the REPLY message be big enough to need RDMA? */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
+                if (nob > IBNAL_MSG_SIZE)
+                        return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, 
+                                                          nid, libmsg, hdr));
+                break;
+
+        case PTL_MSG_ACK:
+                LASSERT (payload_nob == 0);
+                break;
+
+        case PTL_MSG_PUT:
+                /* Is the payload big enough to need RDMA? */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+                if (nob > IBNAL_MSG_SIZE)
+                        return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
+                                                          nid, libmsg, hdr));
+                
+                break;
+        }
+
+        /* may only block for a descriptor when not ACK/REPLY/interrupt */
+        tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
+                                  type == PTL_MSG_REPLY ||
+                                  in_interrupt()));
+        if (tx == NULL) {
+                CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", 
+                        type, nid, in_interrupt() ? " (intr)" : "");
+                return (PTL_NO_SPACE);
+        }
+
+        ibmsg = tx->tx_msg;
+        ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
+
+        /* copy the payload inline into the immediate message */
+        if (payload_nob > 0) {
+                if (payload_kiov != NULL)
+                        lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
+                                          payload_niov, payload_kiov,
+                                          payload_offset, payload_nob);
+                else
+                        lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
+                                         payload_niov, payload_iov,
+                                         payload_offset, payload_nob);
+        }
+
+        kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
+                            offsetof(kib_immediate_msg_t, 
+                                     ibim_payload[payload_nob]));
+
+        /* libmsg gets finalized when tx completes */
+        tx->tx_libmsg[0] = libmsg;
+
+        kibnal_launch_tx(tx, nid);
+        return (PTL_OK);
+}
+
+static ptl_err_t
+kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
+               ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+               unsigned int payload_niov, struct iovec *payload_iov,
+               size_t payload_offset, size_t payload_len)
+{
+        /* iovec flavour of send: forward to the common send path with
+         * no page (kiov) payload */
+        return kibnal_sendmsg(nal, private, cookie, hdr, type, nid, pid,
+                              payload_niov, payload_iov, NULL,
+                              payload_offset, payload_len);
+}
+
+static ptl_err_t
+kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
+                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                     unsigned int payload_niov, ptl_kiov_t *payload_kiov, 
+                     size_t payload_offset, size_t payload_len)
+{
+        /* page (kiov) flavour of send: forward to the common send path
+         * with no iovec payload */
+        return kibnal_sendmsg(nal, private, cookie, hdr, type, nid, pid,
+                              payload_niov, NULL, payload_kiov,
+                              payload_offset, payload_len);
+}
+
+/* Common receive path for the recv/recv_pages entry points: deliver
+ * the payload of the message held in 'private' (a kib_rx_t) into the
+ * caller's buffer.  Immediate messages are copied inline and finalized
+ * now; PUT_RDMA starts an active RDMA read of the peer's data;
+ * GET_RDMA carries no payload to deliver here. */
+static ptl_err_t
+kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
+                 unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
+                 size_t offset, size_t mlen, size_t rlen)
+{
+        kib_rx_t    *rx = private;
+        kib_msg_t   *rxmsg = rx->rx_msg;
+        int          msg_nob;
+        
+        LASSERT (mlen <= rlen);
+        LASSERT (!in_interrupt ());
+        /* Either all pages or all vaddrs */
+        LASSERT (!(kiov != NULL && iov != NULL));
+
+        switch (rxmsg->ibm_type) {
+        default:
+                LBUG();
+                return (PTL_FAIL);
+                
+        case IBNAL_MSG_IMMEDIATE:
+                /* sanity: the whole wire payload must have fit in a message */
+                msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+                if (msg_nob > IBNAL_MSG_SIZE) {
+                        CERROR ("Immediate message from "LPX64" too big: %d\n",
+                                rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
+                        return (PTL_FAIL);
+                }
+
+                /* copy only mlen (what the caller matched) of the payload */
+                if (kiov != NULL)
+                        lib_copy_buf2kiov(niov, kiov, offset,
+                                          rxmsg->ibm_u.immediate.ibim_payload,
+                                          mlen);
+                else
+                        lib_copy_buf2iov(niov, iov, offset,
+                                         rxmsg->ibm_u.immediate.ibim_payload,
+                                         mlen);
+
+                lib_finalize (nal, NULL, libmsg, PTL_OK);
+                return (PTL_OK);
+
+        case IBNAL_MSG_GET_RDMA:
+                /* We get called here just to discard any junk after the
+                 * GET hdr. */
+                LASSERT (libmsg == NULL);
+                lib_finalize (nal, NULL, libmsg, PTL_OK);
+                return (PTL_OK);
+
+        case IBNAL_MSG_PUT_RDMA:
+                /* RDMA-read the peer's payload into our buffer; libmsg
+                 * is finalized when the transfer completes */
+                kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0,
+                                          rx, libmsg, 
+                                          niov, iov, kiov, offset, mlen);
+                return (PTL_OK);
+        }
+}
+
+static ptl_err_t
+kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
+              unsigned int niov, struct iovec *iov, 
+              size_t offset, size_t mlen, size_t rlen)
+{
+        /* iovec flavour of receive: no page vector */
+        return kibnal_recvmsg(nal, private, msg, niov, iov, NULL,
+                              offset, mlen, rlen);
+}
+
+static ptl_err_t
+kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
+                     unsigned int niov, ptl_kiov_t *kiov, 
+                     size_t offset, size_t mlen, size_t rlen)
+{
+        /* page (kiov) flavour of receive: no iovec */
+        return kibnal_recvmsg(nal, private, msg, niov, NULL, kiov,
+                              offset, mlen, rlen);
+}
+
+/*****************************************************************************
+ * the rest of this file concerns connection management.  active connections
+ * start with connect_peer, passive connections start with passive_callback.
+ * active disconnects start with conn_close, cm_callback starts passive
+ * disconnects and contains the guts of how the disconnect state machine
+ * progresses. 
+ *****************************************************************************/
+
+int
+kibnal_thread_start (int (*fn)(void *arg), void *arg)
+{
+        /* Spawn a kernel thread running fn(arg), counting it in
+         * kib_nthreads.  Returns 0 on success or the (negative)
+         * kernel_thread() error. */
+        long    rc = kernel_thread (fn, arg, 0);
+
+        if (rc >= 0)
+                atomic_inc (&kibnal_data.kib_nthreads);
+
+        return (rc < 0) ? (int)rc : 0;
+}
+
+/* Called by each NAL thread as it exits, to drop its count from
+ * kib_nthreads (presumably consulted at shutdown; not visible here). */
+static void
+kibnal_thread_fini (void)
+{
+        atomic_dec (&kibnal_data.kib_nthreads);
+}
+
+/* this can be called by anyone at any time to close a connection.  if
+ * the connection is still established it heads to the connd to start
+ * the disconnection in a safe context.  It has no effect if called
+ * on a connection that is already disconnecting */
+void
+kibnal_close_conn_locked (kib_conn_t *conn, int error)
+{
+        /* This just does the immediate housekeeping, and schedules the
+         * connection for the connd to finish off.
+         * Caller holds kib_global_lock exclusively in irq context */
+        kib_peer_t   *peer = conn->ibc_peer;
+
+        KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_CONNECTING,
+                                    IBNAL_CONN_DISCONNECTED);
+
+        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
+                return; /* already disconnecting */
+
+        CDEBUG (error == 0 ? D_NET : D_ERROR,
+                "closing conn to "LPX64": error %d\n", peer->ibp_nid, error);
+
+        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+                /* kib_connd_conns takes ibc_list's ref */
+                list_del (&conn->ibc_list);
+        } else {
+                /* new ref for kib_connd_conns */
+                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                       atomic_read (&conn->ibc_refcount));
+                atomic_inc (&conn->ibc_refcount);
+        }
+        
+        if (list_empty (&peer->ibp_conns) &&
+            peer->ibp_persistence == 0) {
+                /* Non-persistent peer with no more conns... */
+                kibnal_unlink_peer_locked (peer);
+        }
+
+        /* mark disconnecting; connd drives the rest of the teardown */
+        conn->ibc_state = IBNAL_CONN_SEND_DREQ;
+
+        spin_lock (&kibnal_data.kib_connd_lock);
+
+        list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+        wake_up (&kibnal_data.kib_connd_waitq);
+                
+        spin_unlock (&kibnal_data.kib_connd_lock);
+}
+
+/* Unlocked wrapper: take the global lock in irq-safe mode and defer
+ * to kibnal_close_conn_locked() */
+void
+kibnal_close_conn (kib_conn_t *conn, int error)
+{
+        unsigned long     flags;
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        kibnal_close_conn_locked (conn, error);
+        
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+}
+
+/* Handle a failed connection attempt to 'peer' ('active' is set iff we
+ * initiated it): drop one connection-attempt count and, if no attempts
+ * or established conns remain, back off the reconnect interval and
+ * complete all transmits queued on the peer with -EHOSTUNREACH. */
+static void
+kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
+{
+        LIST_HEAD        (zombies);
+        kib_tx_t         *tx;
+        unsigned long     flags;
+
+        LASSERT (rc != 0);
+        LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        LASSERT (peer->ibp_connecting != 0);
+        peer->ibp_connecting--;
+
+        if (peer->ibp_connecting != 0) {
+                /* another connection attempt under way (loopback?)... */
+                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+                return;
+        }
+
+        if (list_empty(&peer->ibp_conns)) {
+                /* Say when active connection can be re-attempted */
+                peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
+                /* Increase reconnection interval */
+                peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
+                                                    IBNAL_MAX_RECONNECT_INTERVAL);
+        
+                /* Take peer's blocked transmits; I'll complete
+                 * them with error */
+                while (!list_empty (&peer->ibp_tx_queue)) {
+                        tx = list_entry (peer->ibp_tx_queue.next,
+                                         kib_tx_t, tx_list);
+                        
+                        list_del (&tx->tx_list);
+                        list_add_tail (&tx->tx_list, &zombies);
+                }
+                
+                if (kibnal_peer_active(peer) &&
+                    (peer->ibp_persistence == 0)) {
+                        /* failed connection attempt on non-persistent peer */
+                        kibnal_unlink_peer_locked (peer);
+                }
+        } else {
+                /* Can't have blocked transmits if there are connections */
+                LASSERT (list_empty(&peer->ibp_tx_queue));
+        }
+        
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+        if (!list_empty (&zombies))
+                CERROR ("Deleting messages for "LPX64": connection failed\n",
+                        peer->ibp_nid);
+
+        /* complete the zombies outside the lock */
+        while (!list_empty (&zombies)) {
+                tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+                list_del (&tx->tx_list);
+                /* complete now */
+                tx->tx_status = -EHOSTUNREACH;
+                kibnal_tx_done (tx);
+        }
+}
+
+/* Complete a connection attempt ('active' set iff we initiated it)
+ * with 'status'.  On success: move the conn to ESTABLISHED, add it to
+ * the peer's conn list, post the peer's blocked transmits and queue up
+ * all the receive buffers.  On failure: schedule the conn for close
+ * (or mark it DISCONNECTED if never connected) and fail the peer's
+ * blocked transmits via kibnal_peer_connect_failed(). */
+static void
+kibnal_connreq_done (kib_conn_t *conn, int active, int status)
+{
+        int               state = conn->ibc_state;
+        kib_peer_t       *peer = conn->ibc_peer;
+        kib_tx_t         *tx;
+        unsigned long     flags;
+        int               i;
+
+        /* passive connection has no connreq & vice versa */
+        LASSERTF(!active == !(conn->ibc_connreq != NULL),
+                 "%d %p\n", active, conn->ibc_connreq);
+        if (active) {
+                PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
+                conn->ibc_connreq = NULL;
+        }
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        LASSERT (peer->ibp_connecting != 0);
+        
+        if (status == 0) {                         
+                /* connection established... */
+                KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_CONNECTING);
+                conn->ibc_state = IBNAL_CONN_ESTABLISHED;
+
+                if (!kibnal_peer_active(peer)) {
+                        /* ...but peer deleted meantime */
+                        status = -ECONNABORTED;
+                }
+        } else {
+                KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_INIT_QP,
+                                            IBNAL_CONN_CONNECTING);
+        }
+
+        if (status == 0) {
+                /* Everything worked! */
+
+                peer->ibp_connecting--;
+
+                /* +1 ref for ibc_list; caller(== CM)'s ref remains until
+                 * the IB_CM_IDLE callback */
+                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                       atomic_read (&conn->ibc_refcount));
+                atomic_inc (&conn->ibc_refcount);
+                list_add (&conn->ibc_list, &peer->ibp_conns);
+                
+                /* reset reconnect interval for next attempt */
+                peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+
+                /* post blocked sends to the new connection */
+                spin_lock (&conn->ibc_lock);
+                
+                while (!list_empty (&peer->ibp_tx_queue)) {
+                        tx = list_entry (peer->ibp_tx_queue.next, 
+                                         kib_tx_t, tx_list);
+                        
+                        list_del (&tx->tx_list);
+
+                        /* +1 ref for each tx */
+                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                               atomic_read (&conn->ibc_refcount));
+                        atomic_inc (&conn->ibc_refcount);
+                        kibnal_queue_tx_locked (tx, conn);
+                }
+                
+                spin_unlock (&conn->ibc_lock);
+
+                /* Nuke any dangling conns from a different peer instance... */
+                kibnal_close_stale_conns_locked (conn->ibc_peer,
+                                                 conn->ibc_incarnation);
+
+                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+                /* queue up all the receives */
+                for (i = 0; i < IBNAL_RX_MSGS; i++) {
+                        /* +1 ref for rx desc */
+                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                               atomic_read (&conn->ibc_refcount));
+                        atomic_inc (&conn->ibc_refcount);
+
+                        CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n",
+                               i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg,
+                               conn->ibc_rxs[i].rx_vaddr);
+
+                        kibnal_post_rx (&conn->ibc_rxs[i], 0);
+                }
+
+                kibnal_check_sends (conn);
+                return;
+        }
+
+        /* connection failed */
+        if (state == IBNAL_CONN_CONNECTING) {
+                /* schedule for connd to close */
+                kibnal_close_conn_locked (conn, status);
+        } else {
+                /* Don't have a CM comm_id; just wait for refs to drain */
+                conn->ibc_state = IBNAL_CONN_DISCONNECTED;
+        } 
+
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+        kibnal_peer_connect_failed (conn->ibc_peer, active, status);
+
+        /* If we didn't establish the connection we don't have to pass
+         * through the disconnect protocol before dropping the CM ref */
+        if (state < IBNAL_CONN_CONNECTING) 
+                kibnal_put_conn (conn);
+}
+
+/* Set up a conn/peer for an incoming (passive) connection request from
+ * 'nid': validate the requested queue depth, create or reuse the peer,
+ * and return the new conn (state CONNECTING) in *connp.
+ * Returns 0 on success or a -ve errno. */
+static int
+kibnal_accept (kib_conn_t **connp, IB_HANDLE *cep,
+                ptl_nid_t nid, __u64 incarnation, int queue_depth)
+{
+        kib_conn_t    *conn = kibnal_create_conn();
+        kib_peer_t    *peer;
+        kib_peer_t    *peer2;
+        unsigned long  flags;
+
+        if (conn == NULL)
+                return (-ENOMEM);
+
+        if (queue_depth != IBNAL_MSG_QUEUE_SIZE) {
+                CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n",
+                       nid, queue_depth, IBNAL_MSG_QUEUE_SIZE);
+                atomic_dec (&conn->ibc_refcount);
+                kibnal_destroy_conn(conn);
+                return (-EPROTO);
+        }
+        
+        /* assume 'nid' is a new peer */
+        peer = kibnal_create_peer (nid);
+        if (peer == NULL) {
+                /* NB conn->ibc_peer isn't set until further below, so
+                 * it mustn't be dereferenced here; log the requester's
+                 * nid directly instead. */
+                CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, nid,
+                       atomic_read (&conn->ibc_refcount));
+                atomic_dec (&conn->ibc_refcount);
+                kibnal_destroy_conn(conn);
+                return (-ENOMEM);
+        }
+        
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        /* prefer any existing peer with this nid to the one just made */
+        peer2 = kibnal_find_peer_locked(nid);
+        if (peer2 == NULL) {
+                /* peer table takes my ref on peer */
+                list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid));
+        } else {
+                kib_peer_decref (peer);
+                peer = peer2;
+        }
+
+        kib_peer_addref(peer); /* +1 ref for conn */
+        peer->ibp_connecting++;
+
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+        conn->ibc_peer = peer;
+        conn->ibc_state = IBNAL_CONN_CONNECTING;
+        /* conn->ibc_cep is set when cm_accept is called */
+        conn->ibc_incarnation = incarnation;
+        conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+
+        *connp = conn;
+        return (0);
+}
+
+/* Request a simple QP state transition; a failure is only logged. */
+static void kibnal_set_qp_state(IB_HANDLE *qp, IB_QP_STATE state)
+{
+        IB_QP_ATTRIBUTES_MODIFY req = {0,};
+        FSTATUS rc;
+
+        req.RequestState = state;
+
+        rc = iibt_qp_modify(qp, &req, NULL);
+        if (rc != FSUCCESS)
+                CERROR("couldn't set qp state to %d, error %d\n", state, rc);
+}
+
+/* Complete or abort everything still outstanding on a conn once it has
+ * fully disconnected: error the QP so posted receives flush (dropping
+ * their conn refs), fail passive RDMAs that were only waiting on the
+ * peer, and fail all blocked transmits with -ECONNABORTED. */
+static void kibnal_flush_pending(kib_conn_t *conn)
+{
+        LIST_HEAD        (zombies); 
+        struct list_head *tmp;
+        struct list_head *nxt;
+        kib_tx_t         *tx;
+        unsigned long     flags;
+        int               done;
+
+        /* NB we wait until the connection has closed before completing
+         * outstanding passive RDMAs so we can be sure the network can't 
+         * touch the mapped memory any more. */
+        KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_DISCONNECTED);
+
+        /* set the QP to the error state so that we get flush callbacks
+         * on our posted receives which can then drop their conn refs */
+        kibnal_set_qp_state(conn->ibc_qp, QPStateError);
+
+        spin_lock_irqsave (&conn->ibc_lock, flags);
+
+        /* grab passive RDMAs not waiting for the tx callback */
+        list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
+                tx = list_entry (tmp, kib_tx_t, tx_list);
+
+                LASSERT (tx->tx_passive_rdma ||
+                         !tx->tx_passive_rdma_wait);
+
+                LASSERT (tx->tx_passive_rdma_wait ||
+                         tx->tx_sending != 0);
+
+                /* still waiting for tx callback? */
+                if (!tx->tx_passive_rdma_wait)
+                        continue;
+
+                tx->tx_status = -ECONNABORTED;
+                tx->tx_passive_rdma_wait = 0;
+                done = (tx->tx_sending == 0);
+
+                /* a send is still in flight: its callback completes the tx */
+                if (!done)
+                        continue;
+
+                list_del (&tx->tx_list);
+                list_add (&tx->tx_list, &zombies);
+        }
+
+        /* grab all blocked transmits */
+        list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
+                tx = list_entry (tmp, kib_tx_t, tx_list);
+                
+                list_del (&tx->tx_list);
+                list_add (&tx->tx_list, &zombies);
+        }
+        
+        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+        /* complete the zombies outside the lock */
+        while (!list_empty(&zombies)) {
+                tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+                list_del(&tx->tx_list);
+                kibnal_tx_done (tx);
+        }
+}
+
+/* Send a CM connection reject on 'cep' carrying 'reason'. */
+static void
+kibnal_reject (IB_HANDLE cep, uint16_t reason)
+{
+        CM_REJECT_INFO *rej = NULL;
+
+        PORTAL_ALLOC(rej, sizeof(*rej));
+        if (rej == NULL) /* PORTAL_ALLOC() will CERROR on failure */
+                return;
+
+        rej->Reason = reason;
+        iibt_cm_reject(cep, rej);
+        PORTAL_FREE(rej, sizeof(*rej));
+}
+
+/* Transition 'qp_handle' to ReadyToRecv and then ReadyToSend against
+ * the given destination QPN/path.  Returns the first failing FSTATUS,
+ * or FSUCCESS if both modifies succeed. */
+static FSTATUS
+kibnal_qp_rts(IB_HANDLE qp_handle, __u32 qpn, __u8 resp_res, 
+              IB_PATH_RECORD *path, __u8 init_depth, __u32 send_psn)
+{
+        IB_QP_ATTRIBUTES_MODIFY modify_attr;
+        FSTATUS frc;
+        ENTRY;
+
+        /* first hop: RTR with the peer's QPN and receive parameters */
+        modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+                .RequestState           = QPStateReadyToRecv,
+                .RecvPSN                = IBNAL_STARTING_PSN,
+                .DestQPNumber           = qpn,
+                .ResponderResources     = resp_res,
+                .MinRnrTimer            = UsecToRnrNakTimer(2000), /* 20 ms */
+                .Attrs                  = (IB_QP_ATTR_RECVPSN |
+                                           IB_QP_ATTR_DESTQPNUMBER | 
+                                           IB_QP_ATTR_RESPONDERRESOURCES | 
+                                           IB_QP_ATTR_DESTAV | 
+                                           IB_QP_ATTR_PATHMTU | 
+                                           IB_QP_ATTR_MINRNRTIMER),
+        };
+        /* fill in the address vector and path MTU from the path record */
+        GetAVFromPath(0, path, &modify_attr.PathMTU, NULL, 
+                      &modify_attr.DestAV);
+
+        frc = iibt_qp_modify(qp_handle, &modify_attr, NULL);
+        if (frc != FSUCCESS) 
+                RETURN(frc);
+
+        /* second hop: RTS with our send parameters */
+        modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+                .RequestState           = QPStateReadyToSend,
+                .FlowControl            = TRUE,
+                .InitiatorDepth         = init_depth,
+                .SendPSN                = send_psn,
+                .LocalAckTimeout        = path->PktLifeTime + 2, /* 2 or 1? */
+                .RetryCount             = IBNAL_RETRY,
+                .RnrRetryCount          = IBNAL_RNR_RETRY,
+                .Attrs                  = (IB_QP_ATTR_FLOWCONTROL | 
+                                           IB_QP_ATTR_INITIATORDEPTH | 
+                                           IB_QP_ATTR_SENDPSN | 
+                                           IB_QP_ATTR_LOCALACKTIMEOUT | 
+                                           IB_QP_ATTR_RETRYCOUNT | 
+                                           IB_QP_ATTR_RNRRETRYCOUNT),
+        };
+
+        frc = iibt_qp_modify(qp_handle, &modify_attr, NULL);
+        RETURN(frc);
+}
+
+/* Active-side CM callback: the peer has replied (CM REP) to our connect
+ * request.  Validate the kib_wire_connreq_t carried in the reply's private
+ * data, move our QP to RTS with the peer's negotiated parameters, then
+ * complete the handshake with iibt_cm_accept() (sends the RTU).  On any
+ * validation failure the connection is rejected and torn down with -EPROTO. */
+static void
+kibnal_connect_reply (IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+        IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
+        kib_conn_t *conn = arg;
+        kib_wire_connreq_t *wcr;
+        CM_REPLY_INFO *rep = &info->Info.Reply;
+        uint16_t reason;
+        FSTATUS frc;
+
+        wcr = (kib_wire_connreq_t *)info->Info.Reply.PrivateData;
+
+        /* wire fields are little-endian; check magic/version/depth/nid in
+         * turn before trusting anything else in the blob */
+        if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
+                CERROR ("Can't connect "LPX64": bad magic %08x\n",
+                        conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic));
+                GOTO(reject, reason = RC_USER_REJ);
+        }
+        
+        if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
+                /* fixed: report the offending version, not the magic */
+                CERROR ("Can't connect "LPX64": bad version %d\n",
+                        conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_version));
+                GOTO(reject, reason = RC_USER_REJ);
+        }
+                        
+        if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
+                CERROR ("Can't connect "LPX64": bad queue depth %d\n",
+                        conn->ibc_peer->ibp_nid, 
+                        le16_to_cpu(wcr->wcr_queue_depth));
+                GOTO(reject, reason = RC_USER_REJ);
+        }
+                        
+        if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) {
+                CERROR ("Unexpected NID "LPX64" from "LPX64"\n",
+                        le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid);
+                GOTO(reject, reason = RC_USER_REJ);
+        }
+
+        CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n",
+               conn, conn->ibc_peer->ibp_nid);
+
+        conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation);
+        conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+
+        /* clamp the peer's offered depths to what our HCA supports */
+        frc = kibnal_qp_rts(conn->ibc_qp, rep->QPN, 
+                            min_t(__u8, rep->ArbInitiatorDepth,
+                                  ca_attr->MaxQPResponderResources),
+                            &conn->ibc_connreq->cr_path, 
+                            min_t(__u8, rep->ArbResponderResources,
+                                  ca_attr->MaxQPInitiatorDepth),
+                            rep->StartingPSN);
+        if (frc != FSUCCESS) {
+                CERROR("Connection %p -> "LPX64" QP RTS/RTR failed: %d\n",
+                       conn, conn->ibc_peer->ibp_nid, frc);
+                GOTO(reject, reason = RC_NO_QP);
+        }
+
+        /* the callback arguments are ignored for an active accept */
+        conn->ibc_connreq->cr_discarded.Status = FSUCCESS;
+        frc = iibt_cm_accept(cep, &conn->ibc_connreq->cr_discarded, 
+                             NULL, NULL, NULL, NULL);
+        if (frc != FCM_CONNECT_ESTABLISHED) {
+                CERROR("Connection %p -> "LPX64" CMAccept failed: %d\n",
+                       conn, conn->ibc_peer->ibp_nid, frc);
+                kibnal_connreq_done (conn, 1, -ECONNABORTED);
+                /* XXX don't call reject after accept fails? */
+                return;
+        }
+
+        CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n",
+               conn, conn->ibc_peer->ibp_nid);
+
+        kibnal_connreq_done (conn, 1, 0);
+        return;
+
+reject:
+        kibnal_reject(cep, reason);
+        kibnal_connreq_done (conn, 1, -EPROTO);
+}
+
+/* ib_cm.h has a wealth of information on the CM procedures */
+/* General CM event dispatcher for a connected endpoint; 'arg' is the
+ * kib_conn_t registered with the CEP at connect/accept time.  Handles the
+ * active-connect reply and the disconnect handshake; any other status is
+ * fatal (LBUG). */
+static void
+kibnal_cm_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+        kib_conn_t       *conn = arg;
+
+        CDEBUG(D_NET, "status 0x%x\n", info->Status);
+
+        /* Established Connection Notifier */
+        switch (info->Status) {
+        default:
+                CERROR("unknown status %d on Connection %p -> "LPX64"\n",
+                       info->Status, conn, conn->ibc_peer->ibp_nid);
+                LBUG();
+                break;
+
+        case FCM_CONNECT_REPLY:
+                /* active side: the peer answered our CM REQ */
+                kibnal_connect_reply(cep, info, arg);
+                break;
+
+        case FCM_DISCONNECT_REQUEST:
+                /* XXX lock around these state management bits? */
+                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
+                        kibnal_close_conn (conn, 0);
+                conn->ibc_state = IBNAL_CONN_DREP;
+                iibt_cm_disconnect(conn->ibc_cep, NULL, NULL);
+                break;
+
+        /* these both guarantee that no more cm callbacks will occur */
+        case FCM_DISCONNECTED: /* aka FCM_DISCONNECT_TIMEOUT */
+        case FCM_DISCONNECT_REPLY:
+                CDEBUG(D_NET, "Connection %p -> "LPX64" disconnect done.\n",
+                       conn, conn->ibc_peer->ibp_nid);
+
+                conn->ibc_state = IBNAL_CONN_DISCONNECTED;
+                kibnal_flush_pending(conn);
+                kibnal_put_conn(conn);        /* Lose CM's ref */
+                break;
+        }
+
+        return;
+}
+
+/* Enable the CM options we rely on for a CEP: the timewait callback (so we
+ * learn when the CEP finally leaves timewait).  Async accept is compiled
+ * out.  Returns 0 on success, -1 on any iibt_cm_modify_cep() failure. */
+static int
+kibnal_set_cm_flags(IB_HANDLE cep)
+{
+        FSTATUS frc;
+        uint32 value = 1;
+
+        frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK,
+                                 (char *)&value, sizeof(value), 0);
+        if (frc != FSUCCESS) {
+                CERROR("error setting timeout callback: %d\n", frc);
+                return -1;
+        }
+
+#if 0
+        frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, (char *)&value,
+                                 sizeof(value), 0);
+        if (frc != FSUCCESS) {
+                CERROR("error setting async accept: %d\n", frc);
+                return -1;
+        }
+#endif
+
+        return 0;
+}
+
+/* Passive-side CM callback: a peer has sent us a CM connect request.
+ * Validates the wire connreq in the request's private data, creates a conn
+ * and moves its QP to RTS (kibnal_accept/kibnal_qp_rts), then replies with
+ * our own wire connreq via iibt_cm_accept().  Any failure before accept is
+ * answered with kibnal_reject() and the conn (if created) is cleaned up. */
+void
+kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+        IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
+        IB_QP_ATTRIBUTES_QUERY *query;
+        CM_REQUEST_INFO    *req;
+        CM_CONN_INFO       *rep = NULL, *rcv = NULL;
+        kib_wire_connreq_t *wcr;
+        kib_conn_t         *conn = NULL;
+        uint16_t            reason = 0;
+        FSTATUS             frc;
+        int                 rc = 0;
+        
+        LASSERT(cep);
+        LASSERT(info);
+        LASSERT(arg == NULL); /* no conn yet for passive */
+
+        CDEBUG(D_NET, "status 0x%x\n", info->Status);
+
+        req = &info->Info.Request;
+        wcr = (kib_wire_connreq_t *)req->PrivateData;
+
+        CDEBUG(D_NET, "%d from "LPX64"\n", info->Status, 
+               le64_to_cpu(wcr->wcr_nid));
+        
+        if (info->Status == FCM_CONNECT_CANCEL)
+                return;
+        
+        LASSERT (info->Status == FCM_CONNECT_REQUEST);
+        
+        if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
+                CERROR ("Can't accept: bad magic %08x\n",
+                        le32_to_cpu(wcr->wcr_magic));
+                GOTO(out, reason = RC_USER_REJ);
+        }
+
+        if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
+                /* fixed: report the offending version, not the magic */
+                CERROR ("Can't accept: bad version %d\n",
+                        le16_to_cpu(wcr->wcr_version));
+                GOTO(out, reason = RC_USER_REJ);
+        }
+
+        rc = kibnal_accept(&conn, cep,
+                           le64_to_cpu(wcr->wcr_nid),
+                           le64_to_cpu(wcr->wcr_incarnation),
+                           le16_to_cpu(wcr->wcr_queue_depth));
+        if (rc != 0) {
+                CERROR ("Can't accept "LPX64": %d\n",
+                        le64_to_cpu(wcr->wcr_nid), rc);
+                GOTO(out, reason = RC_NO_RESOURCES);
+        }
+
+        /* clamp the peer's offered depths to what our HCA supports */
+        frc = kibnal_qp_rts(conn->ibc_qp, req->CEPInfo.QPN,
+                            min_t(__u8, req->CEPInfo.OfferedInitiatorDepth, 
+                                  ca_attr->MaxQPResponderResources),
+                            &req->PathInfo.Path,
+                            min_t(__u8, req->CEPInfo.OfferedResponderResources, 
+                                  ca_attr->MaxQPInitiatorDepth),
+                            req->CEPInfo.StartingPSN);
+
+        if (frc != FSUCCESS) {
+                CERROR ("Can't mark QP RTS/RTR  "LPX64": %d\n",
+                        le64_to_cpu(wcr->wcr_nid), frc);
+                GOTO(out, reason = RC_NO_QP);
+        }
+
+        frc = iibt_qp_query(conn->ibc_qp, &conn->ibc_qp_attrs, NULL);
+        if (frc != FSUCCESS) {
+                CERROR ("Couldn't query qp attributes "LPX64": %d\n",
+                        le64_to_cpu(wcr->wcr_nid), frc);
+                GOTO(out, reason = RC_NO_QP);
+        }
+        query = &conn->ibc_qp_attrs;
+
+        PORTAL_ALLOC(rep, sizeof(*rep));
+        PORTAL_ALLOC(rcv, sizeof(*rcv));
+        if (rep == NULL || rcv == NULL) {
+                /* fixed: free whichever allocation succeeded */
+                if (rep != NULL)
+                        PORTAL_FREE(rep, sizeof(*rep));
+                if (rcv != NULL)
+                        PORTAL_FREE(rcv, sizeof(*rcv));
+                CERROR ("can't reply and receive buffers\n");
+                GOTO(out, reason = RC_INSUFFICIENT_RESP_RES);
+        }
+
+        /* don't try to deref this into the incoming wcr :) */
+        wcr = (kib_wire_connreq_t *)rep->Info.Reply.PrivateData;
+
+        rep->Info.Reply = (CM_REPLY_INFO) {
+                .QPN = query->QPNumber,
+                .QKey = query->Qkey,
+                .StartingPSN = query->RecvPSN,
+                .EndToEndFlowControl = query->FlowControl,
+                /* XXX Hmm. */
+                .ArbInitiatorDepth = query->InitiatorDepth,
+                .ArbResponderResources = query->ResponderResources,
+                .TargetAckDelay = 0,
+                .FailoverAccepted = 0,
+                .RnRRetryCount = req->CEPInfo.RnrRetryCount,
+        };
+                
+        *wcr = (kib_wire_connreq_t) {
+                .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
+                .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
+                /* fixed: wcr_queue_depth is a 16-bit field (see the checks
+                 * above and kibnal_pathreq_callback): le16, not le32 */
+                .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
+                .wcr_nid         = cpu_to_le64(kibnal_data.kib_nid),
+                .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
+        };
+
+        frc = iibt_cm_accept(cep, rep, rcv, kibnal_cm_callback, conn, 
+                             &conn->ibc_cep);
+
+        PORTAL_FREE(rep, sizeof(*rep));
+        PORTAL_FREE(rcv, sizeof(*rcv));
+
+        if (frc != FCM_CONNECT_ESTABLISHED) {
+                /* XXX it seems we don't call reject after this point? */
+                CERROR("iibt_cm_accept() failed: %d, aborting\n", frc);
+                rc = -ECONNABORTED;
+                goto out;
+        }
+
+        if (kibnal_set_cm_flags(conn->ibc_cep)) {
+                rc = -ECONNABORTED;
+                goto out;
+        }
+
+        CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n",
+               conn, conn->ibc_peer->ibp_nid);
+
+out:
+        if (reason) {
+                kibnal_reject(cep, reason);
+                rc = -ECONNABORTED;
+        }
+        if (conn != NULL) 
+                kibnal_connreq_done(conn, 0, rc);
+
+        return;
+}
+
+/* Debug helper: log the SGID/DGID/P_Key of every path record returned by a
+ * subnet-administrator path query. */
+static void
+dump_path_records(PATH_RESULTS *results)
+{
+        IB_PATH_RECORD *path;
+        int i;
+
+        for(i = 0; i < results->NumPathRecords; i++) {
+                path = &results->PathRecords[i];
+                CDEBUG(D_NET, "%d: sgid "LPX64":"LPX64" dgid "
+                       LPX64":"LPX64" pkey %x\n",
+                       i,
+                       path->SGID.Type.Global.SubnetPrefix,
+                       path->SGID.Type.Global.InterfaceID,
+                       path->DGID.Type.Global.SubnetPrefix,
+                       path->DGID.Type.Global.InterfaceID,
+                       path->P_Key);
+        }
+}
+
+/* Completion callback for the path-record query issued from
+ * kibnal_service_get_callback().  On success: stash the first path record,
+ * create and configure a CEP, build the wire connreq and CM request, and
+ * kick off the actual CM connect.  Any failure completes the connreq with
+ * -EINVAL. */
+static void
+kibnal_pathreq_callback (void *arg, QUERY *query, 
+                         QUERY_RESULT_VALUES *query_res)
+{
+        IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
+        kib_conn_t *conn = arg;
+        PATH_RESULTS *path;
+        FSTATUS frc;
+        
+        if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) {
+                CERROR ("status %d data size %d\n", query_res->Status,
+                        query_res->ResultDataSize);
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        path = (PATH_RESULTS *)query_res->QueryResult;
+
+        if (path->NumPathRecords < 1) {
+                CERROR ("expected path records: %d\n", path->NumPathRecords);
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        dump_path_records(path);
+
+        /* just using the first.  this is probably a horrible idea. */
+        conn->ibc_connreq->cr_path = path->PathRecords[0];
+
+        conn->ibc_cep = iibt_cm_create_cep(CM_RC_TYPE);
+        if (conn->ibc_cep == NULL) {
+                CERROR ("Can't create CEP\n");
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        if (kibnal_set_cm_flags(conn->ibc_cep)) {
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        /* wire connreq travels little-endian in the CM private data */
+        conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) {
+                .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
+                .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
+                .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
+                .wcr_nid         = cpu_to_le64(kibnal_data.kib_nid),
+                .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
+        };
+
+        conn->ibc_connreq->cr_cmreq = (CM_REQUEST_INFO) {
+                .SID = conn->ibc_connreq->cr_service.RID.ServiceID,
+                .CEPInfo = (CM_CEP_INFO) { 
+                        .CaGUID = kibnal_data.kib_hca_guids[0],
+                        .EndToEndFlowControl = FALSE,
+                        .PortGUID = conn->ibc_connreq->cr_path.SGID.Type.Global.InterfaceID,
+                        .RetryCount = IBNAL_RETRY,
+                        .RnrRetryCount = IBNAL_RNR_RETRY,
+                        .AckTimeout = IBNAL_ACK_TIMEOUT,
+                        .StartingPSN = IBNAL_STARTING_PSN,
+                        .QPN = conn->ibc_qp_attrs.QPNumber,
+                        .QKey = conn->ibc_qp_attrs.Qkey,
+                        .OfferedResponderResources = ca_attr->MaxQPResponderResources,
+                        .OfferedInitiatorDepth = ca_attr->MaxQPInitiatorDepth,
+                },
+                .PathInfo = (CM_CEP_PATHINFO) {
+                        .bSubnetLocal = TRUE,
+                        .Path = conn->ibc_connreq->cr_path,
+                },
+        };
+
+#if 0
+        /* XXX set timeout just like SDP!!!*/
+        conn->ibc_connreq->cr_path.packet_life = 13;
+#endif
+        /* Flag I'm getting involved with the CM... */
+        conn->ibc_state = IBNAL_CONN_CONNECTING;
+
+        CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n",
+               conn->ibc_connreq->cr_service.RID.ServiceID, 
+               *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
+
+        /* zero the whole private-data area, then copy in our wire connreq */
+        memset(conn->ibc_connreq->cr_cmreq.PrivateData, 0, 
+               CM_REQUEST_INFO_USER_LEN);
+        memcpy(conn->ibc_connreq->cr_cmreq.PrivateData, 
+               &conn->ibc_connreq->cr_wcr, sizeof(conn->ibc_connreq->cr_wcr));
+
+        /* kibnal_cm_callback gets my conn ref */
+        frc = iibt_cm_connect(conn->ibc_cep, &conn->ibc_connreq->cr_cmreq,
+                              kibnal_cm_callback, conn);
+        if (frc != FPENDING && frc != FSUCCESS) {
+                CERROR ("Connect: %d\n", frc);
+                /* Back out state change as connect failed */
+                conn->ibc_state = IBNAL_CONN_INIT_QP;
+                kibnal_connreq_done (conn, 1, -EINVAL);
+        }
+}
+
+/* Debug helper: log the service id, GID and P_Key of every service record
+ * returned by a subnet-administrator service query. */
+static void
+dump_service_records(SERVICE_RECORD_RESULTS *results)
+{
+        IB_SERVICE_RECORD *svc;
+        int i;
+
+        for(i = 0; i < results->NumServiceRecords; i++) {
+                svc = &results->ServiceRecords[i];
+                CDEBUG(D_NET, "%d: sid "LPX64" gid "LPX64":"LPX64" pkey %x\n",
+                       i,
+                       svc->RID.ServiceID,
+                       svc->RID.ServiceGID.Type.Global.SubnetPrefix,
+                       svc->RID.ServiceGID.Type.Global.InterfaceID,
+                       svc->RID.ServiceP_Key);
+        }
+}
+
+
+/* Completion callback for the service-record query issued from
+ * kibnal_connect_peer().  On success: stash the first matching service
+ * record, then chain a path-record query (source = our port GUID, dest =
+ * the service's GID) whose completion is kibnal_pathreq_callback().  Any
+ * failure completes the connreq with -EINVAL. */
+static void
+kibnal_service_get_callback (void *arg, QUERY *query, 
+                             QUERY_RESULT_VALUES *query_res)
+{
+        kib_conn_t *conn = arg;
+        SERVICE_RECORD_RESULTS *svc;
+        COMMAND_CONTROL_PARAMETERS sd_params;
+        QUERY   path_query;
+        FSTATUS frc;
+        
+        if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) {
+                CERROR ("status %d data size %d\n", query_res->Status,
+                        query_res->ResultDataSize);
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        svc = (SERVICE_RECORD_RESULTS *)query_res->QueryResult;
+
+        if (svc->NumServiceRecords < 1) {
+                CERROR ("%d service records\n", svc->NumServiceRecords);
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        dump_service_records(svc);
+
+        /* just use the first record */
+        conn->ibc_connreq->cr_service = svc->ServiceRecords[0];
+
+        CDEBUG(D_NET, "Got status %d, service id "LPX64", on "LPX64"\n",
+               query_res->Status , conn->ibc_connreq->cr_service.RID.ServiceID, 
+               *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
+
+        memset(&path_query, 0, sizeof(path_query));
+        path_query.InputType = InputTypePortGuidPair;
+        path_query.OutputType = OutputTypePathRecord;
+        path_query.InputValue.PortGuidPair.SourcePortGuid = kibnal_data.kib_port_guid;
+        path_query.InputValue.PortGuidPair.DestPortGuid  = conn->ibc_connreq->cr_service.RID.ServiceGID.Type.Global.InterfaceID;
+
+        memset(&sd_params, 0, sizeof(sd_params));
+        sd_params.RetryCount = IBNAL_RETRY;
+        sd_params.Timeout = 10 * 1000;   /* wait 10 seconds */
+
+        /* kibnal_pathreq_callback gets my conn ref */
+
+        frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
+                                                    kibnal_data.kib_port_guid,
+                                                    &path_query, 
+                                                    kibnal_pathreq_callback,
+                                                    &sd_params, conn);
+        if (frc == FPENDING)
+                return;
+
+        CERROR ("Path record request failed: %d\n", frc);
+        kibnal_connreq_done (conn, 1, -EINVAL);
+}
+
+/* Start an active connection attempt to 'peer': allocate a conn and its
+ * connreq, then launch an asynchronous service-record query whose
+ * completion (kibnal_service_get_callback) drives the rest of the connect
+ * sequence.  The conn ref is handed to the callback chain; all failure
+ * paths complete via kibnal_connreq_done(). */
+static void
+kibnal_connect_peer (kib_peer_t *peer)
+{
+        COMMAND_CONTROL_PARAMETERS sd_params;
+        QUERY   query;
+        FSTATUS frc;
+        kib_conn_t  *conn = kibnal_create_conn();
+
+        LASSERT (peer->ibp_connecting != 0);
+
+        if (conn == NULL) {
+                CERROR ("Can't allocate conn\n");
+                kibnal_peer_connect_failed (peer, 1, -ENOMEM);
+                return;
+        }
+
+        conn->ibc_peer = peer;
+        kib_peer_addref(peer);
+
+        PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
+        if (conn->ibc_connreq == NULL) {
+                CERROR ("Can't allocate connreq\n");
+                kibnal_connreq_done (conn, 1, -ENOMEM);
+                return;
+        }
+
+        memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq));
+
+        kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);
+
+        /* look the peer's NID up as a service record on the fabric */
+        memset(&query, 0, sizeof(query));
+        query.InputType = InputTypeServiceRecord;
+        query.OutputType = OutputTypeServiceRecord;
+        query.InputValue.ServiceRecordValue.ServiceRecord = conn->ibc_connreq->cr_service;
+        query.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
+
+        memset(&sd_params, 0, sizeof(sd_params));
+        sd_params.RetryCount = IBNAL_RETRY;
+        sd_params.Timeout = 10 * 1000;   /* wait 10 seconds */
+
+        /* kibnal_service_get_callback gets my conn ref */
+        frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
+                                                    kibnal_data.kib_port_guid,
+                                                    &query, 
+                                                kibnal_service_get_callback, 
+                                                    &sd_params, conn);
+        if (frc == FPENDING)
+                return;
+
+        CERROR ("iibt_sd_query_port_fabric_information(): %d\n", frc);
+        /* NOTE(review): 'frc' is an FSTATUS, but other connreq_done callers
+         * pass a negative errno — confirm the intended convention */
+        kibnal_connreq_done (conn, 1, frc);
+}
+
+/* Return 1 if any tx queued or active on 'conn' has passed its deadline,
+ * 0 otherwise.  Takes ibc_lock; the queued-tx assertions document that txs
+ * still on ibc_tx_queue have not yet been handed to the hardware. */
+static int
+kibnal_conn_timed_out (kib_conn_t *conn)
+{
+        kib_tx_t          *tx;
+        struct list_head  *ttmp;
+        unsigned long      flags;
+
+        spin_lock_irqsave (&conn->ibc_lock, flags);
+
+        list_for_each (ttmp, &conn->ibc_tx_queue) {
+                tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+                LASSERT (!tx->tx_passive_rdma_wait);
+                LASSERT (tx->tx_sending == 0);
+
+                if (time_after_eq (jiffies, tx->tx_deadline)) {
+                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        return 1;
+                }
+        }
+
+        list_for_each (ttmp, &conn->ibc_active_txs) {
+                tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+                /* active txs are either awaiting a passive RDMA or are
+                 * still being sent */
+                LASSERT (tx->tx_passive_rdma ||
+                         !tx->tx_passive_rdma_wait);
+
+                LASSERT (tx->tx_passive_rdma_wait ||
+                         tx->tx_sending != 0);
+
+                if (time_after_eq (jiffies, tx->tx_deadline)) {
+                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        return 1;
+                }
+        }
+
+        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+        return 0;
+}
+
+/* Scan peer-table hash chain 'idx' for timed-out connections.  Holds the
+ * global lock shared while scanning; on finding a timeout it takes a conn
+ * ref, drops the lock, closes the conn, and restarts the scan from the top
+ * (the list may have changed while unlocked). */
+static void
+kibnal_check_conns (int idx)
+{
+        struct list_head  *peers = &kibnal_data.kib_peers[idx];
+        struct list_head  *ptmp;
+        kib_peer_t        *peer;
+        kib_conn_t        *conn;
+        struct list_head  *ctmp;
+
+ again:
+        /* NB. We expect to have a look at all the peers and not find any
+         * rdmas to time out, so we just use a shared lock while we
+         * take a look... */
+        read_lock (&kibnal_data.kib_global_lock);
+
+        list_for_each (ptmp, peers) {
+                peer = list_entry (ptmp, kib_peer_t, ibp_list);
+
+                list_for_each (ctmp, &peer->ibp_conns) {
+                        conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+                        KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_ESTABLISHED);
+
+                        /* In case we have enough credits to return via a
+                         * NOOP, but there were no non-blocking tx descs
+                         * free to do it last time... */
+                        kibnal_check_sends(conn);
+
+                        if (!kibnal_conn_timed_out(conn))
+                                continue;
+                        
+                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                               conn, conn->ibc_state, peer->ibp_nid,
+                               atomic_read (&conn->ibc_refcount));
+
+                        /* hold the conn across the unlocked close */
+                        atomic_inc (&conn->ibc_refcount);
+                        read_unlock (&kibnal_data.kib_global_lock);
+
+                        CERROR("Timed out RDMA with "LPX64"\n",
+                               peer->ibp_nid);
+
+                        kibnal_close_conn (conn, -ETIMEDOUT);
+                        kibnal_put_conn (conn);
+
+                        /* start again now I've dropped the lock */
+                        goto again;
+                }
+        }
+
+        read_unlock (&kibnal_data.kib_global_lock);
+}
+
+/* connd worker: advance a conn handed to the connd through its teardown
+ * state machine.  DISCONNECTED conns are freed outright (no put_conn);
+ * SEND_DREQ issues the CM disconnect; DREP means a CM callback beat us to
+ * it.  All other states are bugs. */
+static void
+kib_connd_handle_state(kib_conn_t *conn)
+{
+        FSTATUS frc;
+
+        switch (conn->ibc_state) {
+                /* all refs have gone, free and be done with it */ 
+                case IBNAL_CONN_DISCONNECTED:
+                        kibnal_destroy_conn (conn);
+                        return; /* avoid put_conn */
+
+                case IBNAL_CONN_SEND_DREQ:
+                        frc = iibt_cm_disconnect(conn->ibc_cep, NULL, NULL);
+                        if (frc != FSUCCESS) /* XXX do real things */
+                                CERROR("disconnect failed: %d\n", frc);
+                        conn->ibc_state = IBNAL_CONN_DREQ;
+                        break;
+
+                /* a callback got to the conn before we did */ 
+                case IBNAL_CONN_DREP:
+                        break;
+                                
+                default:
+                        CERROR ("Bad conn %p state: %d\n", conn, 
+                                conn->ibc_state);
+                        LBUG();
+                        break;
+        }
+
+        /* drop ref from close_conn */
+        kibnal_put_conn(conn);
+}
+
+/* Connection daemon thread.  Loops servicing: (1) conns queued for state
+ * handling, (2) peers queued for active connect, and (3) periodic RDMA
+ * timeout sweeps over slices of the peer hash table.  Exits once shutdown
+ * is flagged and no connections remain. */
+int
+kibnal_connd (void *arg)
+{
+        wait_queue_t       wait;
+        unsigned long      flags;
+        kib_conn_t        *conn;
+        kib_peer_t        *peer;
+        int                timeout;
+        int                i;
+        int                peer_index = 0;
+        unsigned long      deadline = jiffies;
+        
+        kportal_daemonize ("kibnal_connd");
+        kportal_blockallsigs ();
+
+        init_waitqueue_entry (&wait, current);
+
+        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+
+        for (;;) {
+                /* conns queued for state handling take priority */
+                if (!list_empty (&kibnal_data.kib_connd_conns)) {
+                        conn = list_entry (kibnal_data.kib_connd_conns.next,
+                                           kib_conn_t, ibc_list);
+                        list_del (&conn->ibc_list);
+                        
+                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+                        kib_connd_handle_state(conn);
+
+                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+                        continue;
+                }
+
+                if (!list_empty (&kibnal_data.kib_connd_peers)) {
+                        peer = list_entry (kibnal_data.kib_connd_peers.next,
+                                           kib_peer_t, ibp_connd_list);
+                        
+                        list_del_init (&peer->ibp_connd_list);
+                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+                        kibnal_connect_peer (peer);
+                        kib_peer_decref (peer);
+
+                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+                }
+
+                /* shut down and nobody left to reap... */
+                if (kibnal_data.kib_shutdown &&
+                    atomic_read(&kibnal_data.kib_nconns) == 0)
+                        break;
+
+                spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+                /* careful with the jiffy wrap... */
+                while ((timeout = (int)(deadline - jiffies)) <= 0) {
+                        const int n = 4;
+                        const int p = 1;
+                        int       chunk = kibnal_data.kib_peer_hash_size;
+                        
+                        /* Time to check for RDMA timeouts on a few more
+                         * peers: I do checks every 'p' seconds on a
+                         * proportion of the peer table and I need to check
+                         * every connection 'n' times within a timeout
+                         * interval, to ensure I detect a timeout on any
+                         * connection within (n+1)/n times the timeout
+                         * interval. */
+
+                        if (kibnal_tunables.kib_io_timeout > n * p)
+                                chunk = (chunk * n * p) / 
+                                        kibnal_tunables.kib_io_timeout;
+                        if (chunk == 0)
+                                chunk = 1;
+
+                        for (i = 0; i < chunk; i++) {
+                                kibnal_check_conns (peer_index);
+                                peer_index = (peer_index + 1) % 
+                                             kibnal_data.kib_peer_hash_size;
+                        }
+
+                        deadline += p * HZ;
+                }
+
+                kibnal_data.kib_connd_waketime = jiffies + timeout;
+
+                /* sleep until the next sweep deadline unless new work was
+                 * queued while we were unlocked */
+                set_current_state (TASK_INTERRUPTIBLE);
+                add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
+
+                if (!kibnal_data.kib_shutdown &&
+                    list_empty (&kibnal_data.kib_connd_conns) &&
+                    list_empty (&kibnal_data.kib_connd_peers))
+                        schedule_timeout (timeout);
+
+                set_current_state (TASK_RUNNING);
+                remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
+
+                spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+        }
+
+        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+        kibnal_thread_fini ();
+        return (0);
+}
+
+/* Scheduler thread: completes finished txs and processes received rxs
+ * queued by the completion handlers.  Sleeps when idle; yields the CPU
+ * after IBNAL_RESCHED consecutive work items.  Exits once shutdown is
+ * flagged and no connections remain. */
+int
+kibnal_scheduler(void *arg)
+{
+        long            id = (long)arg;
+        char            name[16];
+        kib_rx_t       *rx;
+        kib_tx_t       *tx;
+        unsigned long   flags;
+        int             rc;
+        int             counter = 0;
+        int             did_something;
+
+        snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
+        kportal_daemonize(name);
+        kportal_blockallsigs();
+
+        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+
+        for (;;) {
+                did_something = 0;
+
+                /* drain the whole tx completion queue.  NOTE(review): this
+                 * loop does not set did_something (the rx path does) —
+                 * harmless since the wait_event condition below re-checks
+                 * the queues, but confirm the asymmetry is intended */
+                while (!list_empty(&kibnal_data.kib_sched_txq)) {
+                        tx = list_entry(kibnal_data.kib_sched_txq.next,
+                                        kib_tx_t, tx_list);
+                        list_del(&tx->tx_list);
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+                                               flags);
+                        kibnal_tx_done(tx);
+
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+                                          flags);
+                }
+
+                /* process at most one rx per pass */
+                if (!list_empty(&kibnal_data.kib_sched_rxq)) {
+                        rx = list_entry(kibnal_data.kib_sched_rxq.next,
+                                        kib_rx_t, rx_list);
+                        list_del(&rx->rx_list);
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+                                               flags);
+
+                        kibnal_rx(rx);
+
+                        did_something = 1;
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+                                          flags);
+                }
+
+                /* shut down and no receives to complete... */
+                if (kibnal_data.kib_shutdown &&
+                    atomic_read(&kibnal_data.kib_nconns) == 0)
+                        break;
+
+                /* nothing to do or hogging CPU */
+                if (!did_something || counter++ == IBNAL_RESCHED) {
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+                                               flags);
+                        counter = 0;
+
+                        if (!did_something) {
+                                rc = wait_event_interruptible(
+                                        kibnal_data.kib_sched_waitq,
+                                        !list_empty(&kibnal_data.kib_sched_txq) || 
+                                        !list_empty(&kibnal_data.kib_sched_rxq) || 
+                                        (kibnal_data.kib_shutdown &&
+                                         atomic_read (&kibnal_data.kib_nconns) == 0));
+                        } else {
+                                our_cond_resched();
+                        }
+
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+                                          flags);
+                }
+        }
+
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+
+        kibnal_thread_fini();
+        return (0);
+}
+
+
+/* Portals lib-level NAL interface table for the iibnal driver */
+lib_nal_t kibnal_lib = {
+        libnal_data:        &kibnal_data,      /* NAL private data */
+        libnal_send:         kibnal_send,
+        libnal_send_pages:   kibnal_send_pages,
+        libnal_recv:         kibnal_recv,
+        libnal_recv_pages:   kibnal_recv_pages,
+        libnal_dist:         kibnal_dist
+};
similarity index 100%
rename from lustre/portals/knals/scimacnal/.cvsignore
rename to lustre/portals/knals/openibnal/.cvsignore
index 48b17e9..5ed596b 100644 (file)
@@ -1,10 +1,10 @@
 .deps
 Makefile
+.*.cmd
 autoMakefile.in
 autoMakefile
 *.ko
 *.mod.c
 .*.flags
-.*.cmd
 .tmp_versions
 .depend
index 6f66143..652eb34 100644 (file)
 
 #include "openibnal.h"
 
-nal_t                   koibnal_api;
-ptl_handle_ni_t         koibnal_ni;
-koib_data_t             koibnal_data;
-koib_tunables_t         koibnal_tunables;
+nal_t                   kibnal_api;
+ptl_handle_ni_t         kibnal_ni;
+kib_data_t              kibnal_data;
+kib_tunables_t          kibnal_tunables;
 
 #ifdef CONFIG_SYSCTL
-#define OPENIBNAL_SYSCTL        202
+#define IBNAL_SYSCTL             202
 
-#define OPENIBNAL_SYSCTL_TIMEOUT     1
-#define OPENIBNAL_SYSCTL_ZERO_COPY   2
+#define IBNAL_SYSCTL_TIMEOUT     1
 
-static ctl_table koibnal_ctl_table[] = {
-        {OPENIBNAL_SYSCTL_TIMEOUT, "timeout", 
-         &koibnal_tunables.koib_io_timeout, sizeof (int),
+static ctl_table kibnal_ctl_table[] = {
+        {IBNAL_SYSCTL_TIMEOUT, "timeout", 
+         &kibnal_tunables.kib_io_timeout, sizeof (int),
          0644, NULL, &proc_dointvec},
         { 0 }
 };
 
-static ctl_table koibnal_top_ctl_table[] = {
-        {OPENIBNAL_SYSCTL, "openibnal", NULL, 0, 0555, koibnal_ctl_table},
+static ctl_table kibnal_top_ctl_table[] = {
+        {IBNAL_SYSCTL, "openibnal", NULL, 0, 0555, kibnal_ctl_table},
         { 0 }
 };
 #endif
@@ -66,167 +65,183 @@ print_service(struct ib_common_attrib_service *service, char *tag, int rc)
               "service id: "LPX64"\n"
               "name      : %s\n"
               "NID       : "LPX64"\n", tag, rc,
-              service->service_id, name, service->service_data64[0]);
+              service->service_id, name, 
+              *kibnal_service_nid_field(service));
 }
 
 void
-koibnal_service_setunset_done (tTS_IB_CLIENT_QUERY_TID tid, int status,
+kibnal_service_setunset_done (tTS_IB_CLIENT_QUERY_TID tid, int status,
                                struct ib_common_attrib_service *service, void *arg)
 {
         *(int *)arg = status;
-        up (&koibnal_data.koib_nid_signal);
+        up (&kibnal_data.kib_nid_signal);
 }
 
+#if IBNAL_CHECK_ADVERT
+void
+kibnal_check_advert (void)
+{
+        struct ib_common_attrib_service *svc;
+        __u64   tid;
+        int     rc;
+        int     rc2;
+
+        PORTAL_ALLOC(svc, sizeof(*svc));
+        if (svc == NULL)
+                return;
+
+        memset (svc, 0, sizeof (*svc));
+        kibnal_set_service_keys(svc, kibnal_data.kib_nid);
+
+        rc = ib_service_get (kibnal_data.kib_device, 
+                             kibnal_data.kib_port,
+                             svc,
+                             KIBNAL_SERVICE_KEY_MASK,
+                             kibnal_tunables.kib_io_timeout * HZ,
+                             kibnal_service_setunset_done, &rc2, 
+                             &tid);
+
+        if (rc != 0) {
+                CERROR ("Immediate error %d checking SM service\n", rc);
+        } else {
+                down (&kibnal_data.kib_nid_signal);
+                rc = rc2;
+
+                if (rc != 0)
+                        CERROR ("Error %d checking SM service\n", rc);
+        }
+
+        PORTAL_FREE(svc, sizeof(*svc));
+}
+#endif
+
 int
-koibnal_advertise (void)
+kibnal_advertise (void)
 {
+        struct ib_common_attrib_service *svc;
         __u64   tid;
         int     rc;
         int     rc2;
 
-        LASSERT (koibnal_data.koib_nid != PTL_NID_ANY);
+        LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+
+        PORTAL_ALLOC(svc, sizeof(*svc));
+        if (svc == NULL)
+                return (-ENOMEM);
 
-        memset (&koibnal_data.koib_service, 0, 
-                sizeof (koibnal_data.koib_service));
+        memset (svc, 0, sizeof (*svc));
         
-        koibnal_data.koib_service.service_id
-                = koibnal_data.koib_cm_service_id;
+        svc->service_id = kibnal_data.kib_service_id;
 
-        rc = ib_cached_gid_get(koibnal_data.koib_device,
-                               koibnal_data.koib_port,
+        rc = ib_cached_gid_get(kibnal_data.kib_device,
+                               kibnal_data.kib_port,
                                0,
-                               koibnal_data.koib_service.service_gid);
+                               svc->service_gid);
         if (rc != 0) {
                 CERROR ("Can't get port %d GID: %d\n",
-                        koibnal_data.koib_port, rc);
-                return (rc);
+                        kibnal_data.kib_port, rc);
+                goto out;
         }
         
-        rc = ib_cached_pkey_get(koibnal_data.koib_device,
-                                koibnal_data.koib_port,
+        rc = ib_cached_pkey_get(kibnal_data.kib_device,
+                                kibnal_data.kib_port,
                                 0,
-                                &koibnal_data.koib_service.service_pkey);
+                                &svc->service_pkey);
         if (rc != 0) {
                 CERROR ("Can't get port %d PKEY: %d\n",
-                        koibnal_data.koib_port, rc);
-                return (rc);
+                        kibnal_data.kib_port, rc);
+                goto out;
         }
         
-        koibnal_data.koib_service.service_lease = 0xffffffff;
+        svc->service_lease = 0xffffffff;
 
-        koibnal_set_service_keys(&koibnal_data.koib_service, koibnal_data.koib_nid);
+        kibnal_set_service_keys(svc, kibnal_data.kib_nid);
 
         CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n", 
-               koibnal_data.koib_service.service_id,
-               koibnal_data.koib_service.service_name, 
-               *koibnal_service_nid_field(&koibnal_data.koib_service));
+               svc->service_id, 
+               svc->service_name, *kibnal_service_nid_field(svc));
 
-        rc = ib_service_set (koibnal_data.koib_device,
-                             koibnal_data.koib_port,
-                             &koibnal_data.koib_service,
+        rc = ib_service_set (kibnal_data.kib_device,
+                             kibnal_data.kib_port,
+                             svc,
                              IB_SA_SERVICE_COMP_MASK_ID |
                              IB_SA_SERVICE_COMP_MASK_GID |
                              IB_SA_SERVICE_COMP_MASK_PKEY |
                              IB_SA_SERVICE_COMP_MASK_LEASE |
-                             KOIBNAL_SERVICE_KEY_MASK,
-                             koibnal_tunables.koib_io_timeout * HZ,
-                             koibnal_service_setunset_done, &rc2, &tid);
+                             KIBNAL_SERVICE_KEY_MASK,
+                             kibnal_tunables.kib_io_timeout * HZ,
+                             kibnal_service_setunset_done, &rc2, &tid);
 
-        if (rc == 0) {
-                down (&koibnal_data.koib_nid_signal);
-                rc = rc2;
+        if (rc != 0) {
+                CERROR ("Immediate error %d advertising NID "LPX64"\n",
+                        rc, kibnal_data.kib_nid);
+                goto out;
         }
-        
-        if (rc != 0)
-                CERROR ("Error %d advertising SM service\n", rc);
 
+        down (&kibnal_data.kib_nid_signal);
+
+        rc = rc2;
+        if (rc != 0)
+                CERROR ("Error %d advertising NID "LPX64"\n", 
+                        rc, kibnal_data.kib_nid);
+ out:
+        PORTAL_FREE(svc, sizeof(*svc));
         return (rc);
 }
 
-int
-koibnal_unadvertise (int expect_success)
+void
+kibnal_unadvertise (int expect_success)
 {
+        struct ib_common_attrib_service *svc;
         __u64   tid;
         int     rc;
         int     rc2;
 
-        LASSERT (koibnal_data.koib_nid != PTL_NID_ANY);
+        LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
 
-        memset (&koibnal_data.koib_service, 0,
-                sizeof (koibnal_data.koib_service));
+        PORTAL_ALLOC(svc, sizeof(*svc));
+        if (svc == NULL)
+                return;
 
-        koibnal_set_service_keys(&koibnal_data.koib_service, koibnal_data.koib_nid);
+        memset (svc, 0, sizeof(*svc));
+
+        kibnal_set_service_keys(svc, kibnal_data.kib_nid);
 
         CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n",
-               koibnal_data.koib_service.service_name,
-               *koibnal_service_nid_field(&koibnal_data.koib_service));
-
-        rc = ib_service_delete (koibnal_data.koib_device,
-                                koibnal_data.koib_port,
-                                &koibnal_data.koib_service,
-                                KOIBNAL_SERVICE_KEY_MASK,
-                                koibnal_tunables.koib_io_timeout * HZ,
-                                koibnal_service_setunset_done, &rc2, &tid);
+               svc->service_name, *kibnal_service_nid_field(svc));
+
+        rc = ib_service_delete (kibnal_data.kib_device,
+                                kibnal_data.kib_port,
+                                svc,
+                                KIBNAL_SERVICE_KEY_MASK,
+                                kibnal_tunables.kib_io_timeout * HZ,
+                                kibnal_service_setunset_done, &rc2, &tid);
         if (rc != 0) {
                 CERROR ("Immediate error %d unadvertising NID "LPX64"\n",
-                        rc, koibnal_data.koib_nid);
-                return (rc);
+                        rc, kibnal_data.kib_nid);
+                goto out;
         }
 
-        down (&koibnal_data.koib_nid_signal);
+        down (&kibnal_data.kib_nid_signal);
         
         if ((rc2 == 0) == !!expect_success)
-                return (0);
+                goto out;                       /* success: rc == 0 */
 
         if (expect_success)
                 CERROR("Error %d unadvertising NID "LPX64"\n",
-                        rc, koibnal_data.koib_nid);
+                       rc, kibnal_data.kib_nid);
         else
                 CWARN("Removed conflicting NID "LPX64"\n",
-                      koibnal_data.koib_nid);
-
-        return (rc);
-}
-
-int
-koibnal_check_advert (void)
-{
-        __u64   tid;
-        int     rc;
-        int     rc2;
-
-        static struct ib_common_attrib_service srv;
-
-        memset (&srv, 0, sizeof (srv));
-
-        koibnal_set_service_keys(&srv, koibnal_data.koib_nid);
-
-        rc = ib_service_get (koibnal_data.koib_device, 
-                             koibnal_data.koib_port,
-                             &srv,
-                             KOIBNAL_SERVICE_KEY_MASK,
-                             koibnal_tunables.koib_io_timeout * HZ,
-                             koibnal_service_setunset_done, &rc2, 
-                             &tid);
-
-        if (rc != 0) {
-                CERROR ("Immediate error %d checking SM service\n", rc);
-        } else {
-                down (&koibnal_data.koib_nid_signal);
-                rc = rc2;
-
-                if (rc != 0)
-                        CERROR ("Error %d checking SM service\n", rc);
-        }
-
-        return (rc);
+                      kibnal_data.kib_nid);
+ out:
+        PORTAL_FREE(svc, sizeof(*svc));
 }
 
 int
-koibnal_set_mynid(ptl_nid_t nid)
+kibnal_set_mynid(ptl_nid_t nid)
 {
         struct timeval tv;
-        lib_ni_t      *ni = &koibnal_lib.libnal_ni;
+        lib_ni_t      *ni = &kibnal_lib.libnal_ni;
         int            rc;
 
         CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
@@ -234,75 +249,76 @@ koibnal_set_mynid(ptl_nid_t nid)
 
         do_gettimeofday(&tv);
 
-        down (&koibnal_data.koib_nid_mutex);
+        down (&kibnal_data.kib_nid_mutex);
 
-        if (nid == koibnal_data.koib_nid) {
+        if (nid == kibnal_data.kib_nid) {
                 /* no change of NID */
-                up (&koibnal_data.koib_nid_mutex);
+                up (&kibnal_data.kib_nid_mutex);
                 return (0);
         }
 
         CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
-               koibnal_data.koib_nid, nid);
+               kibnal_data.kib_nid, nid);
         
-        if (koibnal_data.koib_nid != PTL_NID_ANY) {
+        if (kibnal_data.kib_nid != PTL_NID_ANY) {
 
-                koibnal_unadvertise (1);
+                kibnal_unadvertise (1);
 
-                rc = ib_cm_listen_stop (koibnal_data.koib_listen_handle);
+                rc = ib_cm_listen_stop (kibnal_data.kib_listen_handle);
                 if (rc != 0)
                         CERROR ("Error %d stopping listener\n", rc);
         }
         
-        koibnal_data.koib_nid = ni->ni_pid.nid = nid;
-        koibnal_data.koib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+        kibnal_data.kib_nid = ni->ni_pid.nid = nid;
+        kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
         
         /* Delete all existing peers and their connections after new
          * NID/incarnation set to ensure no old connections in our brave
          * new world. */
-        koibnal_del_peer (PTL_NID_ANY, 0);
-
-        rc = 0;
-        if (koibnal_data.koib_nid != PTL_NID_ANY) {
-                /* New NID installed */
+        kibnal_del_peer (PTL_NID_ANY, 0);
 
-                /* remove any previous advert (crashed node etc) */
-                koibnal_unadvertise(0);
+        if (kibnal_data.kib_nid == PTL_NID_ANY) {
+                /* No new NID to install */
+                up (&kibnal_data.kib_nid_mutex);
+                return (0);
+        }
+        
+        /* remove any previous advert (crashed node etc) */
+        kibnal_unadvertise(0);
 
-                /* Assign new service number */
-                koibnal_data.koib_cm_service_id = ib_cm_service_assign();
-                CDEBUG(D_NET, "service_id "LPX64"\n", koibnal_data.koib_cm_service_id);
+        /* Assign new service number */
+        kibnal_data.kib_service_id = ib_cm_service_assign();
+        CDEBUG(D_NET, "service_id "LPX64"\n", kibnal_data.kib_service_id);
         
-                rc = ib_cm_listen(koibnal_data.koib_cm_service_id,
-                                  TS_IB_CM_SERVICE_EXACT_MASK,
-                                  koibnal_passive_conn_callback, NULL,
-                                  &koibnal_data.koib_listen_handle);
-                if (rc != 0) {
-                        CERROR ("ib_cm_listen error: %d\n", rc);
-                        goto out;
+        rc = ib_cm_listen(kibnal_data.kib_service_id,
+                          TS_IB_CM_SERVICE_EXACT_MASK,
+                          kibnal_passive_conn_callback, NULL,
+                          &kibnal_data.kib_listen_handle);
+        if (rc == 0) {
+                rc = kibnal_advertise();
+                if (rc == 0) {
+#if IBNAL_CHECK_ADVERT
+                        kibnal_check_advert();
+#endif
+                        up (&kibnal_data.kib_nid_mutex);
+                        return (0);
                 }
 
-                rc = koibnal_advertise();
-
-                koibnal_check_advert();
-        }
-        
- out:
-        if (rc != 0) {
-                koibnal_data.koib_nid = PTL_NID_ANY;
+                ib_cm_listen_stop(kibnal_data.kib_listen_handle);
                 /* remove any peers that sprung up while I failed to
                  * advertise myself */
-                koibnal_del_peer (PTL_NID_ANY, 0);
+                kibnal_del_peer (PTL_NID_ANY, 0);
         }
-
-        up (&koibnal_data.koib_nid_mutex);
-        return (0);
+        
+        kibnal_data.kib_nid = PTL_NID_ANY;
+        up (&kibnal_data.kib_nid_mutex);
+        return (rc);
 }
 
-koib_peer_t *
-koibnal_create_peer (ptl_nid_t nid)
+kib_peer_t *
+kibnal_create_peer (ptl_nid_t nid)
 {
-        koib_peer_t *peer;
+        kib_peer_t *peer;
 
         LASSERT (nid != PTL_NID_ANY);
 
@@ -320,20 +336,20 @@ koibnal_create_peer (ptl_nid_t nid)
         INIT_LIST_HEAD (&peer->ibp_tx_queue);
 
         peer->ibp_reconnect_time = jiffies;
-        peer->ibp_reconnect_interval = OPENIBNAL_MIN_RECONNECT_INTERVAL;
+        peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
 
-        atomic_inc (&koibnal_data.koib_npeers);
+        atomic_inc (&kibnal_data.kib_npeers);
         return (peer);
 }
 
 void
-koibnal_destroy_peer (koib_peer_t *peer)
+kibnal_destroy_peer (kib_peer_t *peer)
 {
         CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ibp_nid, peer);
 
         LASSERT (atomic_read (&peer->ibp_refcount) == 0);
         LASSERT (peer->ibp_persistence == 0);
-        LASSERT (!koibnal_peer_active(peer));
+        LASSERT (!kibnal_peer_active(peer));
         LASSERT (peer->ibp_connecting == 0);
         LASSERT (list_empty (&peer->ibp_conns));
         LASSERT (list_empty (&peer->ibp_tx_queue));
@@ -344,11 +360,11 @@ koibnal_destroy_peer (koib_peer_t *peer)
          * they are destroyed, so we can be assured that _all_ state to do
          * with this peer has been cleaned up when its refcount drops to
          * zero. */
-        atomic_dec (&koibnal_data.koib_npeers);
+        atomic_dec (&kibnal_data.kib_npeers);
 }
 
 void
-koibnal_put_peer (koib_peer_t *peer)
+kibnal_put_peer (kib_peer_t *peer)
 {
         CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n",
                 peer, peer->ibp_nid,
@@ -358,19 +374,19 @@ koibnal_put_peer (koib_peer_t *peer)
         if (!atomic_dec_and_test (&peer->ibp_refcount))
                 return;
 
-        koibnal_destroy_peer (peer);
+        kibnal_destroy_peer (peer);
 }
 
-koib_peer_t *
-koibnal_find_peer_locked (ptl_nid_t nid)
+kib_peer_t *
+kibnal_find_peer_locked (ptl_nid_t nid)
 {
-        struct list_head *peer_list = koibnal_nid2peerlist (nid);
+        struct list_head *peer_list = kibnal_nid2peerlist (nid);
         struct list_head *tmp;
-        koib_peer_t      *peer;
+        kib_peer_t       *peer;
 
         list_for_each (tmp, peer_list) {
 
-                peer = list_entry (tmp, koib_peer_t, ibp_list);
+                peer = list_entry (tmp, kib_peer_t, ibp_list);
 
                 LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
                          peer->ibp_connecting != 0 || /* creating conns */
@@ -386,46 +402,46 @@ koibnal_find_peer_locked (ptl_nid_t nid)
         return (NULL);
 }
 
-koib_peer_t *
-koibnal_get_peer (ptl_nid_t nid)
+kib_peer_t *
+kibnal_get_peer (ptl_nid_t nid)
 {
-        koib_peer_t     *peer;
+        kib_peer_t     *peer;
 
-        read_lock (&koibnal_data.koib_global_lock);
-        peer = koibnal_find_peer_locked (nid);
+        read_lock (&kibnal_data.kib_global_lock);
+        peer = kibnal_find_peer_locked (nid);
         if (peer != NULL)                       /* +1 ref for caller? */
                 atomic_inc (&peer->ibp_refcount);
-        read_unlock (&koibnal_data.koib_global_lock);
+        read_unlock (&kibnal_data.kib_global_lock);
 
         return (peer);
 }
 
 void
-koibnal_unlink_peer_locked (koib_peer_t *peer)
+kibnal_unlink_peer_locked (kib_peer_t *peer)
 {
         LASSERT (peer->ibp_persistence == 0);
         LASSERT (list_empty(&peer->ibp_conns));
 
-        LASSERT (koibnal_peer_active(peer));
+        LASSERT (kibnal_peer_active(peer));
         list_del_init (&peer->ibp_list);
         /* lose peerlist's ref */
-        koibnal_put_peer (peer);
+        kibnal_put_peer (peer);
 }
 
 int
-koibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
+kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
 {
-        koib_peer_t       *peer;
+        kib_peer_t        *peer;
         struct list_head  *ptmp;
         int                i;
 
-        read_lock (&koibnal_data.koib_global_lock);
+        read_lock (&kibnal_data.kib_global_lock);
 
-        for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) {
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
 
-                list_for_each (ptmp, &koibnal_data.koib_peers[i]) {
+                list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
                         
-                        peer = list_entry (ptmp, koib_peer_t, ibp_list);
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
                                  peer->ibp_connecting != 0 ||
                                  !list_empty (&peer->ibp_conns));
@@ -436,53 +452,53 @@ koibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
                         *nidp = peer->ibp_nid;
                         *persistencep = peer->ibp_persistence;
                         
-                        read_unlock (&koibnal_data.koib_global_lock);
+                        read_unlock (&kibnal_data.kib_global_lock);
                         return (0);
                 }
         }
 
-        read_unlock (&koibnal_data.koib_global_lock);
+        read_unlock (&kibnal_data.kib_global_lock);
         return (-ENOENT);
 }
 
 int
-koibnal_add_persistent_peer (ptl_nid_t nid)
+kibnal_add_persistent_peer (ptl_nid_t nid)
 {
         unsigned long      flags;
-        koib_peer_t       *peer;
-        koib_peer_t       *peer2;
+        kib_peer_t        *peer;
+        kib_peer_t        *peer2;
         
         if (nid == PTL_NID_ANY)
                 return (-EINVAL);
 
-        peer = koibnal_create_peer (nid);
+        peer = kibnal_create_peer (nid);
         if (peer == NULL)
                 return (-ENOMEM);
 
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
-        peer2 = koibnal_find_peer_locked (nid);
+        peer2 = kibnal_find_peer_locked (nid);
         if (peer2 != NULL) {
-                koibnal_put_peer (peer);
+                kibnal_put_peer (peer);
                 peer = peer2;
         } else {
                 /* peer table takes existing ref on peer */
                 list_add_tail (&peer->ibp_list,
-                               koibnal_nid2peerlist (nid));
+                               kibnal_nid2peerlist (nid));
         }
 
         peer->ibp_persistence++;
         
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
         return (0);
 }
 
 void
-koibnal_del_peer_locked (koib_peer_t *peer, int single_share)
+kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
 {
         struct list_head *ctmp;
         struct list_head *cnxt;
-        koib_conn_t      *conn;
+        kib_conn_t       *conn;
 
         if (!single_share)
                 peer->ibp_persistence = 0;
@@ -493,38 +509,38 @@ koibnal_del_peer_locked (koib_peer_t *peer, int single_share)
                 return;
 
         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
-                conn = list_entry(ctmp, koib_conn_t, ibc_list);
+                conn = list_entry(ctmp, kib_conn_t, ibc_list);
 
-                koibnal_close_conn_locked (conn, 0);
+                kibnal_close_conn_locked (conn, 0);
         }
 
         /* NB peer unlinks itself when last conn is closed */
 }
 
 int
-koibnal_del_peer (ptl_nid_t nid, int single_share)
+kibnal_del_peer (ptl_nid_t nid, int single_share)
 {
         unsigned long      flags;
         struct list_head  *ptmp;
         struct list_head  *pnxt;
-        koib_peer_t      *peer;
+        kib_peer_t        *peer;
         int                lo;
         int                hi;
         int                i;
         int                rc = -ENOENT;
 
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
         if (nid != PTL_NID_ANY)
-                lo = hi = koibnal_nid2peerlist(nid) - koibnal_data.koib_peers;
+                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
         else {
                 lo = 0;
-                hi = koibnal_data.koib_peer_hash_size - 1;
+                hi = kibnal_data.kib_peer_hash_size - 1;
         }
 
         for (i = lo; i <= hi; i++) {
-                list_for_each_safe (ptmp, pnxt, &koibnal_data.koib_peers[i]) {
-                        peer = list_entry (ptmp, koib_peer_t, ibp_list);
+                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
                                  peer->ibp_connecting != 0 ||
                                  !list_empty (&peer->ibp_conns));
@@ -532,7 +548,7 @@ koibnal_del_peer (ptl_nid_t nid, int single_share)
                         if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
                                 continue;
 
-                        koibnal_del_peer_locked (peer, single_share);
+                        kibnal_del_peer_locked (peer, single_share);
                         rc = 0;         /* matched something */
 
                         if (single_share)
@@ -540,26 +556,26 @@ koibnal_del_peer (ptl_nid_t nid, int single_share)
                 }
         }
  out:
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
         return (rc);
 }
 
-koib_conn_t *
-koibnal_get_conn_by_idx (int index)
+kib_conn_t *
+kibnal_get_conn_by_idx (int index)
 {
-        koib_peer_t       *peer;
+        kib_peer_t        *peer;
         struct list_head  *ptmp;
-        koib_conn_t       *conn;
+        kib_conn_t        *conn;
         struct list_head  *ctmp;
         int                i;
 
-        read_lock (&koibnal_data.koib_global_lock);
+        read_lock (&kibnal_data.kib_global_lock);
 
-        for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) {
-                list_for_each (ptmp, &koibnal_data.koib_peers[i]) {
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+                list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
 
-                        peer = list_entry (ptmp, koib_peer_t, ibp_list);
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence > 0 ||
                                  peer->ibp_connecting != 0 ||
                                  !list_empty (&peer->ibp_conns));
@@ -568,25 +584,25 @@ koibnal_get_conn_by_idx (int index)
                                 if (index-- > 0)
                                         continue;
 
-                                conn = list_entry (ctmp, koib_conn_t, ibc_list);
+                                conn = list_entry (ctmp, kib_conn_t, ibc_list);
                                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                                        conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
                                        atomic_read (&conn->ibc_refcount));
                                 atomic_inc (&conn->ibc_refcount);
-                                read_unlock (&koibnal_data.koib_global_lock);
+                                read_unlock (&kibnal_data.kib_global_lock);
                                 return (conn);
                         }
                 }
         }
 
-        read_unlock (&koibnal_data.koib_global_lock);
+        read_unlock (&kibnal_data.kib_global_lock);
         return (NULL);
 }
 
-koib_conn_t *
-koibnal_create_conn (void)
+kib_conn_t *
+kibnal_create_conn (void)
 {
-        koib_conn_t *conn;
+        kib_conn_t  *conn;
         int          i;
         __u64        vaddr = 0;
         __u64        vaddr_base;
@@ -608,57 +624,57 @@ koibnal_create_conn (void)
         memset (conn, 0, sizeof (*conn));
 
         INIT_LIST_HEAD (&conn->ibc_tx_queue);
-        INIT_LIST_HEAD (&conn->ibc_rdma_queue);
+        INIT_LIST_HEAD (&conn->ibc_active_txs);
         spin_lock_init (&conn->ibc_lock);
         
-        atomic_inc (&koibnal_data.koib_nconns);
+        atomic_inc (&kibnal_data.kib_nconns);
         /* well not really, but I call destroy() on failure, which decrements */
 
-        PORTAL_ALLOC (conn->ibc_rxs, OPENIBNAL_RX_MSGS * sizeof (koib_rx_t));
+        PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
         if (conn->ibc_rxs == NULL)
                 goto failed;
-        memset (conn->ibc_rxs, 0, OPENIBNAL_RX_MSGS * sizeof(koib_rx_t));
+        memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
 
-        rc = koibnal_alloc_pages(&conn->ibc_rx_pages,
-                                 OPENIBNAL_RX_MSG_PAGES,
-                                 IB_ACCESS_LOCAL_WRITE);
+        rc = kibnal_alloc_pages(&conn->ibc_rx_pages,
+                                IBNAL_RX_MSG_PAGES,
+                                IB_ACCESS_LOCAL_WRITE);
         if (rc != 0)
                 goto failed;
 
-        vaddr_base = vaddr = conn->ibc_rx_pages->oibp_vaddr;
+        vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
 
-        for (i = ipage = page_offset = 0; i < OPENIBNAL_RX_MSGS; i++) {
-                struct page *page = conn->ibc_rx_pages->oibp_pages[ipage];
-                koib_rx_t   *rx = &conn->ibc_rxs[i];
+        for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
+                struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
+                kib_rx_t   *rx = &conn->ibc_rxs[i];
 
                 rx->rx_conn = conn;
                 rx->rx_vaddr = vaddr;
-                rx->rx_msg = (koib_msg_t *)(((char *)page_address(page)) + page_offset);
+                rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
                 
-                vaddr += OPENIBNAL_MSG_SIZE;
-                LASSERT (vaddr <= vaddr_base + OPENIBNAL_RX_MSG_BYTES);
+                vaddr += IBNAL_MSG_SIZE;
+                LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
                 
-                page_offset += OPENIBNAL_MSG_SIZE;
+                page_offset += IBNAL_MSG_SIZE;
                 LASSERT (page_offset <= PAGE_SIZE);
 
                 if (page_offset == PAGE_SIZE) {
                         page_offset = 0;
                         ipage++;
-                        LASSERT (ipage <= OPENIBNAL_RX_MSG_PAGES);
+                        LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
                 }
         }
 
         params.qp_create = (struct ib_qp_create_param) {
                 .limit = {
                         /* Sends have an optional RDMA */
-                        .max_outstanding_send_request    = 2 * OPENIBNAL_MSG_QUEUE_SIZE,
-                        .max_outstanding_receive_request = OPENIBNAL_MSG_QUEUE_SIZE,
+                        .max_outstanding_send_request    = 2 * IBNAL_MSG_QUEUE_SIZE,
+                        .max_outstanding_receive_request = IBNAL_MSG_QUEUE_SIZE,
                         .max_send_gather_element         = 1,
                         .max_receive_scatter_element     = 1,
                 },
-                .pd              = koibnal_data.koib_pd,
-                .send_queue      = koibnal_data.koib_tx_cq,
-                .receive_queue   = koibnal_data.koib_rx_cq,
+                .pd              = kibnal_data.kib_pd,
+                .send_queue      = kibnal_data.kib_cq,
+                .receive_queue   = kibnal_data.kib_cq,
                 .send_policy     = IB_WQ_SIGNAL_SELECTABLE,
                 .receive_policy  = IB_WQ_SIGNAL_SELECTABLE,
                 .rd_domain       = 0,
@@ -673,11 +689,11 @@ koibnal_create_conn (void)
         }
         
         /* Mark QP created */
-        conn->ibc_state = OPENIBNAL_CONN_INIT_QP;
+        conn->ibc_state = IBNAL_CONN_INIT_QP;
 
         params.qp_attr = (struct ib_qp_attribute) {
                 .state             = IB_QP_STATE_INIT,
-                .port              = koibnal_data.koib_port,
+                .port              = kibnal_data.kib_port,
                 .enable_rdma_read  = 1,
                 .enable_rdma_write = 1,
                 .valid_fields      = (IB_QP_ATTRIBUTE_STATE |
@@ -696,12 +712,12 @@ koibnal_create_conn (void)
         return (conn);
         
  failed:
-        koibnal_destroy_conn (conn);
+        kibnal_destroy_conn (conn);
         return (NULL);
 }
 
 void
-koibnal_destroy_conn (koib_conn_t *conn)
+kibnal_destroy_conn (kib_conn_t *conn)
 {
         int    rc;
         
@@ -709,21 +725,21 @@ koibnal_destroy_conn (koib_conn_t *conn)
 
         LASSERT (atomic_read (&conn->ibc_refcount) == 0);
         LASSERT (list_empty(&conn->ibc_tx_queue));
-        LASSERT (list_empty(&conn->ibc_rdma_queue));
+        LASSERT (list_empty(&conn->ibc_active_txs));
         LASSERT (conn->ibc_nsends_posted == 0);
         LASSERT (conn->ibc_connreq == NULL);
 
         switch (conn->ibc_state) {
-        case OPENIBNAL_CONN_ZOMBIE:
+        case IBNAL_CONN_ZOMBIE:
                 /* called after connection sequence initiated */
 
-        case OPENIBNAL_CONN_INIT_QP:
+        case IBNAL_CONN_INIT_QP:
                 rc = ib_qp_destroy(conn->ibc_qp);
                 if (rc != 0)
                         CERROR("Can't destroy QP: %d\n", rc);
                 /* fall through */
                 
-        case OPENIBNAL_CONN_INIT_NOTHING:
+        case IBNAL_CONN_INIT_NOTHING:
                 break;
 
         default:
@@ -731,30 +747,30 @@ koibnal_destroy_conn (koib_conn_t *conn)
         }
 
         if (conn->ibc_rx_pages != NULL) 
-                koibnal_free_pages(conn->ibc_rx_pages);
+                kibnal_free_pages(conn->ibc_rx_pages);
         
         if (conn->ibc_rxs != NULL)
                 PORTAL_FREE(conn->ibc_rxs, 
-                            OPENIBNAL_RX_MSGS * sizeof(koib_rx_t));
+                            IBNAL_RX_MSGS * sizeof(kib_rx_t));
 
         if (conn->ibc_peer != NULL)
-                koibnal_put_peer(conn->ibc_peer);
+                kibnal_put_peer(conn->ibc_peer);
 
         PORTAL_FREE(conn, sizeof (*conn));
 
-        atomic_dec(&koibnal_data.koib_nconns);
+        atomic_dec(&kibnal_data.kib_nconns);
         
-        if (atomic_read (&koibnal_data.koib_nconns) == 0 &&
-            koibnal_data.koib_shutdown) {
+        if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
+            kibnal_data.kib_shutdown) {
                 /* I just nuked the last connection on shutdown; wake up
                  * everyone so they can exit. */
-                wake_up_all(&koibnal_data.koib_sched_waitq);
-                wake_up_all(&koibnal_data.koib_connd_waitq);
+                wake_up_all(&kibnal_data.kib_sched_waitq);
+                wake_up_all(&kibnal_data.kib_connd_waitq);
         }
 }
 
 void
-koibnal_put_conn (koib_conn_t *conn)
+kibnal_put_conn (kib_conn_t *conn)
 {
         unsigned long flags;
 
@@ -767,44 +783,44 @@ koibnal_put_conn (koib_conn_t *conn)
                 return;
 
         /* last ref only goes on zombies */
-        LASSERT (conn->ibc_state == OPENIBNAL_CONN_ZOMBIE);
+        LASSERT (conn->ibc_state == IBNAL_CONN_ZOMBIE);
 
-        spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
 
-        list_add (&conn->ibc_list, &koibnal_data.koib_connd_conns);
-        wake_up (&koibnal_data.koib_connd_waitq);
+        list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+        wake_up (&kibnal_data.kib_connd_waitq);
 
-        spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 }
 
 int
-koibnal_close_peer_conns_locked (koib_peer_t *peer, int why)
+kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
 {
-        koib_conn_t        *conn;
+        kib_conn_t         *conn;
         struct list_head   *ctmp;
         struct list_head   *cnxt;
         int                 count = 0;
 
         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
-                conn = list_entry (ctmp, koib_conn_t, ibc_list);
+                conn = list_entry (ctmp, kib_conn_t, ibc_list);
 
                 count++;
-                koibnal_close_conn_locked (conn, why);
+                kibnal_close_conn_locked (conn, why);
         }
 
         return (count);
 }
 
 int
-koibnal_close_stale_conns_locked (koib_peer_t *peer, __u64 incarnation)
+kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
 {
-        koib_conn_t        *conn;
+        kib_conn_t         *conn;
         struct list_head   *ctmp;
         struct list_head   *cnxt;
         int                 count = 0;
 
         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
-                conn = list_entry (ctmp, koib_conn_t, ibc_list);
+                conn = list_entry (ctmp, kib_conn_t, ibc_list);
 
                 if (conn->ibc_incarnation == incarnation)
                         continue;
@@ -813,17 +829,17 @@ koibnal_close_stale_conns_locked (koib_peer_t *peer, __u64 incarnation)
                        peer->ibp_nid, conn->ibc_incarnation, incarnation);
                 
                 count++;
-                koibnal_close_conn_locked (conn, -ESTALE);
+                kibnal_close_conn_locked (conn, -ESTALE);
         }
 
         return (count);
 }
 
 int
-koibnal_close_matching_conns (ptl_nid_t nid)
+kibnal_close_matching_conns (ptl_nid_t nid)
 {
         unsigned long       flags;
-        koib_peer_t        *peer;
+        kib_peer_t         *peer;
         struct list_head   *ptmp;
         struct list_head   *pnxt;
         int                 lo;
@@ -831,19 +847,19 @@ koibnal_close_matching_conns (ptl_nid_t nid)
         int                 i;
         int                 count = 0;
 
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
         if (nid != PTL_NID_ANY)
-                lo = hi = koibnal_nid2peerlist(nid) - koibnal_data.koib_peers;
+                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
         else {
                 lo = 0;
-                hi = koibnal_data.koib_peer_hash_size - 1;
+                hi = kibnal_data.kib_peer_hash_size - 1;
         }
 
         for (i = lo; i <= hi; i++) {
-                list_for_each_safe (ptmp, pnxt, &koibnal_data.koib_peers[i]) {
+                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
 
-                        peer = list_entry (ptmp, koib_peer_t, ibp_list);
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
                                  peer->ibp_connecting != 0 ||
                                  !list_empty (&peer->ibp_conns));
@@ -851,11 +867,11 @@ koibnal_close_matching_conns (ptl_nid_t nid)
                         if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
                                 continue;
 
-                        count += koibnal_close_peer_conns_locked (peer, 0);
+                        count += kibnal_close_peer_conns_locked (peer, 0);
                 }
         }
 
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
         /* wildcards always succeed */
         if (nid == PTL_NID_ANY)
@@ -865,7 +881,7 @@ koibnal_close_matching_conns (ptl_nid_t nid)
 }
 
 int
-koibnal_cmd(struct portals_cfg *pcfg, void * private)
+kibnal_cmd(struct portals_cfg *pcfg, void * private)
 {
         int rc = -EINVAL;
 
@@ -876,8 +892,8 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private)
                 ptl_nid_t   nid = 0;
                 int         share_count = 0;
 
-                rc = koibnal_get_peer_info(pcfg->pcfg_count,
-                                           &nid, &share_count);
+                rc = kibnal_get_peer_info(pcfg->pcfg_count,
+                                          &nid, &share_count);
                 pcfg->pcfg_nid   = nid;
                 pcfg->pcfg_size  = 0;
                 pcfg->pcfg_id    = 0;
@@ -887,17 +903,17 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private)
                 break;
         }
         case NAL_CMD_ADD_PEER: {
-                rc = koibnal_add_persistent_peer (pcfg->pcfg_nid);
+                rc = kibnal_add_persistent_peer (pcfg->pcfg_nid);
                 break;
         }
         case NAL_CMD_DEL_PEER: {
-                rc = koibnal_del_peer (pcfg->pcfg_nid, 
+                rc = kibnal_del_peer (pcfg->pcfg_nid, 
                                        /* flags == single_share */
                                        pcfg->pcfg_flags != 0);
                 break;
         }
         case NAL_CMD_GET_CONN: {
-                koib_conn_t *conn = koibnal_get_conn_by_idx (pcfg->pcfg_count);
+                kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
 
                 if (conn == NULL)
                         rc = -ENOENT;
@@ -907,19 +923,19 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private)
                         pcfg->pcfg_id    = 0;
                         pcfg->pcfg_misc  = 0;
                         pcfg->pcfg_flags = 0;
-                        koibnal_put_conn (conn);
+                        kibnal_put_conn (conn);
                 }
                 break;
         }
         case NAL_CMD_CLOSE_CONNECTION: {
-                rc = koibnal_close_matching_conns (pcfg->pcfg_nid);
+                rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
                 break;
         }
         case NAL_CMD_REGISTER_MYNID: {
                 if (pcfg->pcfg_nid == PTL_NID_ANY)
                         rc = -EINVAL;
                 else
-                        rc = koibnal_set_mynid (pcfg->pcfg_nid);
+                        rc = kibnal_set_mynid (pcfg->pcfg_nid);
                 break;
         }
         }
@@ -928,47 +944,47 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private)
 }
 
 void
-koibnal_free_pages (koib_pages_t *p)
+kibnal_free_pages (kib_pages_t *p)
 {
-        int     npages = p->oibp_npages;
+        int     npages = p->ibp_npages;
         int     rc;
         int     i;
         
-        if (p->oibp_mapped) {
-                rc = ib_memory_deregister(p->oibp_handle);
+        if (p->ibp_mapped) {
+                rc = ib_memory_deregister(p->ibp_handle);
                 if (rc != 0)
                         CERROR ("Deregister error: %d\n", rc);
         }
         
         for (i = 0; i < npages; i++)
-                if (p->oibp_pages[i] != NULL)
-                        __free_page(p->oibp_pages[i]);
+                if (p->ibp_pages[i] != NULL)
+                        __free_page(p->ibp_pages[i]);
         
-        PORTAL_FREE (p, offsetof(koib_pages_t, oibp_pages[npages]));
+        PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
 }
 
 int
-koibnal_alloc_pages (koib_pages_t **pp, int npages, int access)
+kibnal_alloc_pages (kib_pages_t **pp, int npages, int access)
 {
-        koib_pages_t               *p;
+        kib_pages_t                *p;
         struct ib_physical_buffer  *phys_pages;
         int                         i;
         int                         rc;
 
-        PORTAL_ALLOC(p, offsetof(koib_pages_t, oibp_pages[npages]));
+        PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
         if (p == NULL) {
                 CERROR ("Can't allocate buffer %d\n", npages);
                 return (-ENOMEM);
         }
 
-        memset (p, 0, offsetof(koib_pages_t, oibp_pages[npages]));
-        p->oibp_npages = npages;
+        memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
+        p->ibp_npages = npages;
         
         for (i = 0; i < npages; i++) {
-                p->oibp_pages[i] = alloc_page (GFP_KERNEL);
-                if (p->oibp_pages[i] == NULL) {
+                p->ibp_pages[i] = alloc_page (GFP_KERNEL);
+                if (p->ibp_pages[i] == NULL) {
                         CERROR ("Can't allocate page %d of %d\n", i, npages);
-                        koibnal_free_pages(p);
+                        kibnal_free_pages(p);
                         return (-ENOMEM);
                 }
         }
@@ -976,96 +992,96 @@ koibnal_alloc_pages (koib_pages_t **pp, int npages, int access)
         PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
         if (phys_pages == NULL) {
                 CERROR ("Can't allocate physarray for %d pages\n", npages);
-                koibnal_free_pages(p);
+                kibnal_free_pages(p);
                 return (-ENOMEM);
         }
 
         for (i = 0; i < npages; i++) {
                 phys_pages[i].size = PAGE_SIZE;
                 phys_pages[i].address =
-                        koibnal_page2phys(p->oibp_pages[i]);
+                        kibnal_page2phys(p->ibp_pages[i]);
         }
 
-        p->oibp_vaddr = 0;
-        rc = ib_memory_register_physical(koibnal_data.koib_pd,
+        p->ibp_vaddr = 0;
+        rc = ib_memory_register_physical(kibnal_data.kib_pd,
                                          phys_pages, npages,
-                                         &p->oibp_vaddr,
+                                         &p->ibp_vaddr,
                                          npages * PAGE_SIZE, 0,
                                          access,
-                                         &p->oibp_handle,
-                                         &p->oibp_lkey,
-                                         &p->oibp_rkey);
+                                         &p->ibp_handle,
+                                         &p->ibp_lkey,
+                                         &p->ibp_rkey);
         
         PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
         
         if (rc != 0) {
                 CERROR ("Error %d mapping %d pages\n", rc, npages);
-                koibnal_free_pages(p);
+                kibnal_free_pages(p);
                 return (rc);
         }
         
-        p->oibp_mapped = 1;
+        p->ibp_mapped = 1;
         *pp = p;
         return (0);
 }
 
 int
-koibnal_setup_tx_descs (void)
+kibnal_setup_tx_descs (void)
 {
         int           ipage = 0;
         int           page_offset = 0;
         __u64         vaddr;
         __u64         vaddr_base;
         struct page  *page;
-        koib_tx_t    *tx;
+        kib_tx_t     *tx;
         int           i;
         int           rc;
 
         /* pre-mapped messages are not bigger than 1 page */
-        LASSERT (OPENIBNAL_MSG_SIZE <= PAGE_SIZE);
+        LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
 
         /* No fancy arithmetic when we do the buffer calculations */
-        LASSERT (PAGE_SIZE % OPENIBNAL_MSG_SIZE == 0);
+        LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
 
-        rc = koibnal_alloc_pages(&koibnal_data.koib_tx_pages,
-                                 OPENIBNAL_TX_MSG_PAGES, 
-                                 0);            /* local read access only */
+        rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
+                                IBNAL_TX_MSG_PAGES, 
+                                0);            /* local read access only */
         if (rc != 0)
                 return (rc);
 
-        vaddr = vaddr_base = koibnal_data.koib_tx_pages->oibp_vaddr;
+        vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
 
-        for (i = 0; i < OPENIBNAL_TX_MSGS; i++) {
-                page = koibnal_data.koib_tx_pages->oibp_pages[ipage];
-                tx = &koibnal_data.koib_tx_descs[i];
+        for (i = 0; i < IBNAL_TX_MSGS; i++) {
+                page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
+                tx = &kibnal_data.kib_tx_descs[i];
 
                 memset (tx, 0, sizeof(*tx));    /* zero flags etc */
                 
-                tx->tx_msg = (koib_msg_t *)(((char *)page_address(page)) + page_offset);
+                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
                 tx->tx_vaddr = vaddr;
-                tx->tx_isnblk = (i >= OPENIBNAL_NTX);
-                tx->tx_mapped = KOIB_TX_UNMAPPED;
+                tx->tx_isnblk = (i >= IBNAL_NTX);
+                tx->tx_mapped = KIB_TX_UNMAPPED;
 
                 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", 
                        i, tx, tx->tx_msg, tx->tx_vaddr);
 
                 if (tx->tx_isnblk)
                         list_add (&tx->tx_list, 
-                                  &koibnal_data.koib_idle_nblk_txs);
+                                  &kibnal_data.kib_idle_nblk_txs);
                 else
                         list_add (&tx->tx_list, 
-                                  &koibnal_data.koib_idle_txs);
+                                  &kibnal_data.kib_idle_txs);
 
-                vaddr += OPENIBNAL_MSG_SIZE;
-                LASSERT (vaddr <= vaddr_base + OPENIBNAL_TX_MSG_BYTES);
+                vaddr += IBNAL_MSG_SIZE;
+                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
 
-                page_offset += OPENIBNAL_MSG_SIZE;
+                page_offset += IBNAL_MSG_SIZE;
                 LASSERT (page_offset <= PAGE_SIZE);
 
                 if (page_offset == PAGE_SIZE) {
                         page_offset = 0;
                         ipage++;
-                        LASSERT (ipage <= OPENIBNAL_TX_MSG_PAGES);
+                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
                 }
         }
         
@@ -1073,7 +1089,7 @@ koibnal_setup_tx_descs (void)
 }
 
 void
-koibnal_api_shutdown (nal_t *nal)
+kibnal_api_shutdown (nal_t *nal)
 {
         int   i;
         int   rc;
@@ -1087,119 +1103,113 @@ koibnal_api_shutdown (nal_t *nal)
         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
                atomic_read (&portal_kmemory));
 
-        LASSERT(nal == &koibnal_api);
+        LASSERT(nal == &kibnal_api);
 
-        switch (koibnal_data.koib_init) {
+        switch (kibnal_data.kib_init) {
         default:
-                CERROR ("Unexpected state %d\n", koibnal_data.koib_init);
+                CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
                 LBUG();
 
-        case OPENIBNAL_INIT_ALL:
+        case IBNAL_INIT_ALL:
                 /* stop calls to nal_cmd */
                 libcfs_nal_cmd_unregister(OPENIBNAL);
                 /* No new peers */
 
                 /* resetting my NID to unadvertises me, removes my
                  * listener and nukes all current peers */
-                koibnal_set_mynid (PTL_NID_ANY);
+                kibnal_set_mynid (PTL_NID_ANY);
 
                 /* Wait for all peer state to clean up */
                 i = 2;
-                while (atomic_read (&koibnal_data.koib_npeers) != 0) {
+                while (atomic_read (&kibnal_data.kib_npeers) != 0) {
                         i++;
                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                                "waiting for %d peers to close down\n",
-                               atomic_read (&koibnal_data.koib_npeers));
+                               atomic_read (&kibnal_data.kib_npeers));
                         set_current_state (TASK_INTERRUPTIBLE);
                         schedule_timeout (HZ);
                 }
                 /* fall through */
 
-        case OPENIBNAL_INIT_TX_CQ:
-                rc = ib_cq_destroy (koibnal_data.koib_tx_cq);
-                if (rc != 0)
-                        CERROR ("Destroy tx CQ error: %d\n", rc);
-                /* fall through */
-
-        case OPENIBNAL_INIT_RX_CQ:
-                rc = ib_cq_destroy (koibnal_data.koib_rx_cq);
+        case IBNAL_INIT_CQ:
+                rc = ib_cq_destroy (kibnal_data.kib_cq);
                 if (rc != 0)
-                        CERROR ("Destroy rx CQ error: %d\n", rc);
+                        CERROR ("Destroy CQ error: %d\n", rc);
                 /* fall through */
 
-        case OPENIBNAL_INIT_TXD:
-                koibnal_free_pages (koibnal_data.koib_tx_pages);
+        case IBNAL_INIT_TXD:
+                kibnal_free_pages (kibnal_data.kib_tx_pages);
                 /* fall through */
-#if OPENIBNAL_FMR
-        case OPENIBNAL_INIT_FMR:
-                rc = ib_fmr_pool_destroy (koibnal_data.koib_fmr_pool);
+#if IBNAL_FMR
+        case IBNAL_INIT_FMR:
+                rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
                 if (rc != 0)
                         CERROR ("Destroy FMR pool error: %d\n", rc);
                 /* fall through */
 #endif
-        case OPENIBNAL_INIT_PD:
-                rc = ib_pd_destroy(koibnal_data.koib_pd);
+        case IBNAL_INIT_PD:
+                rc = ib_pd_destroy(kibnal_data.kib_pd);
                 if (rc != 0)
                         CERROR ("Destroy PD error: %d\n", rc);
                 /* fall through */
 
-        case OPENIBNAL_INIT_LIB:
-                lib_fini(&koibnal_lib);
+        case IBNAL_INIT_LIB:
+                lib_fini(&kibnal_lib);
                 /* fall through */
 
-        case OPENIBNAL_INIT_DATA:
+        case IBNAL_INIT_DATA:
                 /* Module refcount only gets to zero when all peers
                  * have been closed so all lists must be empty */
-                LASSERT (atomic_read (&koibnal_data.koib_npeers) == 0);
-                LASSERT (koibnal_data.koib_peers != NULL);
-                for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) {
-                        LASSERT (list_empty (&koibnal_data.koib_peers[i]));
+                LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
+                LASSERT (kibnal_data.kib_peers != NULL);
+                for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+                        LASSERT (list_empty (&kibnal_data.kib_peers[i]));
                 }
-                LASSERT (atomic_read (&koibnal_data.koib_nconns) == 0);
-                LASSERT (list_empty (&koibnal_data.koib_sched_rxq));
-                LASSERT (list_empty (&koibnal_data.koib_sched_txq));
-                LASSERT (list_empty (&koibnal_data.koib_connd_conns));
-                LASSERT (list_empty (&koibnal_data.koib_connd_peers));
+                LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
+                LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
+                LASSERT (list_empty (&kibnal_data.kib_sched_txq));
+                LASSERT (list_empty (&kibnal_data.kib_connd_conns));
+                LASSERT (list_empty (&kibnal_data.kib_connd_peers));
 
                 /* flag threads to terminate; wake and wait for them to die */
-                koibnal_data.koib_shutdown = 1;
-                wake_up_all (&koibnal_data.koib_sched_waitq);
-                wake_up_all (&koibnal_data.koib_connd_waitq);
+                kibnal_data.kib_shutdown = 1;
+                wake_up_all (&kibnal_data.kib_sched_waitq);
+                wake_up_all (&kibnal_data.kib_connd_waitq);
 
                 i = 2;
-                while (atomic_read (&koibnal_data.koib_nthreads) != 0) {
+                while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
                         i++;
                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                                "Waiting for %d threads to terminate\n",
-                               atomic_read (&koibnal_data.koib_nthreads));
+                               atomic_read (&kibnal_data.kib_nthreads));
                         set_current_state (TASK_INTERRUPTIBLE);
                         schedule_timeout (HZ);
                 }
                 /* fall through */
                 
-        case OPENIBNAL_INIT_NOTHING:
+        case IBNAL_INIT_NOTHING:
                 break;
         }
 
-        if (koibnal_data.koib_tx_descs != NULL)
-                PORTAL_FREE (koibnal_data.koib_tx_descs,
-                             OPENIBNAL_TX_MSGS * sizeof(koib_tx_t));
+        if (kibnal_data.kib_tx_descs != NULL)
+                PORTAL_FREE (kibnal_data.kib_tx_descs,
+                             IBNAL_TX_MSGS * sizeof(kib_tx_t));
 
-        if (koibnal_data.koib_peers != NULL)
-                PORTAL_FREE (koibnal_data.koib_peers,
+        if (kibnal_data.kib_peers != NULL)
+                PORTAL_FREE (kibnal_data.kib_peers,
                              sizeof (struct list_head) * 
-                             koibnal_data.koib_peer_hash_size);
+                             kibnal_data.kib_peer_hash_size);
 
         CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
                atomic_read (&portal_kmemory));
         printk(KERN_INFO "Lustre: OpenIB NAL unloaded (final mem %d)\n",
                atomic_read(&portal_kmemory));
 
-        koibnal_data.koib_init = OPENIBNAL_INIT_NOTHING;
+        kibnal_data.kib_init = IBNAL_INIT_NOTHING;
 }
 
 int
-koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
+kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                      ptl_ni_limits_t *requested_limits,
                      ptl_ni_limits_t *actual_limits)
 {
@@ -1208,65 +1218,66 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         int               rc;
         int               i;
 
-        LASSERT (nal == &koibnal_api);
+        LASSERT (nal == &kibnal_api);
 
         if (nal->nal_refct != 0) {
                 if (actual_limits != NULL)
-                        *actual_limits = koibnal_lib.libnal_ni.ni_actual_limits;
+                        *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
                 /* This module got the first ref */
                 PORTAL_MODULE_USE;
                 return (PTL_OK);
         }
 
-        LASSERT (koibnal_data.koib_init == OPENIBNAL_INIT_NOTHING);
+        LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
 
-        memset (&koibnal_data, 0, sizeof (koibnal_data)); /* zero pointers, flags etc */
+        memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */
 
-        init_MUTEX (&koibnal_data.koib_nid_mutex);
-        init_MUTEX_LOCKED (&koibnal_data.koib_nid_signal);
-        koibnal_data.koib_nid = PTL_NID_ANY;
+        init_MUTEX (&kibnal_data.kib_nid_mutex);
+        init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal);
+        kibnal_data.kib_nid = PTL_NID_ANY;
 
-        rwlock_init(&koibnal_data.koib_global_lock);
+        rwlock_init(&kibnal_data.kib_global_lock);
 
-        koibnal_data.koib_peer_hash_size = OPENIBNAL_PEER_HASH_SIZE;
-        PORTAL_ALLOC (koibnal_data.koib_peers,
-                      sizeof (struct list_head) * koibnal_data.koib_peer_hash_size);
-        if (koibnal_data.koib_peers == NULL) {
+        kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
+        PORTAL_ALLOC (kibnal_data.kib_peers,
+                      sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
+        if (kibnal_data.kib_peers == NULL) {
                 goto failed;
         }
-        for (i = 0; i < koibnal_data.koib_peer_hash_size; i++)
-                INIT_LIST_HEAD(&koibnal_data.koib_peers[i]);
-
-        spin_lock_init (&koibnal_data.koib_connd_lock);
-        INIT_LIST_HEAD (&koibnal_data.koib_connd_peers);
-        INIT_LIST_HEAD (&koibnal_data.koib_connd_conns);
-        init_waitqueue_head (&koibnal_data.koib_connd_waitq);
-
-        spin_lock_init (&koibnal_data.koib_sched_lock);
-        INIT_LIST_HEAD (&koibnal_data.koib_sched_txq);
-        INIT_LIST_HEAD (&koibnal_data.koib_sched_rxq);
-        init_waitqueue_head (&koibnal_data.koib_sched_waitq);
-
-        spin_lock_init (&koibnal_data.koib_tx_lock);
-        INIT_LIST_HEAD (&koibnal_data.koib_idle_txs);
-        INIT_LIST_HEAD (&koibnal_data.koib_idle_nblk_txs);
-        init_waitqueue_head(&koibnal_data.koib_idle_tx_waitq);
-
-        PORTAL_ALLOC (koibnal_data.koib_tx_descs,
-                      OPENIBNAL_TX_MSGS * sizeof(koib_tx_t));
-        if (koibnal_data.koib_tx_descs == NULL) {
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
+                INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
+
+        spin_lock_init (&kibnal_data.kib_connd_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
+        INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
+        init_waitqueue_head (&kibnal_data.kib_connd_waitq);
+
+        spin_lock_init (&kibnal_data.kib_sched_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
+        INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
+        init_waitqueue_head (&kibnal_data.kib_sched_waitq);
+
+        spin_lock_init (&kibnal_data.kib_tx_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
+        INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
+        init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
+
+        PORTAL_ALLOC (kibnal_data.kib_tx_descs,
+                      IBNAL_TX_MSGS * sizeof(kib_tx_t));
+        if (kibnal_data.kib_tx_descs == NULL) {
                 CERROR ("Can't allocate tx descs\n");
                 goto failed;
         }
 
         /* lists/ptrs/locks initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_DATA;
+        kibnal_data.kib_init = IBNAL_INIT_DATA;
         /*****************************************************/
 
+
         process_id.pid = requested_pid;
-        process_id.nid = koibnal_data.koib_nid;
+        process_id.nid = kibnal_data.kib_nid;
         
-        rc = lib_init(&koibnal_lib, nal, process_id,
+        rc = lib_init(&kibnal_lib, nal, process_id,
                       requested_limits, actual_limits);
         if (rc != PTL_OK) {
                 CERROR("lib_init failed: error %d\n", rc);
@@ -1274,11 +1285,11 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         }
 
         /* lib interface initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_LIB;
+        kibnal_data.kib_init = IBNAL_INIT_LIB;
         /*****************************************************/
 
-        for (i = 0; i < OPENIBNAL_N_SCHED; i++) {
-                rc = koibnal_thread_start (koibnal_scheduler, (void *)i);
+        for (i = 0; i < IBNAL_N_SCHED; i++) {
+                rc = kibnal_thread_start (kibnal_scheduler, (void *)i);
                 if (rc != 0) {
                         CERROR("Can't spawn openibnal scheduler[%d]: %d\n",
                                i, rc);
@@ -1286,56 +1297,56 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                 }
         }
 
-        rc = koibnal_thread_start (koibnal_connd, NULL);
+        rc = kibnal_thread_start (kibnal_connd, NULL);
         if (rc != 0) {
                 CERROR ("Can't spawn openibnal connd: %d\n", rc);
                 goto failed;
         }
 
-        koibnal_data.koib_device = ib_device_get_by_index(0);
-        if (koibnal_data.koib_device == NULL) {
+        kibnal_data.kib_device = ib_device_get_by_index(0);
+        if (kibnal_data.kib_device == NULL) {
                 CERROR ("Can't open ib device 0\n");
                 goto failed;
         }
         
-        rc = ib_device_properties_get(koibnal_data.koib_device,
-                                      &koibnal_data.koib_device_props);
+        rc = ib_device_properties_get(kibnal_data.kib_device,
+                                      &kibnal_data.kib_device_props);
         if (rc != 0) {
                 CERROR ("Can't get device props: %d\n", rc);
                 goto failed;
         }
 
         CDEBUG(D_NET, "Max Initiator: %d Max Responder %d\n", 
-               koibnal_data.koib_device_props.max_initiator_per_qp,
-               koibnal_data.koib_device_props.max_responder_per_qp);
+               kibnal_data.kib_device_props.max_initiator_per_qp,
+               kibnal_data.kib_device_props.max_responder_per_qp);
 
-        koibnal_data.koib_port = 0;
+        kibnal_data.kib_port = 0;
         for (i = 1; i <= 2; i++) {
-                rc = ib_port_properties_get(koibnal_data.koib_device, i,
-                                            &koibnal_data.koib_port_props);
+                rc = ib_port_properties_get(kibnal_data.kib_device, i,
+                                            &kibnal_data.kib_port_props);
                 if (rc == 0) {
-                        koibnal_data.koib_port = i;
+                        kibnal_data.kib_port = i;
                         break;
                 }
         }
-        if (koibnal_data.koib_port == 0) {
+        if (kibnal_data.kib_port == 0) {
                 CERROR ("Can't find a port\n");
                 goto failed;
         }
 
-        rc = ib_pd_create(koibnal_data.koib_device,
-                          NULL, &koibnal_data.koib_pd);
+        rc = ib_pd_create(kibnal_data.kib_device,
+                          NULL, &kibnal_data.kib_pd);
         if (rc != 0) {
                 CERROR ("Can't create PD: %d\n", rc);
                 goto failed;
         }
         
         /* flag PD initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_PD;
+        kibnal_data.kib_init = IBNAL_INIT_PD;
         /*****************************************************/
-#if OPENIBNAL_FMR
+#if IBNAL_FMR
         {
-                const int pool_size = OPENIBNAL_NTX + OPENIBNAL_NTX_NBLK;
+                const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
                 struct ib_fmr_pool_param params = {
                         .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
                         .access            = (IB_ACCESS_LOCAL_WRITE |
@@ -1347,8 +1358,8 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                         .flush_arg         = NULL,
                         .cache             = 1,
                 };
-                rc = ib_fmr_pool_create(koibnal_data.koib_pd, &params,
-                                        &koibnal_data.koib_fmr_pool);
+                rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params,
+                                        &kibnal_data.kib_fmr_pool);
                 if (rc != 0) {
                         CERROR ("Can't create FMR pool size %d: %d\n", 
                                 pool_size, rc);
@@ -1357,84 +1368,56 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         }
 
         /* flag FMR pool initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_FMR;
+        kibnal_data.kib_init = IBNAL_INIT_FMR;
 #endif
         /*****************************************************/
 
-        rc = koibnal_setup_tx_descs();
+        rc = kibnal_setup_tx_descs();
         if (rc != 0) {
                 CERROR ("Can't register tx descs: %d\n", rc);
                 goto failed;
         }
         
         /* flag TX descs initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_TXD;
+        kibnal_data.kib_init = IBNAL_INIT_TXD;
         /*****************************************************/
         
         {
                 struct ib_cq_callback callback = {
-                        .context        = OPENIBNAL_CALLBACK_CTXT,
+                        .context        = IBNAL_CALLBACK_CTXT,
                         .policy         = IB_CQ_PROVIDER_REARM,
                         .function       = {
-                                .entry  = koibnal_rx_callback,
+                                .entry  = kibnal_callback,
                         },
                         .arg            = NULL,
                 };
-                int  nentries = OPENIBNAL_RX_CQ_ENTRIES;
+                int  nentries = IBNAL_CQ_ENTRIES;
                 
-                rc = ib_cq_create (koibnal_data.koib_device, 
+                rc = ib_cq_create (kibnal_data.kib_device, 
                                    &nentries, &callback, NULL,
-                                   &koibnal_data.koib_rx_cq);
+                                   &kibnal_data.kib_cq);
                 if (rc != 0) {
-                        CERROR ("Can't create RX CQ: %d\n", rc);
+                        CERROR ("Can't create CQ: %d\n", rc);
                         goto failed;
                 }
 
                 /* I only want solicited events */
-                rc = ib_cq_request_notification(koibnal_data.koib_rx_cq, 1);
+                rc = ib_cq_request_notification(kibnal_data.kib_cq, 1);
                 LASSERT (rc == 0);
         }
         
-        /* flag RX CQ initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_RX_CQ;
-        /*****************************************************/
-
-        {
-                struct ib_cq_callback callback = {
-                        .context        = OPENIBNAL_CALLBACK_CTXT,
-                        .policy         = IB_CQ_PROVIDER_REARM,
-                        .function       = {
-                                .entry  = koibnal_tx_callback,
-                        },
-                        .arg            = NULL,
-                };
-                int  nentries = OPENIBNAL_TX_CQ_ENTRIES;
-                
-                rc = ib_cq_create (koibnal_data.koib_device, 
-                                   &nentries, &callback, NULL,
-                                   &koibnal_data.koib_tx_cq);
-                if (rc != 0) {
-                        CERROR ("Can't create RX CQ: %d\n", rc);
-                        goto failed;
-                }
-
-                /* I only want solicited events */
-                rc = ib_cq_request_notification(koibnal_data.koib_tx_cq, 1);
-                LASSERT (rc == 0);
-        }
-                                   
-        /* flag TX CQ initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_TX_CQ;
+        /* flag CQ initialised */
+        kibnal_data.kib_init = IBNAL_INIT_CQ;
         /*****************************************************/
         
-        rc = libcfs_nal_cmd_register(OPENIBNAL, &koibnal_cmd, NULL);
+        rc = libcfs_nal_cmd_register(OPENIBNAL, &kibnal_cmd, NULL);
         if (rc != 0) {
                 CERROR ("Can't initialise command interface (rc = %d)\n", rc);
                 goto failed;
         }
 
         /* flag everything initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_ALL;
+        kibnal_data.kib_init = IBNAL_INIT_ALL;
         /*****************************************************/
 
         printk(KERN_INFO "Lustre: OpenIB NAL loaded "
@@ -1443,44 +1426,44 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         return (PTL_OK);
 
  failed:
-        koibnal_api_shutdown (&koibnal_api);    
+        kibnal_api_shutdown (&kibnal_api);    
         return (PTL_FAIL);
 }
 
 void __exit
-koibnal_module_fini (void)
+kibnal_module_fini (void)
 {
 #ifdef CONFIG_SYSCTL
-        if (koibnal_tunables.koib_sysctl != NULL)
-                unregister_sysctl_table (koibnal_tunables.koib_sysctl);
+        if (kibnal_tunables.kib_sysctl != NULL)
+                unregister_sysctl_table (kibnal_tunables.kib_sysctl);
 #endif
-        PtlNIFini(koibnal_ni);
+        PtlNIFini(kibnal_ni);
 
         ptl_unregister_nal(OPENIBNAL);
 }
 
 int __init
-koibnal_module_init (void)
+kibnal_module_init (void)
 {
         int    rc;
 
         /* the following must be sizeof(int) for proc_dointvec() */
-        LASSERT(sizeof (koibnal_tunables.koib_io_timeout) == sizeof (int));
+        LASSERT(sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int));
 
-        koibnal_api.nal_ni_init = koibnal_api_startup;
-        koibnal_api.nal_ni_fini = koibnal_api_shutdown;
+        kibnal_api.nal_ni_init = kibnal_api_startup;
+        kibnal_api.nal_ni_fini = kibnal_api_shutdown;
 
         /* Initialise dynamic tunables to defaults once only */
-        koibnal_tunables.koib_io_timeout = OPENIBNAL_IO_TIMEOUT;
+        kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
 
-        rc = ptl_register_nal(OPENIBNAL, &koibnal_api);
+        rc = ptl_register_nal(OPENIBNAL, &kibnal_api);
         if (rc != PTL_OK) {
-                CERROR("Can't register OPENIBNAL: %d\n", rc);
+                CERROR("Can't register IBNAL: %d\n", rc);
                 return (-ENOMEM);               /* or something... */
         }
 
         /* Pure gateways want the NAL started up at module load time... */
-        rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &koibnal_ni);
+        rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
         if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
                 ptl_unregister_nal(OPENIBNAL);
                 return (-ENODEV);
@@ -1488,8 +1471,8 @@ koibnal_module_init (void)
         
 #ifdef CONFIG_SYSCTL
         /* Press on regardless even if registering sysctl doesn't work */
-        koibnal_tunables.koib_sysctl = 
-                register_sysctl_table (koibnal_top_ctl_table, 0);
+        kibnal_tunables.kib_sysctl = 
+                register_sysctl_table (kibnal_top_ctl_table, 0);
 #endif
         return (0);
 }
@@ -1498,6 +1481,6 @@ MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
 MODULE_DESCRIPTION("Kernel OpenIB NAL v0.01");
 MODULE_LICENSE("GPL");
 
-module_init(koibnal_module_init);
-module_exit(koibnal_module_fini);
+module_init(kibnal_module_init);
+module_exit(kibnal_module_fini);
 
index 301d3ae..f0610f2 100644 (file)
@@ -48,7 +48,7 @@
 #include <linux/kmod.h>
 #include <linux/sysctl.h>
 
-#define DEBUG_SUBSYSTEM S_OPENIBNAL
+#define DEBUG_SUBSYSTEM S_IBNAL
 
 #include <linux/kp30.h>
 #include <portals/p30.h>
 #include <ts_ib_cm.h>
 #include <ts_ib_sa_client.h>
 
-#define OPENIBNAL_SERVICE_NAME   "openibnal"
+#define IBNAL_SERVICE_NAME   "openibnal"
 
 #if CONFIG_SMP
-# define OPENIBNAL_N_SCHED      num_online_cpus() /* # schedulers */
+# define IBNAL_N_SCHED      num_online_cpus()   /* # schedulers */
 #else
-# define OPENIBNAL_N_SCHED      1                 /* # schedulers */
+# define IBNAL_N_SCHED      1                   /* # schedulers */
 #endif
 
-#define OPENIBNAL_MIN_RECONNECT_INTERVAL HZ       /* first failed connection retry... */
-#define OPENIBNAL_MAX_RECONNECT_INTERVAL (60*HZ)  /* ...exponentially increasing to this */
+#define IBNAL_MIN_RECONNECT_INTERVAL HZ         /* first failed connection retry... */
+#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ)    /* ...exponentially increasing to this */
 
-#define OPENIBNAL_MSG_SIZE       (4<<10)          /* max size of queued messages (inc hdr) */
+#define IBNAL_MSG_SIZE       (4<<10)            /* max size of queued messages (inc hdr) */
 
-#define OPENIBNAL_MSG_QUEUE_SIZE   8              /* # messages in-flight */
-#define OPENIBNAL_CREDIT_HIGHWATER 6              /* when to eagerly return credits */
-#define OPENIBNAL_RETRY            7              /* # times to retry */
-#define OPENIBNAL_RNR_RETRY        7              /*  */
-#define OPENIBNAL_CM_RETRY         7              /* # times to retry connection */
-#define OPENIBNAL_FLOW_CONTROL     1
-#define OPENIBNAL_RESPONDER_RESOURCES 8
+#define IBNAL_MSG_QUEUE_SIZE   8                /* # messages/RDMAs in-flight */
+#define IBNAL_CREDIT_HIGHWATER 6                /* when to eagerly return credits */
+#define IBNAL_RETRY            7                /* # times to retry */
+#define IBNAL_RNR_RETRY        7                /*  */
+#define IBNAL_CM_RETRY         7                /* # times to retry connection */
+#define IBNAL_FLOW_CONTROL     1
+#define IBNAL_RESPONDER_RESOURCES 8
 
-#define OPENIBNAL_NTX             64              /* # tx descs */
-#define OPENIBNAL_NTX_NBLK        256             /* # reserved tx descs */
+#define IBNAL_NTX             64                /* # tx descs */
+#define IBNAL_NTX_NBLK        256               /* # reserved tx descs */
 
-#define OPENIBNAL_PEER_HASH_SIZE  101             /* # peer lists */
+#define IBNAL_PEER_HASH_SIZE  101               /* # peer lists */
 
-#define OPENIBNAL_RESCHED         100             /* # scheduler loops before reschedule */
+#define IBNAL_RESCHED         100               /* # scheduler loops before reschedule */
 
-#define OPENIBNAL_CONCURRENT_PEERS 1000           /* # nodes all talking at once to me */
+#define IBNAL_CONCURRENT_PEERS 1000             /* # nodes all talking at once to me */
 
 /* default vals for runtime tunables */
-#define OPENIBNAL_IO_TIMEOUT      50              /* default comms timeout (seconds) */
+#define IBNAL_IO_TIMEOUT      50                /* default comms timeout (seconds) */
 
 /************************/
 /* derived constants... */
 
 /* TX messages (shared by all connections) */
-#define OPENIBNAL_TX_MSGS       (OPENIBNAL_NTX + OPENIBNAL_NTX_NBLK)
-#define OPENIBNAL_TX_MSG_BYTES  (OPENIBNAL_TX_MSGS * OPENIBNAL_MSG_SIZE)
-#define OPENIBNAL_TX_MSG_PAGES  ((OPENIBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
-
-/* we may have up to 2 completions per transmit */
-#define OPENIBNAL_TX_CQ_ENTRIES  (2*OPENIBNAL_TX_MSGS)
+#define IBNAL_TX_MSGS       (IBNAL_NTX + IBNAL_NTX_NBLK)
+#define IBNAL_TX_MSG_BYTES  (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_TX_MSG_PAGES  ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
 
 /* RX messages (per connection) */
-#define OPENIBNAL_RX_MSGS       OPENIBNAL_MSG_QUEUE_SIZE
-#define OPENIBNAL_RX_MSG_BYTES  (OPENIBNAL_RX_MSGS * OPENIBNAL_MSG_SIZE)
-#define OPENIBNAL_RX_MSG_PAGES  ((OPENIBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+#define IBNAL_RX_MSGS       IBNAL_MSG_QUEUE_SIZE
+#define IBNAL_RX_MSG_BYTES  (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_RX_MSG_PAGES  ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
 
-/* 1 completion per receive, per connection */
-#define OPENIBNAL_RX_CQ_ENTRIES (OPENIBNAL_RX_MSGS * OPENIBNAL_CONCURRENT_PEERS)
+/* we may have up to 2 completions per transmit +
+   1 completion per receive, per connection */
+#define IBNAL_CQ_ENTRIES  ((2*IBNAL_TX_MSGS) +                          \
+                           (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS))
 
-#define OPENIBNAL_RDMA_BASE  0x0eeb0000
-#define OPENIBNAL_FMR        1
-#define OPENIBNAL_CKSUM      0
-//#define OPENIBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_PROCESS
-#define OPENIBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_INTERRUPT
+#define IBNAL_RDMA_BASE  0x0eeb0000
+#define IBNAL_FMR        1
+#define IBNAL_CKSUM      0
+//#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_PROCESS
+#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_INTERRUPT
 
 typedef struct 
 {
-        int               koib_io_timeout;      /* comms timeout (seconds) */
-        struct ctl_table_header *koib_sysctl;   /* sysctl interface */
-} koib_tunables_t;
+        int               kib_io_timeout;       /* comms timeout (seconds) */
+        struct ctl_table_header *kib_sysctl;    /* sysctl interface */
+} kib_tunables_t;
 
 typedef struct
 {
-        int               oibp_npages;          /* # pages */
-        int               oibp_mapped;          /* mapped? */
-        __u64             oibp_vaddr;           /* mapped region vaddr */
-        __u32             oibp_lkey;            /* mapped region lkey */
-        __u32             oibp_rkey;            /* mapped region rkey */
-        struct ib_mr     *oibp_handle;          /* mapped region handle */
-        struct page      *oibp_pages[0];
-} koib_pages_t;
+        int               ibp_npages;           /* # pages */
+        int               ibp_mapped;           /* mapped? */
+        __u64             ibp_vaddr;            /* mapped region vaddr */
+        __u32             ibp_lkey;             /* mapped region lkey */
+        __u32             ibp_rkey;             /* mapped region rkey */
+        struct ib_mr     *ibp_handle;           /* mapped region handle */
+        struct page      *ibp_pages[0];
+} kib_pages_t;
         
 typedef struct 
 {
-        int               koib_init;            /* initialisation state */
-        __u64             koib_incarnation;     /* which one am I */
-        int               koib_shutdown;        /* shut down? */
-        atomic_t          koib_nthreads;        /* # live threads */
-
-        __u64             koib_cm_service_id;   /* service number I listen on */
-        ptl_nid_t         koib_nid;             /* my NID */
-        struct semaphore  koib_nid_mutex;       /* serialise NID ops */
-        struct semaphore  koib_nid_signal;      /* signal completion */
-
-        rwlock_t          koib_global_lock;     /* stabilize peer/conn ops */
-
-        struct list_head *koib_peers;           /* hash table of all my known peers */
-        int               koib_peer_hash_size;  /* size of koib_peers */
-        atomic_t          koib_npeers;          /* # peers extant */
-        atomic_t          koib_nconns;          /* # connections extant */
-
-        struct list_head  koib_connd_conns;     /* connections to progress */
-        struct list_head  koib_connd_peers;     /* peers waiting for a connection */
-        wait_queue_head_t koib_connd_waitq;     /* connection daemons sleep here */
-        unsigned long     koib_connd_waketime;  /* when connd will wake */
-        spinlock_t        koib_connd_lock;      /* serialise */
-
-        wait_queue_head_t koib_sched_waitq;     /* schedulers sleep here */
-        struct list_head  koib_sched_txq;       /* tx requiring attention */
-        struct list_head  koib_sched_rxq;       /* rx requiring attention */
-        spinlock_t        koib_sched_lock;      /* serialise */
+        int               kib_init;             /* initialisation state */
+        __u64             kib_incarnation;      /* which one am I */
+        int               kib_shutdown;         /* shut down? */
+        atomic_t          kib_nthreads;         /* # live threads */
+
+        __u64             kib_service_id;       /* service number I listen on */
+        ptl_nid_t         kib_nid;              /* my NID */
+        struct semaphore  kib_nid_mutex;        /* serialise NID ops */
+        struct semaphore  kib_nid_signal;       /* signal completion */
+
+        rwlock_t          kib_global_lock;      /* stabilize peer/conn ops */
+
+        struct list_head *kib_peers;            /* hash table of all my known peers */
+        int               kib_peer_hash_size;   /* size of kib_peers */
+        atomic_t          kib_npeers;           /* # peers extant */
+        atomic_t          kib_nconns;           /* # connections extant */
+
+        struct list_head  kib_connd_conns;      /* connections to progress */
+        struct list_head  kib_connd_peers;      /* peers waiting for a connection */
+        wait_queue_head_t kib_connd_waitq;      /* connection daemons sleep here */
+        unsigned long     kib_connd_waketime;   /* when connd will wake */
+        spinlock_t        kib_connd_lock;       /* serialise */
+
+        wait_queue_head_t kib_sched_waitq;      /* schedulers sleep here */
+        struct list_head  kib_sched_txq;        /* tx requiring attention */
+        struct list_head  kib_sched_rxq;        /* rx requiring attention */
+        spinlock_t        kib_sched_lock;       /* serialise */
         
-        struct koib_tx   *koib_tx_descs;        /* all the tx descriptors */
-        koib_pages_t     *koib_tx_pages;        /* premapped tx msg pages */
-
-        struct list_head  koib_idle_txs;        /* idle tx descriptors */
-        struct list_head  koib_idle_nblk_txs;   /* idle reserved tx descriptors */
-        wait_queue_head_t koib_idle_tx_waitq;   /* block here for tx descriptor */
-        __u64             koib_next_tx_cookie;  /* RDMA completion cookie */
-        spinlock_t        koib_tx_lock;         /* serialise */
+        struct kib_tx    *kib_tx_descs;         /* all the tx descriptors */
+        kib_pages_t      *kib_tx_pages;         /* premapped tx msg pages */
+
+        struct list_head  kib_idle_txs;         /* idle tx descriptors */
+        struct list_head  kib_idle_nblk_txs;    /* idle reserved tx descriptors */
+        wait_queue_head_t kib_idle_tx_waitq;    /* block here for tx descriptor */
+        __u64             kib_next_tx_cookie;   /* RDMA completion cookie */
+        spinlock_t        kib_tx_lock;          /* serialise */
         
-        struct ib_device *koib_device;          /* "the" device */
-        struct ib_device_properties koib_device_props; /* its properties */
-        int               koib_port;            /* port on the device */
-        struct ib_port_properties koib_port_props; /* its properties */
-        struct ib_pd     *koib_pd;              /* protection domain */
-#if OPENIBNAL_FMR
-        struct ib_fmr_pool *koib_fmr_pool;      /* fast memory region pool */
+        struct ib_device *kib_device;           /* "the" device */
+        struct ib_device_properties kib_device_props; /* its properties */
+        int               kib_port;             /* port on the device */
+        struct ib_port_properties kib_port_props; /* its properties */
+        struct ib_pd     *kib_pd;               /* protection domain */
+#if IBNAL_FMR
+        struct ib_fmr_pool *kib_fmr_pool;       /* fast memory region pool */
 #endif
-        struct ib_cq     *koib_rx_cq;           /* receive completion queue */
-        struct ib_cq     *koib_tx_cq;           /* transmit completion queue */
-        void             *koib_listen_handle;   /* where I listen for connections */
-        struct ib_common_attrib_service koib_service; /* SM service */
+        struct ib_cq     *kib_cq;               /* completion queue */
+        void             *kib_listen_handle;    /* where I listen for connections */
         
-} koib_data_t;
-
-#define OPENIBNAL_INIT_NOTHING         0
-#define OPENIBNAL_INIT_DATA            1
-#define OPENIBNAL_INIT_LIB             2
-#define OPENIBNAL_INIT_PD              3
-#define OPENIBNAL_INIT_FMR             4
-#define OPENIBNAL_INIT_TXD             5
-#define OPENIBNAL_INIT_RX_CQ           6
-#define OPENIBNAL_INIT_TX_CQ           7
-#define OPENIBNAL_INIT_ALL             8
+} kib_data_t;
+
+#define IBNAL_INIT_NOTHING         0
+#define IBNAL_INIT_DATA            1
+#define IBNAL_INIT_LIB             2
+#define IBNAL_INIT_PD              3
+#define IBNAL_INIT_FMR             4
+#define IBNAL_INIT_TXD             5
+#define IBNAL_INIT_CQ              6
+#define IBNAL_INIT_ALL             7
 
 /************************************************************************
  * Wire message structs.
@@ -214,125 +210,125 @@ typedef struct
         __u32             md_lkey;
         __u32             md_rkey;
         __u64             md_addr;
-} koib_md_t;
+} kib_md_t;
 
 typedef struct
 {
         __u32                 rd_key;           /* remote key */
         __u32                 rd_nob;           /* # of bytes */
         __u64                 rd_addr;          /* remote io vaddr */
-} koib_rdma_desc_t;
+} kib_rdma_desc_t;
 
 
 typedef struct
 {
-        ptl_hdr_t         oibim_hdr;            /* portals header */
-        char              oibim_payload[0];     /* piggy-backed payload */
-} koib_immediate_msg_t;
+        ptl_hdr_t         ibim_hdr;             /* portals header */
+        char              ibim_payload[0];      /* piggy-backed payload */
+} kib_immediate_msg_t;
 
 typedef struct
 {
-        ptl_hdr_t         oibrm_hdr;            /* portals header */
-        __u64             oibrm_cookie;         /* opaque completion cookie */
-        koib_rdma_desc_t  oibrm_desc;           /* where to suck/blow */
-} koib_rdma_msg_t;
+        ptl_hdr_t         ibrm_hdr;             /* portals header */
+        __u64             ibrm_cookie;          /* opaque completion cookie */
+        kib_rdma_desc_t   ibrm_desc;            /* where to suck/blow */
+} kib_rdma_msg_t;
 
 typedef struct
 {
-        __u64             oibcm_cookie;         /* opaque completion cookie */
-        __u32             oibcm_status;         /* completion status */
-} koib_completion_msg_t;
+        __u64             ibcm_cookie;          /* opaque completion cookie */
+        __u32             ibcm_status;          /* completion status */
+} kib_completion_msg_t;
 
 typedef struct
 {
-        __u32              oibm_magic;          /* I'm an openibnal message */
-        __u16              oibm_version;        /* this is my version number */
-        __u8               oibm_type;           /* msg type */
-        __u8               oibm_credits;        /* returned credits */
-#if OPENIBNAL_CKSUM
-        __u32              oibm_nob;
-        __u32              oibm_cksum;
+        __u32              ibm_magic;           /* I'm an openibnal message */
+        __u16              ibm_version;         /* this is my version number */
+        __u8               ibm_type;            /* msg type */
+        __u8               ibm_credits;         /* returned credits */
+#if IBNAL_CKSUM
+        __u32              ibm_nob;
+        __u32              ibm_cksum;
 #endif
         union {
-                koib_immediate_msg_t   immediate;
-                koib_rdma_msg_t        rdma;
-                koib_completion_msg_t  completion;
-        }                    oibm_u;
-} koib_msg_t;
-
-#define OPENIBNAL_MSG_MAGIC       0x0be91b91    /* unique magic */
-#define OPENIBNAL_MSG_VERSION              1    /* current protocol version */
-
-#define OPENIBNAL_MSG_NOOP              0xd0    /* nothing (just credits) */
-#define OPENIBNAL_MSG_IMMEDIATE         0xd1    /* portals hdr + payload */
-#define OPENIBNAL_MSG_PUT_RDMA          0xd2    /* portals PUT hdr + source rdma desc */
-#define OPENIBNAL_MSG_PUT_DONE          0xd3    /* signal PUT rdma completion */
-#define OPENIBNAL_MSG_GET_RDMA          0xd4    /* portals GET hdr + sink rdma desc */
-#define OPENIBNAL_MSG_GET_DONE          0xd5    /* signal GET rdma completion */
+                kib_immediate_msg_t   immediate;
+                kib_rdma_msg_t        rdma;
+                kib_completion_msg_t  completion;
+        }                    ibm_u;
+} kib_msg_t;
+
+#define IBNAL_MSG_MAGIC       0x0be91b91        /* unique magic */
+#define IBNAL_MSG_VERSION              1        /* current protocol version */
+
+#define IBNAL_MSG_NOOP              0xd0        /* nothing (just credits) */
+#define IBNAL_MSG_IMMEDIATE         0xd1        /* portals hdr + payload */
+#define IBNAL_MSG_PUT_RDMA          0xd2        /* portals PUT hdr + source rdma desc */
+#define IBNAL_MSG_PUT_DONE          0xd3        /* signal PUT rdma completion */
+#define IBNAL_MSG_GET_RDMA          0xd4        /* portals GET hdr + sink rdma desc */
+#define IBNAL_MSG_GET_DONE          0xd5        /* signal GET rdma completion */
 
 /***********************************************************************/
 
-typedef struct koib_rx                          /* receive message */
+typedef struct kib_rx                           /* receive message */
 {
         struct list_head          rx_list;      /* queue for attention */
-        struct koib_conn         *rx_conn;      /* owning conn */
+        struct kib_conn          *rx_conn;      /* owning conn */
         int                       rx_rdma;      /* RDMA completion posted? */
         int                       rx_posted;    /* posted? */
         __u64                     rx_vaddr;     /* pre-mapped buffer (hca vaddr) */
-        koib_msg_t               *rx_msg;       /* pre-mapped buffer (host vaddr) */
+        kib_msg_t                *rx_msg;       /* pre-mapped buffer (host vaddr) */
         struct ib_receive_param   rx_sp;        /* receive work item */
         struct ib_gather_scatter  rx_gl;        /* and it's memory */
-} koib_rx_t;
+} kib_rx_t;
 
-typedef struct koib_tx                          /* transmit message */
+typedef struct kib_tx                           /* transmit message */
 {
         struct list_head          tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
         int                       tx_isnblk;    /* I'm reserved for non-blocking sends */
-        struct koib_conn         *tx_conn;      /* owning conn */
+        struct kib_conn          *tx_conn;      /* owning conn */
         int                       tx_mapped;    /* mapped for RDMA? */
         int                       tx_sending;   /* # tx callbacks outstanding */
         int                       tx_status;    /* completion status */
-        int                       tx_passive_rdma; /* waiting for peer to RDMA? */
-        int                       tx_passive_rdma_wait; /* on ibc_rdma_queue */
-        unsigned long             tx_passive_rdma_deadline; /* completion deadline */
+        unsigned long             tx_deadline;  /* completion deadline */
+        int                       tx_passive_rdma; /* peer sucks/blows */
+        int                       tx_passive_rdma_wait; /* waiting for peer to complete */
         __u64                     tx_passive_rdma_cookie; /* completion cookie */
         lib_msg_t                *tx_libmsg[2]; /* lib msgs to finalize on completion */
-        koib_md_t                 tx_md;        /* RDMA mapping (active/passive) */
+        kib_md_t                  tx_md;        /* RDMA mapping (active/passive) */
         __u64                     tx_vaddr;     /* pre-mapped buffer (hca vaddr) */
-        koib_msg_t               *tx_msg;       /* pre-mapped buffer (host vaddr) */
+        kib_msg_t                *tx_msg;       /* pre-mapped buffer (host vaddr) */
         int                       tx_nsp;       /* # send work items */
         struct ib_send_param      tx_sp[2];     /* send work items... */
         struct ib_gather_scatter  tx_gl[2];     /* ...and their memory */
-} koib_tx_t;
+} kib_tx_t;
 
-#define KOIB_TX_UNMAPPED       0
-#define KOIB_TX_MAPPED         1
-#define KOIB_TX_MAPPED_FMR     2
+#define KIB_TX_UNMAPPED       0
+#define KIB_TX_MAPPED         1
+#define KIB_TX_MAPPED_FMR     2
 
-typedef struct koib_wire_connreq
+typedef struct kib_wire_connreq
 {
         __u32        wcr_magic;                 /* I'm an openibnal connreq */
         __u16        wcr_version;               /* this is my version number */
         __u16        wcr_queue_depth;           /* this is my receive queue size */
         __u64        wcr_nid;                   /* peer's NID */
         __u64        wcr_incarnation;           /* peer's incarnation */
-} koib_wire_connreq_t;
+} kib_wire_connreq_t;
 
-typedef struct koib_connreq
+typedef struct kib_connreq
 {
         /* connection-in-progress */
-        struct koib_conn                   *cr_conn;
-        koib_wire_connreq_t                 cr_wcr;
+        struct kib_conn                    *cr_conn;
+        kib_wire_connreq_t                  cr_wcr;
         __u64                               cr_tid;
         struct ib_common_attrib_service     cr_service;
         tTS_IB_GID                          cr_gid;
         struct ib_path_record               cr_path;
         struct ib_cm_active_param           cr_connparam;
-} koib_connreq_t;
+} kib_connreq_t;
 
-typedef struct koib_conn
+typedef struct kib_conn
 { 
-        struct koib_peer   *ibc_peer;           /* owning peer */
+        struct kib_peer    *ibc_peer;           /* owning peer */
         struct list_head    ibc_list;           /* stash on peer's conn list */
         __u64               ibc_incarnation;    /* which instance of the peer */
         atomic_t            ibc_refcount;       /* # users */
@@ -342,27 +338,27 @@ typedef struct koib_conn
         int                 ibc_credits;        /* # credits I have */
         int                 ibc_outstanding_credits; /* # credits to return */
         struct list_head    ibc_tx_queue;       /* send queue */
-        struct list_head    ibc_rdma_queue;     /* tx awaiting RDMA completion */
+        struct list_head    ibc_active_txs;     /* active tx awaiting completion */
         spinlock_t          ibc_lock;           /* serialise */
-        koib_rx_t          *ibc_rxs;            /* the rx descs */
-        koib_pages_t       *ibc_rx_pages;       /* premapped rx msg pages */
+        kib_rx_t           *ibc_rxs;            /* the rx descs */
+        kib_pages_t        *ibc_rx_pages;       /* premapped rx msg pages */
         struct ib_qp       *ibc_qp;             /* queue pair */
         __u32               ibc_qpn;            /* queue pair number */
         tTS_IB_CM_COMM_ID   ibc_comm_id;        /* connection ID? */
-        koib_connreq_t     *ibc_connreq;        /* connection request state */
-} koib_conn_t;
+        kib_connreq_t      *ibc_connreq;        /* connection request state */
+} kib_conn_t;
 
-#define OPENIBNAL_CONN_INIT_NOTHING      0      /* initial state */
-#define OPENIBNAL_CONN_INIT_QP           1      /* ibc_qp set up */
-#define OPENIBNAL_CONN_CONNECTING        2      /* started to connect */
-#define OPENIBNAL_CONN_ESTABLISHED       3      /* connection established */
-#define OPENIBNAL_CONN_DEATHROW          4      /* waiting to be closed */
-#define OPENIBNAL_CONN_ZOMBIE            5      /* waiting to be freed */
+#define IBNAL_CONN_INIT_NOTHING      0          /* initial state */
+#define IBNAL_CONN_INIT_QP           1          /* ibc_qp set up */
+#define IBNAL_CONN_CONNECTING        2          /* started to connect */
+#define IBNAL_CONN_ESTABLISHED       3          /* connection established */
+#define IBNAL_CONN_DEATHROW          4          /* waiting to be closed */
+#define IBNAL_CONN_ZOMBIE            5          /* waiting to be freed */
 
-typedef struct koib_peer
+typedef struct kib_peer
 {
         struct list_head    ibp_list;           /* stash on global peer list */
-        struct list_head    ibp_connd_list;     /* schedule on koib_connd_peers */
+        struct list_head    ibp_connd_list;     /* schedule on kib_connd_peers */
         ptl_nid_t           ibp_nid;            /* who's on the other end(s) */
         atomic_t            ibp_refcount;       /* # users */
         int                 ibp_persistence;    /* "known" peer refs */
@@ -371,30 +367,30 @@ typedef struct koib_peer
         int                 ibp_connecting;     /* connecting+accepting */
         unsigned long       ibp_reconnect_time; /* when reconnect may be attempted */
         unsigned long       ibp_reconnect_interval; /* exponential backoff */
-} koib_peer_t;
+} kib_peer_t;
 
 
-extern lib_nal_t        koibnal_lib;
-extern koib_data_t      koibnal_data;
-extern koib_tunables_t  koibnal_tunables;
+extern lib_nal_t       kibnal_lib;
+extern kib_data_t      kibnal_data;
+extern kib_tunables_t  kibnal_tunables;
 
 static inline struct list_head *
-koibnal_nid2peerlist (ptl_nid_t nid) 
+kibnal_nid2peerlist (ptl_nid_t nid) 
 {
-        unsigned int hash = ((unsigned int)nid) % koibnal_data.koib_peer_hash_size;
+        unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size;
         
-        return (&koibnal_data.koib_peers [hash]);
+        return (&kibnal_data.kib_peers [hash]);
 }
 
 static inline int
-koibnal_peer_active(koib_peer_t *peer)
+kibnal_peer_active(kib_peer_t *peer)
 {
         /* Am I in the peer hash table? */
         return (!list_empty(&peer->ibp_list));
 }
 
 static inline void
-koibnal_queue_tx_locked (koib_tx_t *tx, koib_conn_t *conn)
+kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
 {
         /* CAVEAT EMPTOR: tx takes caller's ref on conn */
 
@@ -402,40 +398,41 @@ koibnal_queue_tx_locked (koib_tx_t *tx, koib_conn_t *conn)
         LASSERT (tx->tx_conn == NULL);          /* only set here */
 
         tx->tx_conn = conn;
+        tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
         list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
 }
 
-#define KOIBNAL_SERVICE_KEY_MASK  (IB_SA_SERVICE_COMP_MASK_NAME |       \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_1 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_2 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_3 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_4 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_5 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_6 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_7 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_8)
+#define KIBNAL_SERVICE_KEY_MASK  (IB_SA_SERVICE_COMP_MASK_NAME |        \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_1 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_2 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_3 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_4 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_5 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_6 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_7 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_8)
 
 static inline __u64*
-koibnal_service_nid_field(struct ib_common_attrib_service *srv)
+kibnal_service_nid_field(struct ib_common_attrib_service *srv)
 {
-        /* must be consistent with KOIBNAL_SERVICE_KEY_MASK */
+        /* must be consistent with KIBNAL_SERVICE_KEY_MASK */
         return (__u64 *)srv->service_data8;
 }
 
 
 static inline void
-koibnal_set_service_keys(struct ib_common_attrib_service *srv, ptl_nid_t nid)
+kibnal_set_service_keys(struct ib_common_attrib_service *srv, ptl_nid_t nid)
 {
-        LASSERT (strlen (OPENIBNAL_SERVICE_NAME) < sizeof(srv->service_name));
+        LASSERT (strlen (IBNAL_SERVICE_NAME) < sizeof(srv->service_name));
         memset (srv->service_name, 0, sizeof(srv->service_name));
-        strcpy (srv->service_name, OPENIBNAL_SERVICE_NAME);
+        strcpy (srv->service_name, IBNAL_SERVICE_NAME);
 
-        *koibnal_service_nid_field(srv) = cpu_to_le64(nid);
+        *kibnal_service_nid_field(srv) = cpu_to_le64(nid);
 }
 
 #if 0
 static inline void
-koibnal_show_rdma_attr (koib_conn_t *conn)
+kibnal_show_rdma_attr (kib_conn_t *conn)
 {
         struct ib_qp_attribute qp_attr;
         int                    rc;
@@ -457,7 +454,7 @@ koibnal_show_rdma_attr (koib_conn_t *conn)
 
 #if CONFIG_X86
 static inline __u64
-koibnal_page2phys (struct page *p)
+kibnal_page2phys (struct page *p)
 {
         __u64 page_number = p - mem_map;
         
@@ -467,42 +464,69 @@ koibnal_page2phys (struct page *p)
 # error "no page->phys"
 #endif
 
-extern koib_peer_t *koibnal_create_peer (ptl_nid_t nid);
-extern void koibnal_put_peer (koib_peer_t *peer);
-extern int koibnal_del_peer (ptl_nid_t nid, int single_share);
-extern koib_peer_t *koibnal_find_peer_locked (ptl_nid_t nid);
-extern void koibnal_unlink_peer_locked (koib_peer_t *peer);
-extern int  koibnal_close_stale_conns_locked (koib_peer_t *peer, 
+/* CAVEAT EMPTOR:
+ * We rely on tx/rx descriptor alignment to allow us to use the lowest bit
+ * of the work request id as a flag to determine if the completion is for a
+ * transmit or a receive.  It seems that the CQ entry's 'op' field
+ * isn't always set correctly on completions that occur after QP teardown. */
+
+static inline __u64
+kibnal_ptr2wreqid (void *ptr, int isrx)
+{
+        unsigned long lptr = (unsigned long)ptr;
+
+        LASSERT ((lptr & 1) == 0);
+        return (__u64)(lptr | (isrx ? 1 : 0));
+}
+
+static inline void *
+kibnal_wreqid2ptr (__u64 wreqid)
+{
+        return (void *)(((unsigned long)wreqid) & ~1UL);
+}
+
+static inline int
+kibnal_wreqid_is_rx (__u64 wreqid)
+{
+        return (wreqid & 1) != 0;
+}
+
+extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid);
+extern void kibnal_put_peer (kib_peer_t *peer);
+extern int kibnal_del_peer (ptl_nid_t nid, int single_share);
+extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid);
+extern void kibnal_unlink_peer_locked (kib_peer_t *peer);
+extern int  kibnal_close_stale_conns_locked (kib_peer_t *peer, 
                                               __u64 incarnation);
-extern koib_conn_t *koibnal_create_conn (void);
-extern void koibnal_put_conn (koib_conn_t *conn);
-extern void koibnal_destroy_conn (koib_conn_t *conn);
-extern int koibnal_alloc_pages (koib_pages_t **pp, int npages, int access);
-extern void koibnal_free_pages (koib_pages_t *p);
+extern kib_conn_t *kibnal_create_conn (void);
+extern void kibnal_put_conn (kib_conn_t *conn);
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access);
+extern void kibnal_free_pages (kib_pages_t *p);
 
-extern void koibnal_check_sends (koib_conn_t *conn);
+extern void kibnal_check_sends (kib_conn_t *conn);
 
 extern tTS_IB_CM_CALLBACK_RETURN
-koibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid,
+kibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid,
                        void *param, void *arg);
 extern tTS_IB_CM_CALLBACK_RETURN 
-koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid,
+kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid,
                                void *param, void *arg);
 
-extern void koibnal_close_conn_locked (koib_conn_t *conn, int error);
-extern void koibnal_destroy_conn (koib_conn_t *conn);
-extern int  koibnal_thread_start (int (*fn)(void *arg), void *arg);
-extern int  koibnal_scheduler(void *arg);
-extern int  koibnal_connd (void *arg);
-extern void koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg);
-extern void koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg);
-extern void koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob);
-extern int  koibnal_close_conn (koib_conn_t *conn, int why);
-extern void koibnal_start_active_rdma (int type, int status, 
-                                       koib_rx_t *rx, lib_msg_t *libmsg, 
-                                       unsigned int niov, 
-                                       struct iovec *iov, ptl_kiov_t *kiov,
-                                       size_t offset, size_t nob);
+extern void kibnal_close_conn_locked (kib_conn_t *conn, int error);
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+extern int  kibnal_thread_start (int (*fn)(void *arg), void *arg);
+extern int  kibnal_scheduler(void *arg);
+extern int  kibnal_connd (void *arg);
+extern void kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg);
+extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob);
+extern int  kibnal_close_conn (kib_conn_t *conn, int why);
+extern void kibnal_start_active_rdma (int type, int status, 
+                                      kib_rx_t *rx, lib_msg_t *libmsg, 
+                                      unsigned int niov, 
+                                      struct iovec *iov, ptl_kiov_t *kiov,
+                                      size_t offset, size_t nob);
+
 
 
 
index 79bf37a..d774853 100644 (file)
  *
  */
 void
-koibnal_schedule_tx_done (koib_tx_t *tx)
+kibnal_schedule_tx_done (kib_tx_t *tx)
 {
         unsigned long flags;
 
-        spin_lock_irqsave (&koibnal_data.koib_sched_lock, flags);
+        spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);
 
-        list_add_tail(&tx->tx_list, &koibnal_data.koib_sched_txq);
-        wake_up (&koibnal_data.koib_sched_waitq);
+        list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
+        wake_up (&kibnal_data.kib_sched_waitq);
 
-        spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags);
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
 }
 
 void
-koibnal_tx_done (koib_tx_t *tx)
+kibnal_tx_done (kib_tx_t *tx)
 {
         ptl_err_t        ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
         unsigned long    flags;
@@ -49,31 +49,31 @@ koibnal_tx_done (koib_tx_t *tx)
         int              rc;
 
         LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting callback */
-        LASSERT (!tx->tx_passive_rdma_wait);    /* mustn't be on ibc_rdma_queue */
+        LASSERT (!tx->tx_passive_rdma_wait);    /* mustn't be awaiting RDMA */
 
         switch (tx->tx_mapped) {
         default:
                 LBUG();
 
-        case KOIB_TX_UNMAPPED:
+        case KIB_TX_UNMAPPED:
                 break;
                 
-        case KOIB_TX_MAPPED:
+        case KIB_TX_MAPPED:
                 if (in_interrupt()) {
                         /* can't deregister memory in IRQ context... */
-                        koibnal_schedule_tx_done(tx);
+                        kibnal_schedule_tx_done(tx);
                         return;
                 }
                 rc = ib_memory_deregister(tx->tx_md.md_handle.mr);
                 LASSERT (rc == 0);
-                tx->tx_mapped = KOIB_TX_UNMAPPED;
+                tx->tx_mapped = KIB_TX_UNMAPPED;
                 break;
 
-#if OPENIBNAL_FMR
-        case KOIB_TX_MAPPED_FMR:
+#if IBNAL_FMR
+        case KIB_TX_MAPPED_FMR:
                 if (in_interrupt() && tx->tx_status != 0) {
                         /* can't flush FMRs in IRQ context... */
-                        koibnal_schedule_tx_done(tx);
+                        kibnal_schedule_tx_done(tx);
                         return;
                 }              
 
@@ -81,8 +81,8 @@ koibnal_tx_done (koib_tx_t *tx)
                 LASSERT (rc == 0);
 
                 if (tx->tx_status != 0)
-                        ib_fmr_pool_force_flush(koibnal_data.koib_fmr_pool);
-                tx->tx_mapped = KOIB_TX_UNMAPPED;
+                        ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
+                tx->tx_mapped = KIB_TX_UNMAPPED;
                 break;
 #endif
         }
@@ -92,12 +92,12 @@ koibnal_tx_done (koib_tx_t *tx)
                 if (tx->tx_libmsg[i] == NULL)
                         continue;
 
-                lib_finalize (&koibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
+                lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
                 tx->tx_libmsg[i] = NULL;
         }
         
         if (tx->tx_conn != NULL) {
-                koibnal_put_conn (tx->tx_conn);
+                kibnal_put_conn (tx->tx_conn);
                 tx->tx_conn = NULL;
         }
 
@@ -105,52 +105,52 @@ koibnal_tx_done (koib_tx_t *tx)
         tx->tx_passive_rdma = 0;
         tx->tx_status = 0;
 
-        spin_lock_irqsave (&koibnal_data.koib_tx_lock, flags);
+        spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
 
         if (tx->tx_isnblk) {
-                list_add_tail (&tx->tx_list, &koibnal_data.koib_idle_nblk_txs);
+                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
         } else {
-                list_add_tail (&tx->tx_list, &koibnal_data.koib_idle_txs);
-                wake_up (&koibnal_data.koib_idle_tx_waitq);
+                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
+                wake_up (&kibnal_data.kib_idle_tx_waitq);
         }
 
-        spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags);
+        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
 }
 
-koib_tx_t *
-koibnal_get_idle_tx (int may_block) 
+kib_tx_t *
+kibnal_get_idle_tx (int may_block) 
 {
-        unsigned long    flags;
-        koib_tx_t    *tx = NULL;
+        unsigned long  flags;
+        kib_tx_t      *tx = NULL;
         
         for (;;) {
-                spin_lock_irqsave (&koibnal_data.koib_tx_lock, flags);
+                spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
 
                 /* "normal" descriptor is free */
-                if (!list_empty (&koibnal_data.koib_idle_txs)) {
-                        tx = list_entry (koibnal_data.koib_idle_txs.next,
-                                         koib_tx_t, tx_list);
+                if (!list_empty (&kibnal_data.kib_idle_txs)) {
+                        tx = list_entry (kibnal_data.kib_idle_txs.next,
+                                         kib_tx_t, tx_list);
                         break;
                 }
 
                 if (!may_block) {
                         /* may dip into reserve pool */
-                        if (list_empty (&koibnal_data.koib_idle_nblk_txs)) {
+                        if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
                                 CERROR ("reserved tx desc pool exhausted\n");
                                 break;
                         }
 
-                        tx = list_entry (koibnal_data.koib_idle_nblk_txs.next,
-                                         koib_tx_t, tx_list);
+                        tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
+                                         kib_tx_t, tx_list);
                         break;
                 }
 
                 /* block for idle tx */
-                spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags);
+                spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
 
-                wait_event (koibnal_data.koib_idle_tx_waitq,
-                            !list_empty (&koibnal_data.koib_idle_txs) ||
-                            koibnal_data.koib_shutdown);
+                wait_event (kibnal_data.kib_idle_tx_waitq,
+                            !list_empty (&kibnal_data.kib_idle_txs) ||
+                            kibnal_data.kib_shutdown);
         }
 
         if (tx != NULL) {
@@ -159,9 +159,9 @@ koibnal_get_idle_tx (int may_block)
                 /* Allocate a new passive RDMA completion cookie.  It might
                  * not be needed, but we've got a lock right now and we're
                  * unlikely to wrap... */
-                tx->tx_passive_rdma_cookie = koibnal_data.koib_next_tx_cookie++;
+                tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
 
-                LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED);
+                LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
                 LASSERT (tx->tx_nsp == 0);
                 LASSERT (tx->tx_sending == 0);
                 LASSERT (tx->tx_status == 0);
@@ -172,15 +172,15 @@ koibnal_get_idle_tx (int may_block)
                 LASSERT (tx->tx_libmsg[1] == NULL);
         }
 
-        spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags);
+        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
         
         return (tx);
 }
 
 int
-koibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
+kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
 {
-        /* I would guess that if koibnal_get_peer (nid) == NULL,
+        /* I would guess that if kibnal_get_peer (nid) == NULL,
            and we're not routing, then 'nid' is very distant :) */
         if ( nal->libnal_ni.ni_pid.nid == nid ) {
                 *dist = 0;
@@ -192,7 +192,7 @@ koibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
 }
 
 void
-koibnal_complete_passive_rdma(koib_conn_t *conn, __u64 cookie, int status)
+kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
 {
         struct list_head *ttmp;
         unsigned long     flags;
@@ -200,30 +200,34 @@ koibnal_complete_passive_rdma(koib_conn_t *conn, __u64 cookie, int status)
 
         spin_lock_irqsave (&conn->ibc_lock, flags);
 
-        list_for_each (ttmp, &conn->ibc_rdma_queue) {
-                koib_tx_t *tx = list_entry(ttmp, koib_tx_t, tx_list);
-                
-                LASSERT (tx->tx_passive_rdma);
-                LASSERT (tx->tx_passive_rdma_wait);
+        list_for_each (ttmp, &conn->ibc_active_txs) {
+                kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);
 
-                if (tx->tx_passive_rdma_cookie != cookie)
-                        continue;
+                LASSERT (tx->tx_passive_rdma ||
+                         !tx->tx_passive_rdma_wait);
 
-                CDEBUG(D_NET, "Complete %p "LPD64"\n", tx, cookie);
+                LASSERT (tx->tx_passive_rdma_wait ||
+                         tx->tx_sending != 0);
 
-                list_del (&tx->tx_list);
+                if (!tx->tx_passive_rdma_wait ||
+                    tx->tx_passive_rdma_cookie != cookie)
+                        continue;
+
+                CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);
 
+                tx->tx_status = status;
                 tx->tx_passive_rdma_wait = 0;
                 idle = (tx->tx_sending == 0);
 
-                tx->tx_status = status;
+                if (idle)
+                        list_del (&tx->tx_list);
 
                 spin_unlock_irqrestore (&conn->ibc_lock, flags);
 
                 /* I could be racing with tx callbacks.  It's whoever
                  * _makes_ tx idle that frees it */
                 if (idle)
-                        koibnal_tx_done (tx);
+                        kibnal_tx_done (tx);
                 return;
         }
                 
@@ -234,32 +238,32 @@ koibnal_complete_passive_rdma(koib_conn_t *conn, __u64 cookie, int status)
 }
 
 void
-koibnal_post_rx (koib_rx_t *rx, int do_credits)
+kibnal_post_rx (kib_rx_t *rx, int do_credits)
 {
-        koib_conn_t  *conn = rx->rx_conn;
+        kib_conn_t   *conn = rx->rx_conn;
         int           rc;
         unsigned long flags;
 
         rx->rx_gl = (struct ib_gather_scatter) {
                 .address = rx->rx_vaddr,
-                .length  = OPENIBNAL_MSG_SIZE,
-                .key     = conn->ibc_rx_pages->oibp_lkey,
+                .length  = IBNAL_MSG_SIZE,
+                .key     = conn->ibc_rx_pages->ibp_lkey,
         };
-        
+
         rx->rx_sp = (struct ib_receive_param) {
-                .work_request_id        = (__u64)(unsigned long)rx,
+                .work_request_id        = kibnal_ptr2wreqid(rx, 1),
                 .scatter_list           = &rx->rx_gl,
                 .num_scatter_entries    = 1,
                 .device_specific        = NULL,
                 .signaled               = 1,
         };
 
-        LASSERT (conn->ibc_state >= OPENIBNAL_CONN_ESTABLISHED);
+        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
         LASSERT (!rx->rx_posted);
         rx->rx_posted = 1;
         mb();
 
-        if (conn->ibc_state != OPENIBNAL_CONN_ESTABLISHED)
+        if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
                 rc = -ECONNABORTED;
         else
                 rc = ib_receive (conn->ibc_qp, &rx->rx_sp, 1);
@@ -270,26 +274,26 @@ koibnal_post_rx (koib_rx_t *rx, int do_credits)
                         conn->ibc_outstanding_credits++;
                         spin_unlock_irqrestore(&conn->ibc_lock, flags);
 
-                        koibnal_check_sends(conn);
+                        kibnal_check_sends(conn);
                 }
                 return;
         }
 
-        if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) {
+        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
                 CERROR ("Error posting receive -> "LPX64": %d\n",
                         conn->ibc_peer->ibp_nid, rc);
-                koibnal_close_conn (rx->rx_conn, rc);
+                kibnal_close_conn (rx->rx_conn, rc);
         } else {
                 CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n",
                         conn->ibc_peer->ibp_nid, rc);
         }
 
         /* Drop rx's ref */
-        koibnal_put_conn (conn);
+        kibnal_put_conn (conn);
 }
 
-#if OPENIBNAL_CKSUM
-__u32 koibnal_cksum (void *ptr, int nob)
+#if IBNAL_CKSUM
+__u32 kibnal_cksum (void *ptr, int nob)
 {
         char  *c  = ptr;
         __u32  sum = 0;
@@ -302,17 +306,17 @@ __u32 koibnal_cksum (void *ptr, int nob)
 #endif
 
 void
-koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
+kibnal_rx_callback (struct ib_cq_entry *e)
 {
-        koib_rx_t    *rx = (koib_rx_t *)((unsigned long)e->work_request_id);
-        koib_msg_t   *msg = rx->rx_msg;
-        koib_conn_t  *conn = rx->rx_conn;
+        kib_rx_t     *rx = (kib_rx_t *)kibnal_wreqid2ptr(e->work_request_id);
+        kib_msg_t    *msg = rx->rx_msg;
+        kib_conn_t   *conn = rx->rx_conn;
         int           nob = e->bytes_transferred;
-        const int     base_nob = offsetof(koib_msg_t, oibm_u);
+        const int     base_nob = offsetof(kib_msg_t, ibm_u);
         int           credits;
         int           flipped;
         unsigned long flags;
-#if OPENIBNAL_CKSUM
+#if IBNAL_CKSUM
         __u32         msg_cksum;
         __u32         computed_cksum;
 #endif
@@ -324,11 +328,11 @@ koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
 
         /* receives complete with error in any case after we've started
          * closing the QP */
-        if (conn->ibc_state >= OPENIBNAL_CONN_DEATHROW)
+        if (conn->ibc_state >= IBNAL_CONN_DEATHROW)
                 goto failed;
 
         /* We don't post receives until the conn is established */
-        LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED);
+        LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
 
         if (e->status != IB_COMPLETION_STATUS_SUCCESS) {
                 CERROR("Rx from "LPX64" failed: %d\n", 
@@ -344,35 +348,35 @@ koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
 
         /* Receiver does any byte flipping if necessary... */
 
-        if (msg->oibm_magic == OPENIBNAL_MSG_MAGIC) {
+        if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
                 flipped = 0;
         } else {
-                if (msg->oibm_magic != __swab32(OPENIBNAL_MSG_MAGIC)) {
+                if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
                         CERROR ("Unrecognised magic: %08x from "LPX64"\n", 
-                                msg->oibm_magic, conn->ibc_peer->ibp_nid);
+                                msg->ibm_magic, conn->ibc_peer->ibp_nid);
                         goto failed;
                 }
                 flipped = 1;
-                __swab16s (&msg->oibm_version);
-                LASSERT (sizeof(msg->oibm_type) == 1);
-                LASSERT (sizeof(msg->oibm_credits) == 1);
+                __swab16s (&msg->ibm_version);
+                LASSERT (sizeof(msg->ibm_type) == 1);
+                LASSERT (sizeof(msg->ibm_credits) == 1);
         }
 
-        if (msg->oibm_version != OPENIBNAL_MSG_VERSION) {
+        if (msg->ibm_version != IBNAL_MSG_VERSION) {
                 CERROR ("Incompatible msg version %d (%d expected)\n",
-                        msg->oibm_version, OPENIBNAL_MSG_VERSION);
+                        msg->ibm_version, IBNAL_MSG_VERSION);
                 goto failed;
         }
 
-#if OPENIBNAL_CKSUM
-        if (nob != msg->oibm_nob) {
-                CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->oibm_nob);
+#if IBNAL_CKSUM
+        if (nob != msg->ibm_nob) {
+                CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob);
                 goto failed;
         }
 
-        msg_cksum = le32_to_cpu(msg->oibm_cksum);
-        msg->oibm_cksum = 0;
-        computed_cksum = koibnal_cksum (msg, nob);
+        msg_cksum = le32_to_cpu(msg->ibm_cksum);
+        msg->ibm_cksum = 0;
+        computed_cksum = kibnal_cksum (msg, nob);
         
         if (msg_cksum != computed_cksum) {
                 CERROR ("Checksum failure %d: (%d expected)\n",
@@ -383,101 +387,101 @@ koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
 #endif
 
         /* Have I received credits that will let me send? */
-        credits = msg->oibm_credits;
+        credits = msg->ibm_credits;
         if (credits != 0) {
                 spin_lock_irqsave(&conn->ibc_lock, flags);
                 conn->ibc_credits += credits;
                 spin_unlock_irqrestore(&conn->ibc_lock, flags);
                 
-                koibnal_check_sends(conn);
+                kibnal_check_sends(conn);
         }
 
-        switch (msg->oibm_type) {
-        case OPENIBNAL_MSG_NOOP:
-                koibnal_post_rx (rx, 1);
+        switch (msg->ibm_type) {
+        case IBNAL_MSG_NOOP:
+                kibnal_post_rx (rx, 1);
                 return;
 
-        case OPENIBNAL_MSG_IMMEDIATE:
-                if (nob < base_nob + sizeof (koib_immediate_msg_t)) {
+        case IBNAL_MSG_IMMEDIATE:
+                if (nob < base_nob + sizeof (kib_immediate_msg_t)) {
                         CERROR ("Short IMMEDIATE from "LPX64": %d\n",
                                 conn->ibc_peer->ibp_nid, nob);
                         goto failed;
                 }
                 break;
                 
-        case OPENIBNAL_MSG_PUT_RDMA:
-        case OPENIBNAL_MSG_GET_RDMA:
-                if (nob < base_nob + sizeof (koib_rdma_msg_t)) {
+        case IBNAL_MSG_PUT_RDMA:
+        case IBNAL_MSG_GET_RDMA:
+                if (nob < base_nob + sizeof (kib_rdma_msg_t)) {
                         CERROR ("Short RDMA msg from "LPX64": %d\n",
                                 conn->ibc_peer->ibp_nid, nob);
                         goto failed;
                 }
                 if (flipped) {
-                        __swab32s(&msg->oibm_u.rdma.oibrm_desc.rd_key);
-                        __swab32s(&msg->oibm_u.rdma.oibrm_desc.rd_nob);
-                        __swab64s(&msg->oibm_u.rdma.oibrm_desc.rd_addr);
+                        __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_key);
+                        __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_nob);
+                        __swab64s(&msg->ibm_u.rdma.ibrm_desc.rd_addr);
                 }
                 CDEBUG(D_NET, "%d RDMA: cookie "LPX64", key %x, addr "LPX64", nob %d\n",
-                       msg->oibm_type, msg->oibm_u.rdma.oibrm_cookie,
-                       msg->oibm_u.rdma.oibrm_desc.rd_key,
-                       msg->oibm_u.rdma.oibrm_desc.rd_addr,
-                       msg->oibm_u.rdma.oibrm_desc.rd_nob);
+                       msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie,
+                       msg->ibm_u.rdma.ibrm_desc.rd_key,
+                       msg->ibm_u.rdma.ibrm_desc.rd_addr,
+                       msg->ibm_u.rdma.ibrm_desc.rd_nob);
                 break;
                 
-        case OPENIBNAL_MSG_PUT_DONE:
-        case OPENIBNAL_MSG_GET_DONE:
-                if (nob < base_nob + sizeof (koib_completion_msg_t)) {
+        case IBNAL_MSG_PUT_DONE:
+        case IBNAL_MSG_GET_DONE:
+                if (nob < base_nob + sizeof (kib_completion_msg_t)) {
                         CERROR ("Short COMPLETION msg from "LPX64": %d\n",
                                 conn->ibc_peer->ibp_nid, nob);
                         goto failed;
                 }
                 if (flipped)
-                        __swab32s(&msg->oibm_u.completion.oibcm_status);
+                        __swab32s(&msg->ibm_u.completion.ibcm_status);
                 
                 CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
-                       msg->oibm_type, msg->oibm_u.completion.oibcm_cookie,
-                       msg->oibm_u.completion.oibcm_status);
+                       msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
+                       msg->ibm_u.completion.ibcm_status);
 
-                koibnal_complete_passive_rdma (conn, 
-                                               msg->oibm_u.completion.oibcm_cookie,
-                                               msg->oibm_u.completion.oibcm_status);
-                koibnal_post_rx (rx, 1);
+                kibnal_complete_passive_rdma (conn, 
+                                              msg->ibm_u.completion.ibcm_cookie,
+                                              msg->ibm_u.completion.ibcm_status);
+                kibnal_post_rx (rx, 1);
                 return;
                         
         default:
                 CERROR ("Can't parse type from "LPX64": %d\n",
-                        conn->ibc_peer->ibp_nid, msg->oibm_type);
+                        conn->ibc_peer->ibp_nid, msg->ibm_type);
                 goto failed;
         }
 
-        /* schedule for koibnal_rx() in thread context */
-        spin_lock_irqsave(&koibnal_data.koib_sched_lock, flags);
+        /* schedule for kibnal_rx() in thread context */
+        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
         
-        list_add_tail (&rx->rx_list, &koibnal_data.koib_sched_rxq);
-        wake_up (&koibnal_data.koib_sched_waitq);
+        list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
+        wake_up (&kibnal_data.kib_sched_waitq);
         
-        spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags);
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
         return;
         
  failed:
         CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
-        koibnal_close_conn(conn, -ECONNABORTED);
+        kibnal_close_conn(conn, -ECONNABORTED);
 
         /* Don't re-post rx & drop its ref on conn */
-        koibnal_put_conn(conn);
+        kibnal_put_conn(conn);
 }
 
 void
-koibnal_rx (koib_rx_t *rx)
+kibnal_rx (kib_rx_t *rx)
 {
-        koib_msg_t   *msg = rx->rx_msg;
+        kib_msg_t   *msg = rx->rx_msg;
 
         /* Clear flag so I can detect if I've sent an RDMA completion */
         rx->rx_rdma = 0;
 
-        switch (msg->oibm_type) {
-        case OPENIBNAL_MSG_GET_RDMA:
-                lib_parse(&koibnal_lib, &msg->oibm_u.rdma.oibrm_hdr, rx);
+        switch (msg->ibm_type) {
+        case IBNAL_MSG_GET_RDMA:
+                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
                 /* If the incoming get was matched, I'll have initiated the
                  * RDMA and the completion message... */
                 if (rx->rx_rdma)
@@ -487,12 +491,12 @@ koibnal_rx (koib_rx_t *rx)
                  * the peer's GET blocking for the full timeout. */
                 CERROR ("Completing unmatched RDMA GET from "LPX64"\n",
                         rx->rx_conn->ibc_peer->ibp_nid);
-                koibnal_start_active_rdma (OPENIBNAL_MSG_GET_DONE, -EIO,
-                                           rx, NULL, 0, NULL, NULL, 0, 0);
+                kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO,
+                                          rx, NULL, 0, NULL, NULL, 0, 0);
                 break;
                 
-        case OPENIBNAL_MSG_PUT_RDMA:
-                lib_parse(&koibnal_lib, &msg->oibm_u.rdma.oibrm_hdr, rx);
+        case IBNAL_MSG_PUT_RDMA:
+                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
                 if (rx->rx_rdma)
                         break;
                 /* This is most unusual, since even if lib_parse() didn't
@@ -505,8 +509,8 @@ koibnal_rx (koib_rx_t *rx)
                         rx->rx_conn->ibc_peer->ibp_nid);
                 break;
 
-        case OPENIBNAL_MSG_IMMEDIATE:
-                lib_parse(&koibnal_lib, &msg->oibm_u.immediate.oibim_hdr, rx);
+        case IBNAL_MSG_IMMEDIATE:
+                lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
                 LASSERT (!rx->rx_rdma);
                 break;
                 
@@ -515,12 +519,12 @@ koibnal_rx (koib_rx_t *rx)
                 break;
         }
 
-        koibnal_post_rx (rx, 1);
+        kibnal_post_rx (rx, 1);
 }
 
 #if 0
 int
-koibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)
+kibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)
 {
         struct page *page;
 
@@ -531,7 +535,7 @@ koibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)
         else if (vaddr >= PKMAP_BASE &&
                  vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
                 page = vmalloc_to_page ((void *)vaddr);
-                /* in 2.4 ^ just walks the page tables */
+        /* in 2.4 ^ just walks the page tables */
 #endif
         else
                 page = virt_to_page (vaddr);
@@ -540,13 +544,13 @@ koibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)
             !VALID_PAGE (page))
                 return (-EFAULT);
 
-        *physp = koibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1));
+        *physp = kibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1));
         return (0);
 }
 #endif
 
 int
-koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access,
+kibnal_map_iov (kib_tx_t *tx, enum ib_memory_access access,
                  int niov, struct iovec *iov, int offset, int nob)
                  
 {
@@ -555,7 +559,7 @@ koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access,
 
         LASSERT (nob > 0);
         LASSERT (niov > 0);
-        LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED);
+        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
 
         while (offset >= iov->iov_len) {
                 offset -= iov->iov_len;
@@ -572,7 +576,7 @@ koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access,
         vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
         tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
 
-        rc = ib_memory_register (koibnal_data.koib_pd,
+        rc = ib_memory_register (kibnal_data.kib_pd,
                                  vaddr, nob,
                                  access,
                                  &tx->tx_md.md_handle.mr,
@@ -584,21 +588,21 @@ koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access,
                 return (rc);
         }
 
-        tx->tx_mapped = KOIB_TX_MAPPED;
+        tx->tx_mapped = KIB_TX_MAPPED;
         return (0);
 }
 
 int
-koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
+kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access,
                   int nkiov, ptl_kiov_t *kiov,
                   int offset, int nob)
 {
-#if OPENIBNAL_FMR
+#if IBNAL_FMR
         __u64                      *phys;
-        const int                   mapped = KOIB_TX_MAPPED_FMR;
+        const int                   mapped = KIB_TX_MAPPED_FMR;
 #else
         struct ib_physical_buffer  *phys;
-        const int                   mapped = KOIB_TX_MAPPED;
+        const int                   mapped = KIB_TX_MAPPED;
 #endif
         int                         page_offset;
         int                         nphys;
@@ -610,7 +614,7 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
 
         LASSERT (nob > 0);
         LASSERT (nkiov > 0);
-        LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED);
+        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
 
         while (offset >= kiov->kiov_len) {
                 offset -= kiov->kiov_len;
@@ -627,10 +631,10 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
         }
 
         page_offset = kiov->kiov_offset + offset;
-#if OPENIBNAL_FMR
-        phys[0] = koibnal_page2phys(kiov->kiov_page);
+#if IBNAL_FMR
+        phys[0] = kibnal_page2phys(kiov->kiov_page);
 #else
-        phys[0].address = koibnal_page2phys(kiov->kiov_page);
+        phys[0].address = kibnal_page2phys(kiov->kiov_page);
         phys[0].size = PAGE_SIZE;
 #endif
         nphys = 1;
@@ -667,10 +671,10 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
                 }
 
                 LASSERT (nphys * sizeof (*phys) < phys_size);
-#if OPENIBNAL_FMR
-                phys[nphys] = koibnal_page2phys(kiov->kiov_page);
+#if IBNAL_FMR
+                phys[nphys] = kibnal_page2phys(kiov->kiov_page);
 #else
-                phys[nphys].address = koibnal_page2phys(kiov->kiov_page);
+                phys[nphys].address = kibnal_page2phys(kiov->kiov_page);
                 phys[nphys].size = PAGE_SIZE;
 #endif
                 nphys++;
@@ -683,10 +687,10 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
         for (rc = 0; rc < nphys; rc++)
                 CWARN ("   [%d] "LPX64" / %d\n", rc, phys[rc].address, phys[rc].size);
 #endif
-        tx->tx_md.md_addr = OPENIBNAL_RDMA_BASE;
+        tx->tx_md.md_addr = IBNAL_RDMA_BASE;
 
-#if OPENIBNAL_FMR
-        rc = ib_fmr_register_physical (koibnal_data.koib_fmr_pool,
+#if IBNAL_FMR
+        rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
                                        phys, nphys,
                                        &tx->tx_md.md_addr,
                                        page_offset,
@@ -694,7 +698,7 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
                                        &tx->tx_md.md_lkey,
                                        &tx->tx_md.md_rkey);
 #else
-        rc = ib_memory_register_physical (koibnal_data.koib_pd,
+        rc = ib_memory_register_physical (kibnal_data.kib_pd,
                                           phys, nphys,
                                           &tx->tx_md.md_addr,
                                           nob, page_offset,
@@ -717,24 +721,24 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
         return (rc);
 }
 
-koib_conn_t *
-koibnal_find_conn_locked (koib_peer_t *peer)
+kib_conn_t *
+kibnal_find_conn_locked (kib_peer_t *peer)
 {
         struct list_head *tmp;
 
         /* just return the first connection */
         list_for_each (tmp, &peer->ibp_conns) {
-                return (list_entry(tmp, koib_conn_t, ibc_list));
+                return (list_entry(tmp, kib_conn_t, ibc_list));
         }
 
         return (NULL);
 }
 
 void
-koibnal_check_sends (koib_conn_t *conn)
+kibnal_check_sends (kib_conn_t *conn)
 {
         unsigned long   flags;
-        koib_tx_t      *tx;
+        kib_tx_t       *tx;
         int             rc;
         int             i;
         int             done;
@@ -742,39 +746,39 @@ koibnal_check_sends (koib_conn_t *conn)
 
         spin_lock_irqsave (&conn->ibc_lock, flags);
 
+        LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
+
         if (list_empty(&conn->ibc_tx_queue) &&
-            conn->ibc_outstanding_credits >= OPENIBNAL_CREDIT_HIGHWATER) {
+            conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
                 spin_unlock_irqrestore(&conn->ibc_lock, flags);
-
-                tx = koibnal_get_idle_tx(0);     /* don't block */
+                
+                tx = kibnal_get_idle_tx(0);     /* don't block */
                 if (tx != NULL)
-                        koibnal_init_tx_msg(tx, OPENIBNAL_MSG_NOOP, 0);
+                        kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
 
                 spin_lock_irqsave(&conn->ibc_lock, flags);
-
+                
                 if (tx != NULL) {
                         atomic_inc(&conn->ibc_refcount);
-                        koibnal_queue_tx_locked(tx, conn);
+                        kibnal_queue_tx_locked(tx, conn);
                 }
         }
 
-        LASSERT (conn->ibc_nsends_posted <= OPENIBNAL_MSG_QUEUE_SIZE);
-
         while (!list_empty (&conn->ibc_tx_queue)) {
-                tx = list_entry (conn->ibc_tx_queue.next, koib_tx_t, tx_list);
+                tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
 
                 /* We rely on this for QP sizing */
                 LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= 2);
 
                 LASSERT (conn->ibc_outstanding_credits >= 0);
-                LASSERT (conn->ibc_outstanding_credits <= OPENIBNAL_MSG_QUEUE_SIZE);
+                LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
                 LASSERT (conn->ibc_credits >= 0);
-                LASSERT (conn->ibc_credits <= OPENIBNAL_MSG_QUEUE_SIZE);
+                LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
 
                 /* Not on ibc_rdma_queue */
                 LASSERT (!tx->tx_passive_rdma_wait);
 
-                if (conn->ibc_nsends_posted == OPENIBNAL_MSG_QUEUE_SIZE)
+                if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
                         break;
 
                 if (conn->ibc_credits == 0)     /* no credits */
@@ -786,37 +790,29 @@ koibnal_check_sends (koib_conn_t *conn)
 
                 list_del (&tx->tx_list);
 
-                if (tx->tx_msg->oibm_type == OPENIBNAL_MSG_NOOP &&
+                if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
                     (!list_empty(&conn->ibc_tx_queue) ||
-                     conn->ibc_outstanding_credits < OPENIBNAL_CREDIT_HIGHWATER)) {
-                        /* Redundant NOOP */
+                     conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
+                        /* redundant NOOP */
                         spin_unlock_irqrestore(&conn->ibc_lock, flags);
-                        koibnal_tx_done(tx);
+                        kibnal_tx_done(tx);
                         spin_lock_irqsave(&conn->ibc_lock, flags);
                         continue;
                 }
-                
-                /* incoming RDMA completion can find this one now */
-                if (tx->tx_passive_rdma) {
-                        list_add (&tx->tx_list, &conn->ibc_rdma_queue);
-                        tx->tx_passive_rdma_wait = 1;
-                        tx->tx_passive_rdma_deadline = 
-                                jiffies + koibnal_tunables.koib_io_timeout * HZ;
-                }
 
-                tx->tx_msg->oibm_credits = conn->ibc_outstanding_credits;
+                tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits;
                 conn->ibc_outstanding_credits = 0;
 
-                /* use the free memory barrier when we unlock to ensure
-                 * sending set before we can get the tx callback. */
                 conn->ibc_nsends_posted++;
                 conn->ibc_credits--;
-                tx->tx_sending = tx->tx_nsp;
 
-#if OPENIBNAL_CKSUM
-                tx->tx_msg->oibm_cksum = 0;
-                tx->tx_msg->oibm_cksum = koibnal_cksum(tx->tx_msg, tx->tx_msg->oibm_nob);
-                CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->oibm_cksum, tx->tx_msg->oibm_nob);
+                tx->tx_sending = tx->tx_nsp;
+                tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
+                list_add (&tx->tx_list, &conn->ibc_active_txs);
+#if IBNAL_CKSUM
+                tx->tx_msg->ibm_cksum = 0;
+                tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob);
+                CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob);
 #endif
                 spin_unlock_irqrestore (&conn->ibc_lock, flags);
 
@@ -827,7 +823,7 @@ koibnal_check_sends (koib_conn_t *conn)
 
                 rc = -ECONNABORTED;
                 nwork = 0;
-                if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) {
+                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
                         tx->tx_status = 0;
                         /* Driver only accepts 1 item at a time */
                         for (i = 0; i < tx->tx_nsp; i++) {
@@ -842,31 +838,31 @@ koibnal_check_sends (koib_conn_t *conn)
                 if (rc != 0) {
                         /* NB credits are transferred in the actual
                          * message, which can only be the last work item */
-                        conn->ibc_outstanding_credits += tx->tx_msg->oibm_credits;
+                        conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
                         conn->ibc_credits++;
                         conn->ibc_nsends_posted--;
-                        tx->tx_sending -= tx->tx_nsp - nwork;
+
                         tx->tx_status = rc;
+                        tx->tx_passive_rdma_wait = 0;
+                        tx->tx_sending -= tx->tx_nsp - nwork;
+
                         done = (tx->tx_sending == 0);
-                        
-                        if (tx->tx_passive_rdma) {
-                                tx->tx_passive_rdma_wait = 0;
+                        if (done)
                                 list_del (&tx->tx_list);
-                        }
                         
                         spin_unlock_irqrestore (&conn->ibc_lock, flags);
                         
-                        if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED)
+                        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
                                 CERROR ("Error %d posting transmit to "LPX64"\n", 
                                         rc, conn->ibc_peer->ibp_nid);
                         else
                                 CDEBUG (D_NET, "Error %d posting transmit to "
                                         LPX64"\n", rc, conn->ibc_peer->ibp_nid);
 
-                        koibnal_close_conn (conn, rc);
+                        kibnal_close_conn (conn, rc);
 
                         if (done)
-                                koibnal_tx_done (tx);
+                                kibnal_tx_done (tx);
                         return;
                 }
                 
@@ -876,10 +872,10 @@ koibnal_check_sends (koib_conn_t *conn)
 }
 
 void
-koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
+kibnal_tx_callback (struct ib_cq_entry *e)
 {
-        koib_tx_t    *tx = (koib_tx_t *)((unsigned long)e->work_request_id);
-        koib_conn_t  *conn;
+        kib_tx_t     *tx = (kib_tx_t *)kibnal_wreqid2ptr(e->work_request_id);
+        kib_conn_t   *conn;
         unsigned long flags;
         int           idle;
 
@@ -901,6 +897,8 @@ koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
         tx->tx_sending--;
         idle = (tx->tx_sending == 0) &&         /* This is the final callback */
                (!tx->tx_passive_rdma_wait);     /* Not waiting for RDMA completion */
+        if (idle)
+                list_del(&tx->tx_list);
 
         CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
@@ -917,53 +915,62 @@ koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
         spin_unlock_irqrestore(&conn->ibc_lock, flags);
 
         if (idle)
-                koibnal_tx_done (tx);
+                kibnal_tx_done (tx);
 
         if (e->status != IB_COMPLETION_STATUS_SUCCESS) {
                 CERROR ("Tx completion to "LPX64" failed: %d\n", 
                         conn->ibc_peer->ibp_nid, e->status);
-                koibnal_close_conn (conn, -ENETDOWN);
+                kibnal_close_conn (conn, -ENETDOWN);
         } else {
                 /* can I shovel some more sends out the door? */
-                koibnal_check_sends(conn);
+                kibnal_check_sends(conn);
         }
 
-        koibnal_put_conn (conn);
+        kibnal_put_conn (conn);
 }
 
 void
-koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob)
+kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
+{
+        if (kibnal_wreqid_is_rx(e->work_request_id))
+                kibnal_rx_callback (e);
+        else
+                kibnal_tx_callback (e);
+}
+
+void
+kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
 {
         struct ib_gather_scatter *gl = &tx->tx_gl[tx->tx_nsp];
         struct ib_send_param     *sp = &tx->tx_sp[tx->tx_nsp];
         int                       fence;
-        int                       nob = offsetof (koib_msg_t, oibm_u) + body_nob;
+        int                       nob = offsetof (kib_msg_t, ibm_u) + body_nob;
 
         LASSERT (tx->tx_nsp >= 0 && 
                  tx->tx_nsp < sizeof(tx->tx_sp)/sizeof(tx->tx_sp[0]));
-        LASSERT (nob <= OPENIBNAL_MSG_SIZE);
+        LASSERT (nob <= IBNAL_MSG_SIZE);
         
-        tx->tx_msg->oibm_magic = OPENIBNAL_MSG_MAGIC;
-        tx->tx_msg->oibm_version = OPENIBNAL_MSG_VERSION;
-        tx->tx_msg->oibm_type = type;
-#if OPENIBNAL_CKSUM
-        tx->tx_msg->oibm_nob = nob;
+        tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC;
+        tx->tx_msg->ibm_version = IBNAL_MSG_VERSION;
+        tx->tx_msg->ibm_type = type;
+#if IBNAL_CKSUM
+        tx->tx_msg->ibm_nob = nob;
 #endif
         /* Fence the message if it's bundled with an RDMA read */
         fence = (tx->tx_nsp > 0) &&
-                (type == OPENIBNAL_MSG_PUT_DONE);
+                (type == IBNAL_MSG_PUT_DONE);
 
         *gl = (struct ib_gather_scatter) {
                 .address = tx->tx_vaddr,
                 .length  = nob,
-                .key     = koibnal_data.koib_tx_pages->oibp_lkey,
+                .key     = kibnal_data.kib_tx_pages->ibp_lkey,
         };
 
         /* NB If this is an RDMA read, the completion message must wait for
          * the RDMA to complete.  Sends wait for previous RDMA writes
          * anyway... */
         *sp = (struct ib_send_param) {
-                .work_request_id      = (__u64)((unsigned long)tx),
+                .work_request_id      = kibnal_ptr2wreqid(tx, 0),
                 .op                   = IB_OP_SEND,
                 .gather_list          = gl,
                 .num_gather_entries   = 1,
@@ -979,26 +986,26 @@ koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob)
 }
 
 void
-koibnal_queue_tx (koib_tx_t *tx, koib_conn_t *conn)
+kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
 {
         unsigned long         flags;
 
         spin_lock_irqsave(&conn->ibc_lock, flags);
 
-        koibnal_queue_tx_locked (tx, conn);
+        kibnal_queue_tx_locked (tx, conn);
         
         spin_unlock_irqrestore(&conn->ibc_lock, flags);
         
-        koibnal_check_sends(conn);
+        kibnal_check_sends(conn);
 }
 
 void
-koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
+kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
 {
         unsigned long    flags;
-        koib_peer_t     *peer;
-        koib_conn_t     *conn;
-        rwlock_t        *g_lock = &koibnal_data.koib_global_lock;
+        kib_peer_t      *peer;
+        kib_conn_t      *conn;
+        rwlock_t        *g_lock = &kibnal_data.kib_global_lock;
 
         /* If I get here, I've committed to send, so I complete the tx with
          * failure on any problems */
@@ -1008,15 +1015,15 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
 
         read_lock (g_lock);
         
-        peer = koibnal_find_peer_locked (nid);
+        peer = kibnal_find_peer_locked (nid);
         if (peer == NULL) {
                 read_unlock (g_lock);
                 tx->tx_status = -EHOSTUNREACH;
-                koibnal_tx_done (tx);
+                kibnal_tx_done (tx);
                 return;
         }
 
-        conn = koibnal_find_conn_locked (peer);
+        conn = kibnal_find_conn_locked (peer);
         if (conn != NULL) {
                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                        conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
@@ -1024,7 +1031,7 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
                 atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
                 read_unlock (g_lock);
                 
-                koibnal_queue_tx (tx, conn);
+                kibnal_queue_tx (tx, conn);
                 return;
         }
         
@@ -1032,15 +1039,15 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
         read_unlock (g_lock);
         write_lock_irqsave (g_lock, flags);
 
-        peer = koibnal_find_peer_locked (nid);
+        peer = kibnal_find_peer_locked (nid);
         if (peer == NULL) {
                 write_unlock_irqrestore (g_lock, flags);
                 tx->tx_status = -EHOSTUNREACH;
-                koibnal_tx_done (tx);
+                kibnal_tx_done (tx);
                 return;
         }
 
-        conn = koibnal_find_conn_locked (peer);
+        conn = kibnal_find_conn_locked (peer);
         if (conn != NULL) {
                 /* Connection exists; queue message on it */
                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
@@ -1049,7 +1056,7 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
                 atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
                 write_unlock_irqrestore (g_lock, flags);
                 
-                koibnal_queue_tx (tx, conn);
+                kibnal_queue_tx (tx, conn);
                 return;
         }
 
@@ -1057,20 +1064,20 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
                 if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
                         write_unlock_irqrestore (g_lock, flags);
                         tx->tx_status = -EHOSTUNREACH;
-                        koibnal_tx_done (tx);
+                        kibnal_tx_done (tx);
                         return;
                 }
         
                 peer->ibp_connecting = 1;
                 atomic_inc (&peer->ibp_refcount); /* extra ref for connd */
         
-                spin_lock (&koibnal_data.koib_connd_lock);
+                spin_lock (&kibnal_data.kib_connd_lock);
         
                 list_add_tail (&peer->ibp_connd_list,
-                               &koibnal_data.koib_connd_peers);
-                wake_up (&koibnal_data.koib_connd_waitq);
+                               &kibnal_data.kib_connd_peers);
+                wake_up (&kibnal_data.kib_connd_waitq);
         
-                spin_unlock (&koibnal_data.koib_connd_lock);
+                spin_unlock (&kibnal_data.kib_connd_lock);
         }
         
         /* A connection is being established; queue the message... */
@@ -1080,49 +1087,49 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
 }
 
 ptl_err_t
-koibnal_start_passive_rdma (int type, ptl_nid_t nid,
+kibnal_start_passive_rdma (int type, ptl_nid_t nid,
                             lib_msg_t *libmsg, ptl_hdr_t *hdr)
 {
         int         nob = libmsg->md->length;
-        koib_tx_t  *tx;
-        koib_msg_t *oibmsg;
+        kib_tx_t   *tx;
+        kib_msg_t  *ibmsg;
         int         rc;
         int         access;
         
-        LASSERT (type == OPENIBNAL_MSG_PUT_RDMA || 
-                 type == OPENIBNAL_MSG_GET_RDMA);
+        LASSERT (type == IBNAL_MSG_PUT_RDMA || 
+                 type == IBNAL_MSG_GET_RDMA);
         LASSERT (nob > 0);
         LASSERT (!in_interrupt());              /* Mapping could block */
 
-        if (type == OPENIBNAL_MSG_PUT_RDMA) {
+        if (type == IBNAL_MSG_PUT_RDMA) {
                 access = IB_ACCESS_REMOTE_READ;
         } else {
                 access = IB_ACCESS_REMOTE_WRITE |
                          IB_ACCESS_LOCAL_WRITE;
         }
 
-        tx = koibnal_get_idle_tx (1);           /* May block; caller is an app thread */
+        tx = kibnal_get_idle_tx (1);           /* May block; caller is an app thread */
         LASSERT (tx != NULL);
 
         if ((libmsg->md->options & PTL_MD_KIOV) == 0) 
-                rc = koibnal_map_iov (tx, access,
-                                      libmsg->md->md_niov,
-                                      libmsg->md->md_iov.iov,
-                                      0, nob);
+                rc = kibnal_map_iov (tx, access,
+                                     libmsg->md->md_niov,
+                                     libmsg->md->md_iov.iov,
+                                     0, nob);
         else
-                rc = koibnal_map_kiov (tx, access,
-                                       libmsg->md->md_niov, 
-                                       libmsg->md->md_iov.kiov,
-                                       0, nob);
+                rc = kibnal_map_kiov (tx, access,
+                                      libmsg->md->md_niov, 
+                                      libmsg->md->md_iov.kiov,
+                                      0, nob);
 
         if (rc != 0) {
                 CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc);
                 goto failed;
         }
         
-        if (type == OPENIBNAL_MSG_GET_RDMA) {
+        if (type == IBNAL_MSG_GET_RDMA) {
                 /* reply gets finalized when tx completes */
-                tx->tx_libmsg[1] = lib_create_reply_msg(&koibnal_lib, 
+                tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, 
                                                         nid, libmsg);
                 if (tx->tx_libmsg[1] == NULL) {
                         CERROR ("Can't create reply for GET -> "LPX64"\n",
@@ -1134,15 +1141,15 @@ koibnal_start_passive_rdma (int type, ptl_nid_t nid,
         
         tx->tx_passive_rdma = 1;
 
-        oibmsg = tx->tx_msg;
+        ibmsg = tx->tx_msg;
 
-        oibmsg->oibm_u.rdma.oibrm_hdr = *hdr;
-        oibmsg->oibm_u.rdma.oibrm_cookie = tx->tx_passive_rdma_cookie;
-        oibmsg->oibm_u.rdma.oibrm_desc.rd_key = tx->tx_md.md_rkey;
-        oibmsg->oibm_u.rdma.oibrm_desc.rd_addr = tx->tx_md.md_addr;
-        oibmsg->oibm_u.rdma.oibrm_desc.rd_nob = nob;
+        ibmsg->ibm_u.rdma.ibrm_hdr = *hdr;
+        ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
+        ibmsg->ibm_u.rdma.ibrm_desc.rd_key = tx->tx_md.md_rkey;
+        ibmsg->ibm_u.rdma.ibrm_desc.rd_addr = tx->tx_md.md_addr;
+        ibmsg->ibm_u.rdma.ibrm_desc.rd_nob = nob;
 
-        koibnal_init_tx_msg (tx, type, sizeof (koib_rdma_msg_t));
+        kibnal_init_tx_msg (tx, type, sizeof (kib_rdma_msg_t));
 
         CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
                LPX64", nob %d\n",
@@ -1152,25 +1159,25 @@ koibnal_start_passive_rdma (int type, ptl_nid_t nid,
         /* libmsg gets finalized when tx completes. */
         tx->tx_libmsg[0] = libmsg;
 
-        koibnal_launch_tx(tx, nid);
+        kibnal_launch_tx(tx, nid);
         return (PTL_OK);
 
  failed:
         tx->tx_status = rc;
-        koibnal_tx_done (tx);
+        kibnal_tx_done (tx);
         return (PTL_FAIL);
 }
 
 void
-koibnal_start_active_rdma (int type, int status,
-                           koib_rx_t *rx, lib_msg_t *libmsg, 
+kibnal_start_active_rdma (int type, int status,
+                           kib_rx_t *rx, lib_msg_t *libmsg, 
                            unsigned int niov,
                            struct iovec *iov, ptl_kiov_t *kiov,
                            size_t offset, size_t nob)
 {
-        koib_msg_t   *rxmsg = rx->rx_msg;
-        koib_msg_t   *txmsg;
-        koib_tx_t    *tx;
+        kib_msg_t    *rxmsg = rx->rx_msg;
+        kib_msg_t    *txmsg;
+        kib_tx_t     *tx;
         int           access;
         int           rdma_op;
         int           rc;
@@ -1187,8 +1194,8 @@ koibnal_start_active_rdma (int type, int status,
         /* No data if we're completing with failure */
         LASSERT (status == 0 || nob == 0);
 
-        LASSERT (type == OPENIBNAL_MSG_GET_DONE ||
-                 type == OPENIBNAL_MSG_PUT_DONE);
+        LASSERT (type == IBNAL_MSG_GET_DONE ||
+                 type == IBNAL_MSG_PUT_DONE);
 
         /* Flag I'm completing the RDMA.  Even if I fail to send the
          * completion message, I will have tried my best so further
@@ -1196,22 +1203,22 @@ koibnal_start_active_rdma (int type, int status,
         LASSERT (!rx->rx_rdma);
         rx->rx_rdma = 1;
 
-        if (type == OPENIBNAL_MSG_GET_DONE) {
+        if (type == IBNAL_MSG_GET_DONE) {
                 access   = 0;
                 rdma_op  = IB_OP_RDMA_WRITE;
-                LASSERT (rxmsg->oibm_type == OPENIBNAL_MSG_GET_RDMA);
+                LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);
         } else {
                 access   = IB_ACCESS_LOCAL_WRITE;
                 rdma_op  = IB_OP_RDMA_READ;
-                LASSERT (rxmsg->oibm_type == OPENIBNAL_MSG_PUT_RDMA);
+                LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
         }
 
-        tx = koibnal_get_idle_tx (0);           /* Mustn't block */
+        tx = kibnal_get_idle_tx (0);           /* Mustn't block */
         if (tx == NULL) {
                 CERROR ("tx descs exhausted on RDMA from "LPX64
                         " completing locally with failure\n",
-                         rx->rx_conn->ibc_peer->ibp_nid);
-                lib_finalize (&koibnal_lib, NULL, libmsg, PTL_NO_SPACE);
+                        rx->rx_conn->ibc_peer->ibp_nid);
+                lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE);
                 return;
         }
         LASSERT (tx->tx_nsp == 0);
@@ -1222,11 +1229,11 @@ koibnal_start_active_rdma (int type, int status,
                  * message is matched) */
 
                 if (kiov != NULL)
-                        rc = koibnal_map_kiov (tx, access,
-                                               niov, kiov, offset, nob);
+                        rc = kibnal_map_kiov (tx, access,
+                                              niov, kiov, offset, nob);
                 else
-                        rc = koibnal_map_iov (tx, access,
-                                              niov, iov, offset, nob);
+                        rc = kibnal_map_iov (tx, access,
+                                             niov, iov, offset, nob);
                 
                 if (rc != 0) {
                         CERROR ("Can't map RDMA -> "LPX64": %d\n", 
@@ -1242,12 +1249,12 @@ koibnal_start_active_rdma (int type, int status,
                         };
                 
                         tx->tx_sp[0] = (struct ib_send_param) {
-                                .work_request_id      = (__u64)((unsigned long)tx),
+                                .work_request_id      = kibnal_ptr2wreqid(tx, 0),
                                 .op                   = rdma_op,
                                 .gather_list          = &tx->tx_gl[0],
                                 .num_gather_entries   = 1,
-                                .remote_address       = rxmsg->oibm_u.rdma.oibrm_desc.rd_addr,
-                                .rkey                 = rxmsg->oibm_u.rdma.oibrm_desc.rd_key,
+                                .remote_address       = rxmsg->ibm_u.rdma.ibrm_desc.rd_addr,
+                                .rkey                 = rxmsg->ibm_u.rdma.ibrm_desc.rd_key,
                                 .device_specific      = NULL,
                                 .solicited_event      = 0,
                                 .signaled             = 1,
@@ -1262,10 +1269,10 @@ koibnal_start_active_rdma (int type, int status,
 
         txmsg = tx->tx_msg;
 
-        txmsg->oibm_u.completion.oibcm_cookie = rxmsg->oibm_u.rdma.oibrm_cookie;
-        txmsg->oibm_u.completion.oibcm_status = status;
+        txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
+        txmsg->ibm_u.completion.ibcm_status = status;
         
-        koibnal_init_tx_msg(tx, type, sizeof (koib_completion_msg_t));
+        kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
 
         if (status == 0 && nob != 0) {
                 LASSERT (tx->tx_nsp > 1);
@@ -1277,7 +1284,7 @@ koibnal_start_active_rdma (int type, int status,
                 LASSERT (tx->tx_nsp == 1);
                 /* No RDMA: local completion happens now! */
                 CDEBUG(D_WARNING,"No data: immediate completion\n");
-                lib_finalize (&koibnal_lib, NULL, libmsg,
+                lib_finalize (&kibnal_lib, NULL, libmsg,
                               status == 0 ? PTL_OK : PTL_FAIL);
         }
 
@@ -1288,11 +1295,11 @@ koibnal_start_active_rdma (int type, int status,
                atomic_read (&rx->rx_conn->ibc_refcount));
         atomic_inc (&rx->rx_conn->ibc_refcount);
         /* ...and queue it up */
-        koibnal_queue_tx(tx, rx->rx_conn);
+        kibnal_queue_tx(tx, rx->rx_conn);
 }
 
 ptl_err_t
-koibnal_sendmsg(lib_nal_t    *nal, 
+kibnal_sendmsg(lib_nal_t    *nal, 
                 void         *private,
                 lib_msg_t    *libmsg,
                 ptl_hdr_t    *hdr, 
@@ -1305,8 +1312,8 @@ koibnal_sendmsg(lib_nal_t    *nal,
                 size_t        payload_offset,
                 size_t        payload_nob)
 {
-        koib_msg_t *oibmsg;
-        koib_tx_t  *tx;
+        kib_msg_t  *ibmsg;
+        kib_tx_t   *tx;
         int         nob;
 
         /* NB 'private' is different depending on what we're sending.... */
@@ -1329,27 +1336,27 @@ koibnal_sendmsg(lib_nal_t    *nal,
                 
         case PTL_MSG_REPLY: {
                 /* reply's 'private' is the incoming receive */
-                koib_rx_t *rx = private;
+                kib_rx_t *rx = private;
 
                 /* RDMA reply expected? */
-                if (rx->rx_msg->oibm_type == OPENIBNAL_MSG_GET_RDMA) {
-                        koibnal_start_active_rdma(OPENIBNAL_MSG_GET_DONE, 0,
-                                                  rx, libmsg, payload_niov, 
-                                                  payload_iov, payload_kiov,
-                                                  payload_offset, payload_nob);
+                if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
+                        kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
+                                                 rx, libmsg, payload_niov, 
+                                                 payload_iov, payload_kiov,
+                                                 payload_offset, payload_nob);
                         return (PTL_OK);
                 }
                 
                 /* Incoming message consistent with immediate reply? */
-                if (rx->rx_msg->oibm_type != OPENIBNAL_MSG_IMMEDIATE) {
+                if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
                         CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n",
-                                nid, rx->rx_msg->oibm_type);
+                                nid, rx->rx_msg->ibm_type);
                         return (PTL_FAIL);
                 }
 
                 /* Will it fit in a message? */
-                nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[payload_nob]);
-                if (nob >= OPENIBNAL_MSG_SIZE) {
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+                if (nob >= IBNAL_MSG_SIZE) {
                         CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n", 
                                nid, payload_nob);
                         return (PTL_FAIL);
@@ -1359,10 +1366,10 @@ koibnal_sendmsg(lib_nal_t    *nal,
 
         case PTL_MSG_GET:
                 /* might the REPLY message be big enough to need RDMA? */
-                nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[libmsg->md->length]);
-                if (nob > OPENIBNAL_MSG_SIZE)
-                        return (koibnal_start_passive_rdma(OPENIBNAL_MSG_GET_RDMA, 
-                                                           nid, libmsg, hdr));
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
+                if (nob > IBNAL_MSG_SIZE)
+                        return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, 
+                                                          nid, libmsg, hdr));
                 break;
 
         case PTL_MSG_ACK:
@@ -1371,181 +1378,181 @@ koibnal_sendmsg(lib_nal_t    *nal,
 
         case PTL_MSG_PUT:
                 /* Is the payload big enough to need RDMA? */
-                nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[payload_nob]);
-                if (nob > OPENIBNAL_MSG_SIZE)
-                        return (koibnal_start_passive_rdma(OPENIBNAL_MSG_PUT_RDMA,
-                                                           nid, libmsg, hdr));
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+                if (nob > IBNAL_MSG_SIZE)
+                        return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
+                                                          nid, libmsg, hdr));
                 
                 break;
         }
 
-        tx = koibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
-                                   type == PTL_MSG_REPLY ||
-                                   in_interrupt()));
+        tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
+                                  type == PTL_MSG_REPLY ||
+                                  in_interrupt()));
         if (tx == NULL) {
                 CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", 
                         type, nid, in_interrupt() ? " (intr)" : "");
                 return (PTL_NO_SPACE);
         }
 
-        oibmsg = tx->tx_msg;
-        oibmsg->oibm_u.immediate.oibim_hdr = *hdr;
+        ibmsg = tx->tx_msg;
+        ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
 
         if (payload_nob > 0) {
                 if (payload_kiov != NULL)
-                        lib_copy_kiov2buf(oibmsg->oibm_u.immediate.oibim_payload,
+                        lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
                                           payload_niov, payload_kiov,
                                           payload_offset, payload_nob);
                 else
-                        lib_copy_iov2buf(oibmsg->oibm_u.immediate.oibim_payload,
+                        lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
                                          payload_niov, payload_iov,
                                          payload_offset, payload_nob);
         }
 
-        koibnal_init_tx_msg (tx, OPENIBNAL_MSG_IMMEDIATE,
-                             offsetof(koib_immediate_msg_t, 
-                                      oibim_payload[payload_nob]));
+        kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
+                            offsetof(kib_immediate_msg_t, 
+                                     ibim_payload[payload_nob]));
 
         /* libmsg gets finalized when tx completes */
         tx->tx_libmsg[0] = libmsg;
 
-        koibnal_launch_tx(tx, nid);
+        kibnal_launch_tx(tx, nid);
         return (PTL_OK);
 }
 
 ptl_err_t
-koibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
+kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
                ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
                unsigned int payload_niov, struct iovec *payload_iov,
                size_t payload_offset, size_t payload_len)
 {
-        return (koibnal_sendmsg(nal, private, cookie,
-                                 hdr, type, nid, pid,
-                                 payload_niov, payload_iov, NULL,
-                                 payload_offset, payload_len));
+        return (kibnal_sendmsg(nal, private, cookie,
+                               hdr, type, nid, pid,
+                               payload_niov, payload_iov, NULL,
+                               payload_offset, payload_len));
 }
 
 ptl_err_t
-koibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
+kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
                      ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
                      unsigned int payload_niov, ptl_kiov_t *payload_kiov, 
                      size_t payload_offset, size_t payload_len)
 {
-        return (koibnal_sendmsg(nal, private, cookie,
-                                 hdr, type, nid, pid,
-                                 payload_niov, NULL, payload_kiov,
-                                 payload_offset, payload_len));
+        return (kibnal_sendmsg(nal, private, cookie,
+                               hdr, type, nid, pid,
+                               payload_niov, NULL, payload_kiov,
+                               payload_offset, payload_len));
 }
 
 ptl_err_t
-koibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
+kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
                  unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
                  size_t offset, size_t mlen, size_t rlen)
 {
-        koib_rx_t                *rx = private;
-        koib_msg_t               *rxmsg = rx->rx_msg;
-        int                       msg_nob;
+        kib_rx_t    *rx = private;
+        kib_msg_t   *rxmsg = rx->rx_msg;
+        int          msg_nob;
         
         LASSERT (mlen <= rlen);
         LASSERT (!in_interrupt ());
         /* Either all pages or all vaddrs */
         LASSERT (!(kiov != NULL && iov != NULL));
 
-        switch (rxmsg->oibm_type) {
+        switch (rxmsg->ibm_type) {
         default:
                 LBUG();
                 return (PTL_FAIL);
                 
-        case OPENIBNAL_MSG_IMMEDIATE:
-                msg_nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[rlen]);
-                if (msg_nob > OPENIBNAL_MSG_SIZE) {
+        case IBNAL_MSG_IMMEDIATE:
+                msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+                if (msg_nob > IBNAL_MSG_SIZE) {
                         CERROR ("Immediate message from "LPX64" too big: %d\n",
-                                rxmsg->oibm_u.immediate.oibim_hdr.src_nid, rlen);
+                                rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
                         return (PTL_FAIL);
                 }
 
                 if (kiov != NULL)
                         lib_copy_buf2kiov(niov, kiov, offset,
-                                          rxmsg->oibm_u.immediate.oibim_payload,
+                                          rxmsg->ibm_u.immediate.ibim_payload,
                                           mlen);
                 else
                         lib_copy_buf2iov(niov, iov, offset,
-                                         rxmsg->oibm_u.immediate.oibim_payload,
+                                         rxmsg->ibm_u.immediate.ibim_payload,
                                          mlen);
 
                 lib_finalize (nal, NULL, libmsg, PTL_OK);
                 return (PTL_OK);
 
-        case OPENIBNAL_MSG_GET_RDMA:
+        case IBNAL_MSG_GET_RDMA:
                 /* We get called here just to discard any junk after the
                  * GET hdr. */
                 LASSERT (libmsg == NULL);
                 lib_finalize (nal, NULL, libmsg, PTL_OK);
                 return (PTL_OK);
 
-        case OPENIBNAL_MSG_PUT_RDMA:
-                koibnal_start_active_rdma (OPENIBNAL_MSG_PUT_DONE, 0,
-                                           rx, libmsg, 
-                                           niov, iov, kiov, offset, mlen);
+        case IBNAL_MSG_PUT_RDMA:
+                kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0,
+                                          rx, libmsg, 
+                                          niov, iov, kiov, offset, mlen);
                 return (PTL_OK);
         }
 }
 
 ptl_err_t
-koibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
+kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
               unsigned int niov, struct iovec *iov, 
               size_t offset, size_t mlen, size_t rlen)
 {
-        return (koibnal_recvmsg (nal, private, msg, niov, iov, NULL,
-                                 offset, mlen, rlen));
+        return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
+                                offset, mlen, rlen));
 }
 
 ptl_err_t
-koibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
+kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
                      unsigned int niov, ptl_kiov_t *kiov, 
                      size_t offset, size_t mlen, size_t rlen)
 {
-        return (koibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
-                                 offset, mlen, rlen));
+        return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
+                                offset, mlen, rlen));
 }
 
 int
-koibnal_thread_start (int (*fn)(void *arg), void *arg)
+kibnal_thread_start (int (*fn)(void *arg), void *arg)
 {
         long    pid = kernel_thread (fn, arg, 0);
 
         if (pid < 0)
                 return ((int)pid);
 
-        atomic_inc (&koibnal_data.koib_nthreads);
+        atomic_inc (&kibnal_data.kib_nthreads);
         return (0);
 }
 
 void
-koibnal_thread_fini (void)
+kibnal_thread_fini (void)
 {
-        atomic_dec (&koibnal_data.koib_nthreads);
+        atomic_dec (&kibnal_data.kib_nthreads);
 }
 
 void
-koibnal_close_conn_locked (koib_conn_t *conn, int error)
+kibnal_close_conn_locked (kib_conn_t *conn, int error)
 {
         /* This just does the immmediate housekeeping, and schedules the
          * connection for the connd to finish off.
-         * Caller holds koib_global_lock exclusively in irq context */
-        koib_peer_t   *peer = conn->ibc_peer;
+         * Caller holds kib_global_lock exclusively in irq context */
+        kib_peer_t   *peer = conn->ibc_peer;
 
         CDEBUG (error == 0 ? D_NET : D_ERROR,
                 "closing conn to "LPX64": error %d\n", peer->ibp_nid, error);
         
-        LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED ||
-                 conn->ibc_state == OPENIBNAL_CONN_CONNECTING);
+        LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED ||
+                 conn->ibc_state == IBNAL_CONN_CONNECTING);
 
-        if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) {
-                /* koib_connd_conns takes ibc_list's ref */
+        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+                /* kib_connd_conns takes ibc_list's ref */
                 list_del (&conn->ibc_list);
         } else {
-                /* new ref for koib_connd_conns */
+                /* new ref for kib_connd_conns */
                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                        conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
                        atomic_read (&conn->ibc_refcount));
@@ -1555,57 +1562,57 @@ koibnal_close_conn_locked (koib_conn_t *conn, int error)
         if (list_empty (&peer->ibp_conns) &&
             peer->ibp_persistence == 0) {
                 /* Non-persistent peer with no more conns... */
-                koibnal_unlink_peer_locked (peer);
+                kibnal_unlink_peer_locked (peer);
         }
 
-        conn->ibc_state = OPENIBNAL_CONN_DEATHROW;
+        conn->ibc_state = IBNAL_CONN_DEATHROW;
 
         /* Schedule conn for closing/destruction */
-        spin_lock (&koibnal_data.koib_connd_lock);
+        spin_lock (&kibnal_data.kib_connd_lock);
 
-        list_add_tail (&conn->ibc_list, &koibnal_data.koib_connd_conns);
-        wake_up (&koibnal_data.koib_connd_waitq);
+        list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+        wake_up (&kibnal_data.kib_connd_waitq);
                 
-        spin_unlock (&koibnal_data.koib_connd_lock);
+        spin_unlock (&kibnal_data.kib_connd_lock);
 }
 
 int
-koibnal_close_conn (koib_conn_t *conn, int why)
+kibnal_close_conn (kib_conn_t *conn, int why)
 {
         unsigned long     flags;
         int               count = 0;
 
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
-        LASSERT (conn->ibc_state >= OPENIBNAL_CONN_CONNECTING);
+        LASSERT (conn->ibc_state >= IBNAL_CONN_CONNECTING);
         
-        if (conn->ibc_state <= OPENIBNAL_CONN_ESTABLISHED) {
+        if (conn->ibc_state <= IBNAL_CONN_ESTABLISHED) {
                 count = 1;
-                koibnal_close_conn_locked (conn, why);
+                kibnal_close_conn_locked (conn, why);
         }
         
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
         return (count);
 }
 
 void
-koibnal_peer_connect_failed (koib_peer_t *peer, int active, int rc)
+kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
 {
         LIST_HEAD        (zombies);
-        koib_tx_t        *tx;
+        kib_tx_t         *tx;
         unsigned long     flags;
 
         LASSERT (rc != 0);
-        LASSERT (peer->ibp_reconnect_interval >= OPENIBNAL_MIN_RECONNECT_INTERVAL);
+        LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
 
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
         LASSERT (peer->ibp_connecting != 0);
         peer->ibp_connecting--;
 
         if (peer->ibp_connecting != 0) {
                 /* another connection attempt under way (loopback?)... */
-                write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
                 return;
         }
 
@@ -1614,50 +1621,50 @@ koibnal_peer_connect_failed (koib_peer_t *peer, int active, int rc)
                 peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
                 /* Increase reconnection interval */
                 peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
-                                                    OPENIBNAL_MAX_RECONNECT_INTERVAL);
+                                                    IBNAL_MAX_RECONNECT_INTERVAL);
         
                 /* Take peer's blocked blocked transmits; I'll complete
                  * them with error */
                 while (!list_empty (&peer->ibp_tx_queue)) {
                         tx = list_entry (peer->ibp_tx_queue.next,
-                                         koib_tx_t, tx_list);
+                                         kib_tx_t, tx_list);
                         
                         list_del (&tx->tx_list);
                         list_add_tail (&tx->tx_list, &zombies);
                 }
                 
-                if (koibnal_peer_active(peer) &&
+                if (kibnal_peer_active(peer) &&
                     (peer->ibp_persistence == 0)) {
                         /* failed connection attempt on non-persistent peer */
-                        koibnal_unlink_peer_locked (peer);
+                        kibnal_unlink_peer_locked (peer);
                 }
         } else {
                 /* Can't have blocked transmits if there are connections */
                 LASSERT (list_empty(&peer->ibp_tx_queue));
         }
         
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
         if (!list_empty (&zombies))
                 CERROR ("Deleting messages for "LPX64": connection failed\n",
                         peer->ibp_nid);
 
         while (!list_empty (&zombies)) {
-                tx = list_entry (zombies.next, koib_tx_t, tx_list);
+                tx = list_entry (zombies.next, kib_tx_t, tx_list);
 
                 list_del (&tx->tx_list);
                 /* complete now */
                 tx->tx_status = -EHOSTUNREACH;
-                koibnal_tx_done (tx);
+                kibnal_tx_done (tx);
         }
 }
 
 void
-koibnal_connreq_done (koib_conn_t *conn, int active, int status)
+kibnal_connreq_done (kib_conn_t *conn, int active, int status)
 {
         int               state = conn->ibc_state;
-        koib_peer_t      *peer = conn->ibc_peer;
-        koib_tx_t        *tx;
+        kib_peer_t       *peer = conn->ibc_peer;
+        kib_tx_t         *tx;
         unsigned long     flags;
         int               rc;
         int               i;
@@ -1669,31 +1676,31 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status)
                 conn->ibc_connreq = NULL;
         }
 
-        if (state == OPENIBNAL_CONN_CONNECTING) {
+        if (state == IBNAL_CONN_CONNECTING) {
                 /* Install common (active/passive) callback for
                  * disconnect/idle notification if I got as far as getting
                  * a CM comm_id */
                 rc = tsIbCmCallbackModify(conn->ibc_comm_id, 
-                                          koibnal_conn_callback, conn);
+                                          kibnal_conn_callback, conn);
                 LASSERT (rc == 0);
         }
         
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
         LASSERT (peer->ibp_connecting != 0);
         
         if (status == 0) {                         
                 /* connection established... */
-                LASSERT (state == OPENIBNAL_CONN_CONNECTING);
-                conn->ibc_state = OPENIBNAL_CONN_ESTABLISHED;
+                LASSERT (state == IBNAL_CONN_CONNECTING);
+                conn->ibc_state = IBNAL_CONN_ESTABLISHED;
 
-                if (!koibnal_peer_active(peer)) {
+                if (!kibnal_peer_active(peer)) {
                         /* ...but peer deleted meantime */
                         status = -ECONNABORTED;
                 }
         } else {
-                LASSERT (state == OPENIBNAL_CONN_INIT_QP ||
-                         state == OPENIBNAL_CONN_CONNECTING);
+                LASSERT (state == IBNAL_CONN_INIT_QP ||
+                         state == IBNAL_CONN_CONNECTING);
         }
 
         if (status == 0) {
@@ -1710,14 +1717,14 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status)
                 list_add (&conn->ibc_list, &peer->ibp_conns);
                 
                 /* reset reconnect interval for next attempt */
-                peer->ibp_reconnect_interval = OPENIBNAL_MIN_RECONNECT_INTERVAL;
+                peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
 
                 /* post blocked sends to the new connection */
                 spin_lock (&conn->ibc_lock);
                 
                 while (!list_empty (&peer->ibp_tx_queue)) {
                         tx = list_entry (peer->ibp_tx_queue.next, 
-                                         koib_tx_t, tx_list);
+                                         kib_tx_t, tx_list);
                         
                         list_del (&tx->tx_list);
 
@@ -1726,19 +1733,19 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status)
                                conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
                                atomic_read (&conn->ibc_refcount));
                         atomic_inc (&conn->ibc_refcount);
-                        koibnal_queue_tx_locked (tx, conn);
+                        kibnal_queue_tx_locked (tx, conn);
                 }
                 
                 spin_unlock (&conn->ibc_lock);
 
                 /* Nuke any dangling conns from a different peer instance... */
-                koibnal_close_stale_conns_locked (conn->ibc_peer,
-                                                  conn->ibc_incarnation);
+                kibnal_close_stale_conns_locked (conn->ibc_peer,
+                                                 conn->ibc_incarnation);
 
-                write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
                 /* queue up all the receives */
-                for (i = 0; i < OPENIBNAL_RX_MSGS; i++) {
+                for (i = 0; i < IBNAL_RX_MSGS; i++) {
                         /* +1 ref for rx desc */
                         CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                                conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
@@ -1749,71 +1756,71 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status)
                                i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg,
                                conn->ibc_rxs[i].rx_vaddr);
 
-                        koibnal_post_rx (&conn->ibc_rxs[i], 0);
+                        kibnal_post_rx (&conn->ibc_rxs[i], 0);
                 }
 
-                koibnal_check_sends (conn);
+                kibnal_check_sends (conn);
                 return;
         }
 
         /* connection failed */
-        if (state == OPENIBNAL_CONN_CONNECTING) {
+        if (state == IBNAL_CONN_CONNECTING) {
                 /* schedule for connd to close */
-                koibnal_close_conn_locked (conn, status);
+                kibnal_close_conn_locked (conn, status);
         } else {
                 /* Don't have a CM comm_id; just wait for refs to drain */
-                conn->ibc_state = OPENIBNAL_CONN_ZOMBIE;
+                conn->ibc_state = IBNAL_CONN_ZOMBIE;
         } 
 
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
-        koibnal_peer_connect_failed (conn->ibc_peer, active, status);
+        kibnal_peer_connect_failed (conn->ibc_peer, active, status);
 
-        if (state != OPENIBNAL_CONN_CONNECTING) {
+        if (state != IBNAL_CONN_CONNECTING) {
                 /* drop caller's ref if we're not waiting for the
                  * IB_CM_IDLE callback */
-                koibnal_put_conn (conn);
+                kibnal_put_conn (conn);
         }
 }
 
 int
-koibnal_accept (koib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
+kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
                 ptl_nid_t nid, __u64 incarnation, int queue_depth)
 {
-        koib_conn_t   *conn = koibnal_create_conn();
-        koib_peer_t   *peer;
-        koib_peer_t   *peer2;
+        kib_conn_t    *conn = kibnal_create_conn();
+        kib_peer_t    *peer;
+        kib_peer_t    *peer2;
         unsigned long  flags;
 
         if (conn == NULL)
                 return (-ENOMEM);
 
-        if (queue_depth != OPENIBNAL_MSG_QUEUE_SIZE) {
+        if (queue_depth != IBNAL_MSG_QUEUE_SIZE) {
                 CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n",
-                       nid, queue_depth, OPENIBNAL_MSG_QUEUE_SIZE);
+                       nid, queue_depth, IBNAL_MSG_QUEUE_SIZE);
                 return (-EPROTO);
         }
         
         /* assume 'nid' is a new peer */
-        peer = koibnal_create_peer (nid);
+        peer = kibnal_create_peer (nid);
         if (peer == NULL) {
                 CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n",
                        conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
                        atomic_read (&conn->ibc_refcount));
                 atomic_dec (&conn->ibc_refcount);
-                koibnal_destroy_conn(conn);
+                kibnal_destroy_conn(conn);
                 return (-ENOMEM);
         }
         
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
-        peer2 = koibnal_find_peer_locked(nid);
+        peer2 = kibnal_find_peer_locked(nid);
         if (peer2 == NULL) {
                 /* peer table takes my ref on peer */
                 list_add_tail (&peer->ibp_list,
-                               koibnal_nid2peerlist(nid));
+                               kibnal_nid2peerlist(nid));
         } else {
-                koibnal_put_peer (peer);
+                kibnal_put_peer (peer);
                 peer = peer2;
         }
 
@@ -1821,20 +1828,20 @@ koibnal_accept (koib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
         atomic_inc (&peer->ibp_refcount);
         peer->ibp_connecting++;
 
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
         conn->ibc_peer = peer;
-        conn->ibc_state = OPENIBNAL_CONN_CONNECTING;
+        conn->ibc_state = IBNAL_CONN_CONNECTING;
         conn->ibc_comm_id = cid;
         conn->ibc_incarnation = incarnation;
-        conn->ibc_credits = OPENIBNAL_MSG_QUEUE_SIZE;
+        conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
 
         *connp = conn;
         return (0);
 }
 
 tTS_IB_CM_CALLBACK_RETURN
-koibnal_idle_conn_callback (tTS_IB_CM_EVENT event,
+kibnal_idle_conn_callback (tTS_IB_CM_EVENT event,
                             tTS_IB_CM_COMM_ID cid,
                             void *param,
                             void *arg)
@@ -1846,13 +1853,19 @@ koibnal_idle_conn_callback (tTS_IB_CM_EVENT event,
 }
 
 tTS_IB_CM_CALLBACK_RETURN
-koibnal_conn_callback (tTS_IB_CM_EVENT event,
+kibnal_conn_callback (tTS_IB_CM_EVENT event,
                        tTS_IB_CM_COMM_ID cid,
                        void *param,
                        void *arg)
 {
-        koib_conn_t *conn = arg;
-        int          rc;
+        kib_conn_t       *conn = arg;
+        LIST_HEAD        (zombies); 
+        struct list_head *tmp;
+        struct list_head *nxt;
+        kib_tx_t         *tx;
+        unsigned long     flags;
+        int               done;
+        int               rc;
 
         /* Established Connection Notifier */
 
@@ -1860,24 +1873,72 @@ koibnal_conn_callback (tTS_IB_CM_EVENT event,
         default:
                 CERROR("Connection %p -> "LPX64" ERROR %d\n",
                        conn, conn->ibc_peer->ibp_nid, event);
-                koibnal_close_conn (conn, -ECONNABORTED);
+                kibnal_close_conn (conn, -ECONNABORTED);
                 break;
                 
         case TS_IB_CM_DISCONNECTED:
                 CDEBUG(D_WARNING, "Connection %p -> "LPX64" DISCONNECTED.\n",
                        conn, conn->ibc_peer->ibp_nid);
-                koibnal_close_conn (conn, 0);
+                kibnal_close_conn (conn, 0);
                 break;
 
         case TS_IB_CM_IDLE:
                 CDEBUG(D_NET, "Connection %p -> "LPX64" IDLE.\n",
                        conn, conn->ibc_peer->ibp_nid);
-                koibnal_put_conn (conn);        /* Lose CM's ref */
+                kibnal_put_conn (conn);        /* Lose CM's ref */
 
                 /* LASSERT (no further callbacks) */
                 rc = tsIbCmCallbackModify(cid, 
-                                          koibnal_idle_conn_callback, conn);
+                                          kibnal_idle_conn_callback, conn);
                 LASSERT (rc == 0);
+
+                /* NB we wait until the connection has closed before
+                 * completing outstanding passive RDMAs so we can be sure
+                 * the network can't touch the mapped memory any more. */
+
+                spin_lock_irqsave (&conn->ibc_lock, flags);
+
+                /* grab passive RDMAs not waiting for the tx callback */
+                list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
+                        tx = list_entry (tmp, kib_tx_t, tx_list);
+
+                        LASSERT (tx->tx_passive_rdma ||
+                                 !tx->tx_passive_rdma_wait);
+
+                        LASSERT (tx->tx_passive_rdma_wait ||
+                                 tx->tx_sending != 0);
+
+                        /* still waiting for tx callback? */
+                        if (!tx->tx_passive_rdma_wait)
+                                continue;
+
+                        tx->tx_status = -ECONNABORTED;
+                        tx->tx_passive_rdma_wait = 0;
+                        done = (tx->tx_sending == 0);
+
+                        if (!done)
+                                continue;
+
+                        list_del (&tx->tx_list);
+                        list_add (&tx->tx_list, &zombies);
+                }
+
+                /* grab all blocked transmits */
+                list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
+                        tx = list_entry (tmp, kib_tx_t, tx_list);
+                        
+                        list_del (&tx->tx_list);
+                        list_add (&tx->tx_list, &zombies);
+                }
+                
+                spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+                while (!list_empty(&zombies)) {
+                        tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+                        list_del(&tx->tx_list);
+                        kibnal_tx_done (tx);
+                }
                 break;
         }
 
@@ -1885,12 +1946,12 @@ koibnal_conn_callback (tTS_IB_CM_EVENT event,
 }
 
 tTS_IB_CM_CALLBACK_RETURN
-koibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
+kibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
                                tTS_IB_CM_COMM_ID cid,
                                void *param,
                                void *arg)
 {
-        koib_conn_t *conn = arg;
+        kib_conn_t *conn = arg;
         int          rc;
         
         switch (event) {
@@ -1903,12 +1964,12 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
                 
                 CERROR ("Unexpected event %p -> "LPX64": %d\n", 
                         conn, conn->ibc_peer->ibp_nid, event);
-                koibnal_connreq_done (conn, 0, -ECONNABORTED);
+                kibnal_connreq_done (conn, 0, -ECONNABORTED);
                 break;
                 
         case TS_IB_CM_REQ_RECEIVED: {
                 struct ib_cm_req_received_param *req = param;
-                koib_wire_connreq_t             *wcr = req->remote_private_data;
+                kib_wire_connreq_t             *wcr = req->remote_private_data;
 
                 LASSERT (conn == NULL);
 
@@ -1920,23 +1981,23 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
                         return TS_IB_CM_CALLBACK_ABORT;
                 }
 
-                if (wcr->wcr_magic != cpu_to_le32(OPENIBNAL_MSG_MAGIC)) {
+                if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
                         CERROR ("Can't accept LID %04x: bad magic %08x\n",
                                 req->dlid, le32_to_cpu(wcr->wcr_magic));
                         return TS_IB_CM_CALLBACK_ABORT;
                 }
                 
-                if (wcr->wcr_version != cpu_to_le16(OPENIBNAL_MSG_VERSION)) {
+                if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
                         CERROR ("Can't accept LID %04x: bad version %d\n",
                                 req->dlid, le16_to_cpu(wcr->wcr_magic));
                         return TS_IB_CM_CALLBACK_ABORT;
                 }
                                 
-                rc = koibnal_accept(&conn,
-                                    cid,
-                                    le64_to_cpu(wcr->wcr_nid),
-                                    le64_to_cpu(wcr->wcr_incarnation),
-                                    le16_to_cpu(wcr->wcr_queue_depth));
+                rc = kibnal_accept(&conn,
+                                   cid,
+                                   le64_to_cpu(wcr->wcr_nid),
+                                   le64_to_cpu(wcr->wcr_incarnation),
+                                   le16_to_cpu(wcr->wcr_queue_depth));
                 if (rc != 0) {
                         CERROR ("Can't accept "LPX64": %d\n",
                                 le64_to_cpu(wcr->wcr_nid), rc);
@@ -1945,23 +2006,23 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
 
                 /* update 'arg' for next callback */
                 rc = tsIbCmCallbackModify(cid, 
-                                          koibnal_passive_conn_callback, conn);
+                                          kibnal_passive_conn_callback, conn);
                 LASSERT (rc == 0);
 
                 req->accept_param.qp                     = conn->ibc_qp;
-                *((koib_wire_connreq_t *)req->accept_param.reply_private_data)
-                        = (koib_wire_connreq_t) {
-                                .wcr_magic       = cpu_to_le32(OPENIBNAL_MSG_MAGIC),
-                                .wcr_version     = cpu_to_le16(OPENIBNAL_MSG_VERSION),
-                                .wcr_queue_depth = cpu_to_le32(OPENIBNAL_MSG_QUEUE_SIZE),
-                                .wcr_nid         = cpu_to_le64(koibnal_data.koib_nid),
-                                .wcr_incarnation = cpu_to_le64(koibnal_data.koib_incarnation),
+                *((kib_wire_connreq_t *)req->accept_param.reply_private_data)
+                        = (kib_wire_connreq_t) {
+                                .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
+                                .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
+                                .wcr_queue_depth = cpu_to_le32(IBNAL_MSG_QUEUE_SIZE),
+                                .wcr_nid         = cpu_to_le64(kibnal_data.kib_nid),
+                                .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
                         };
-                req->accept_param.reply_private_data_len = sizeof(koib_wire_connreq_t);
-                req->accept_param.responder_resources    = OPENIBNAL_RESPONDER_RESOURCES;
-                req->accept_param.initiator_depth        = OPENIBNAL_RESPONDER_RESOURCES;
-                req->accept_param.rnr_retry_count        = OPENIBNAL_RNR_RETRY;
-                req->accept_param.flow_control           = OPENIBNAL_FLOW_CONTROL;
+                req->accept_param.reply_private_data_len = sizeof(kib_wire_connreq_t);
+                req->accept_param.responder_resources    = IBNAL_RESPONDER_RESOURCES;
+                req->accept_param.initiator_depth        = IBNAL_RESPONDER_RESOURCES;
+                req->accept_param.rnr_retry_count        = IBNAL_RNR_RETRY;
+                req->accept_param.flow_control           = IBNAL_FLOW_CONTROL;
 
                 CDEBUG(D_NET, "Proceeding\n");
                 break;
@@ -1972,60 +2033,60 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
                 CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n",
                        conn, conn->ibc_peer->ibp_nid);
 
-                koibnal_connreq_done (conn, 0, 0);
+                kibnal_connreq_done (conn, 0, 0);
                 break;
         }
 
-        /* NB if the connreq is done, we switch to koibnal_conn_callback */
+        /* NB if the connreq is done, we switch to kibnal_conn_callback */
         return TS_IB_CM_CALLBACK_PROCEED;
 }
 
 tTS_IB_CM_CALLBACK_RETURN
-koibnal_active_conn_callback (tTS_IB_CM_EVENT event,
+kibnal_active_conn_callback (tTS_IB_CM_EVENT event,
                               tTS_IB_CM_COMM_ID cid,
                               void *param,
                               void *arg)
 {
-        koib_conn_t *conn = arg;
+        kib_conn_t *conn = arg;
 
         switch (event) {
         case TS_IB_CM_REP_RECEIVED: {
                 struct ib_cm_rep_received_param *rep = param;
-                koib_wire_connreq_t             *wcr = rep->remote_private_data;
+                kib_wire_connreq_t             *wcr = rep->remote_private_data;
 
                 if (rep->remote_private_data_len < sizeof (*wcr)) {
                         CERROR ("Short reply from "LPX64": %d\n",
                                 conn->ibc_peer->ibp_nid,
                                 rep->remote_private_data_len);
-                        koibnal_connreq_done (conn, 1, -EPROTO);
+                        kibnal_connreq_done (conn, 1, -EPROTO);
                         break;
                 }
 
-                if (wcr->wcr_magic != cpu_to_le32(OPENIBNAL_MSG_MAGIC)) {
+                if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
                         CERROR ("Can't connect "LPX64": bad magic %08x\n",
                                 conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic));
-                        koibnal_connreq_done (conn, 1, -EPROTO);
+                        kibnal_connreq_done (conn, 1, -EPROTO);
                         break;
                 }
                 
-                if (wcr->wcr_version != cpu_to_le16(OPENIBNAL_MSG_VERSION)) {
+                if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
                         CERROR ("Can't connect "LPX64": bad version %d\n",
                                 conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_magic));
-                        koibnal_connreq_done (conn, 1, -EPROTO);
+                        kibnal_connreq_done (conn, 1, -EPROTO);
                         break;
                 }
                                 
-                if (wcr->wcr_queue_depth != cpu_to_le16(OPENIBNAL_MSG_QUEUE_SIZE)) {
+                if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
                         CERROR ("Can't connect "LPX64": bad queue depth %d\n",
                                 conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_queue_depth));
-                        koibnal_connreq_done (conn, 1, -EPROTO);
+                        kibnal_connreq_done (conn, 1, -EPROTO);
                         break;
                 }
                                 
                 if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) {
                         CERROR ("Unexpected NID "LPX64" from "LPX64"\n",
                                 le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid);
-                        koibnal_connreq_done (conn, 1, -EPROTO);
+                        kibnal_connreq_done (conn, 1, -EPROTO);
                         break;
                 }
 
@@ -2033,7 +2094,7 @@ koibnal_active_conn_callback (tTS_IB_CM_EVENT event,
                        conn, conn->ibc_peer->ibp_nid);
 
                 conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation);
-                conn->ibc_credits = OPENIBNAL_MSG_QUEUE_SIZE;
+                conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
                 break;
         }
 
@@ -2041,86 +2102,86 @@ koibnal_active_conn_callback (tTS_IB_CM_EVENT event,
                 CDEBUG(D_WARNING, "Connection %p -> "LPX64" Established\n",
                        conn, conn->ibc_peer->ibp_nid);
 
-                koibnal_connreq_done (conn, 1, 0);
+                kibnal_connreq_done (conn, 1, 0);
                 break;
 
         case TS_IB_CM_IDLE:
                 CERROR("Connection %p -> "LPX64" IDLE\n",
                        conn, conn->ibc_peer->ibp_nid);
                 /* Back out state change: I'm disengaged from CM */
-                conn->ibc_state = OPENIBNAL_CONN_INIT_QP;
+                conn->ibc_state = IBNAL_CONN_INIT_QP;
                 
-                koibnal_connreq_done (conn, 1, -ECONNABORTED);
+                kibnal_connreq_done (conn, 1, -ECONNABORTED);
                 break;
 
         default:
                 CERROR("Connection %p -> "LPX64" ERROR %d\n",
                        conn, conn->ibc_peer->ibp_nid, event);
-                koibnal_connreq_done (conn, 1, -ECONNABORTED);
+                kibnal_connreq_done (conn, 1, -ECONNABORTED);
                 break;
         }
 
-        /* NB if the connreq is done, we switch to koibnal_conn_callback */
+        /* NB if the connreq is done, we switch to kibnal_conn_callback */
         return TS_IB_CM_CALLBACK_PROCEED;
 }
 
 int
-koibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
+kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
                           struct ib_path_record *resp, int remaining,
                           void *arg)
 {
-        koib_conn_t *conn = arg;
+        kib_conn_t *conn = arg;
         
         if (status != 0) {
                 CERROR ("status %d\n", status);
-                koibnal_connreq_done (conn, 1, status);
+                kibnal_connreq_done (conn, 1, status);
                 goto out;
         }
 
         conn->ibc_connreq->cr_path = *resp;
 
-        conn->ibc_connreq->cr_wcr = (koib_wire_connreq_t) {
-                .wcr_magic       = cpu_to_le32(OPENIBNAL_MSG_MAGIC),
-                .wcr_version     = cpu_to_le16(OPENIBNAL_MSG_VERSION),
-                .wcr_queue_depth = cpu_to_le16(OPENIBNAL_MSG_QUEUE_SIZE),
-                .wcr_nid         = cpu_to_le64(koibnal_data.koib_nid),
-                .wcr_incarnation = cpu_to_le64(koibnal_data.koib_incarnation),
+        conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) {
+                .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
+                .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
+                .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
+                .wcr_nid         = cpu_to_le64(kibnal_data.kib_nid),
+                .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
         };
 
         conn->ibc_connreq->cr_connparam = (struct ib_cm_active_param) {
                 .qp                   = conn->ibc_qp,
                 .req_private_data     = &conn->ibc_connreq->cr_wcr,
                 .req_private_data_len = sizeof(conn->ibc_connreq->cr_wcr),
-                .responder_resources  = OPENIBNAL_RESPONDER_RESOURCES,
-                .initiator_depth      = OPENIBNAL_RESPONDER_RESOURCES,
-                .retry_count          = OPENIBNAL_RETRY,
-                .rnr_retry_count      = OPENIBNAL_RNR_RETRY,
-                .cm_response_timeout  = koibnal_tunables.koib_io_timeout,
-                .max_cm_retries       = OPENIBNAL_CM_RETRY,
-                .flow_control         = OPENIBNAL_FLOW_CONTROL,
+                .responder_resources  = IBNAL_RESPONDER_RESOURCES,
+                .initiator_depth      = IBNAL_RESPONDER_RESOURCES,
+                .retry_count          = IBNAL_RETRY,
+                .rnr_retry_count      = IBNAL_RNR_RETRY,
+                .cm_response_timeout  = kibnal_tunables.kib_io_timeout,
+                .max_cm_retries       = IBNAL_CM_RETRY,
+                .flow_control         = IBNAL_FLOW_CONTROL,
         };
 
         /* XXX set timeout just like SDP!!!*/
         conn->ibc_connreq->cr_path.packet_life = 13;
         
         /* Flag I'm getting involved with the CM... */
-        conn->ibc_state = OPENIBNAL_CONN_CONNECTING;
+        conn->ibc_state = IBNAL_CONN_CONNECTING;
 
         CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n",
                conn->ibc_connreq->cr_service.service_id, 
-               *koibnal_service_nid_field(&conn->ibc_connreq->cr_service));
+               *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
 
-        /* koibnal_connect_callback gets my conn ref */
+        /* kibnal_connect_callback gets my conn ref */
         status = ib_cm_connect (&conn->ibc_connreq->cr_connparam, 
                                 &conn->ibc_connreq->cr_path, NULL,
                                 conn->ibc_connreq->cr_service.service_id, 0,
-                                koibnal_active_conn_callback, conn,
+                                kibnal_active_conn_callback, conn,
                                 &conn->ibc_comm_id);
         if (status != 0) {
                 CERROR ("Connect: %d\n", status);
                 /* Back out state change: I've not got a CM comm_id yet... */
-                conn->ibc_state = OPENIBNAL_CONN_INIT_QP;
-                koibnal_connreq_done (conn, 1, status);
+                conn->ibc_state = IBNAL_CONN_INIT_QP;
+                kibnal_connreq_done (conn, 1, status);
         }
         
  out:
@@ -2129,58 +2190,58 @@ koibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
 }
 
 void
-koibnal_service_get_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
-                              struct ib_common_attrib_service *resp, void *arg)
+kibnal_service_get_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
+                             struct ib_common_attrib_service *resp, void *arg)
 {
-        koib_conn_t *conn = arg;
+        kib_conn_t *conn = arg;
         
         if (status != 0) {
                 CERROR ("status %d\n", status);
-                koibnal_connreq_done (conn, 1, status);
+                kibnal_connreq_done (conn, 1, status);
                 return;
         }
 
         CDEBUG(D_NET, "Got status %d, service id "LPX64", on "LPX64"\n",
                status, resp->service_id, 
-               *koibnal_service_nid_field(resp));
+               *kibnal_service_nid_field(resp));
 
         conn->ibc_connreq->cr_service = *resp;
 
-        status = ib_cached_gid_get(koibnal_data.koib_device,
-                                   koibnal_data.koib_port, 0,
+        status = ib_cached_gid_get(kibnal_data.kib_device,
+                                   kibnal_data.kib_port, 0,
                                    conn->ibc_connreq->cr_gid);
         LASSERT (status == 0);
 
-        /* koibnal_pathreq_callback gets my conn ref */
-        status = tsIbPathRecordRequest (koibnal_data.koib_device,
-                                        koibnal_data.koib_port,
+        /* kibnal_pathreq_callback gets my conn ref */
+        status = tsIbPathRecordRequest (kibnal_data.kib_device,
+                                        kibnal_data.kib_port,
                                         conn->ibc_connreq->cr_gid,
                                         conn->ibc_connreq->cr_service.service_gid,
                                         conn->ibc_connreq->cr_service.service_pkey,
                                         0,
-                                        koibnal_tunables.koib_io_timeout * HZ,
+                                        kibnal_tunables.kib_io_timeout * HZ,
                                         0,
-                                        koibnal_pathreq_callback, conn, 
+                                        kibnal_pathreq_callback, conn, 
                                         &conn->ibc_connreq->cr_tid);
 
         if (status == 0)
                 return;
 
         CERROR ("Path record request: %d\n", status);
-        koibnal_connreq_done (conn, 1, status);
+        kibnal_connreq_done (conn, 1, status);
 }
 
 void
-koibnal_connect_peer (koib_peer_t *peer)
+kibnal_connect_peer (kib_peer_t *peer)
 {
-        koib_conn_t *conn = koibnal_create_conn();
+        kib_conn_t  *conn = kibnal_create_conn();
         int          rc;
 
         LASSERT (peer->ibp_connecting != 0);
 
         if (conn == NULL) {
                 CERROR ("Can't allocate conn\n");
-                koibnal_peer_connect_failed (peer, 1, -ENOMEM);
+                kibnal_peer_connect_failed (peer, 1, -ENOMEM);
                 return;
         }
 
@@ -2190,85 +2251,101 @@ koibnal_connect_peer (koib_peer_t *peer)
         PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
         if (conn->ibc_connreq == NULL) {
                 CERROR ("Can't allocate connreq\n");
-                koibnal_connreq_done (conn, 1, -ENOMEM);
+                kibnal_connreq_done (conn, 1, -ENOMEM);
                 return;
         }
 
         memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq));
 
-        koibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);
+        kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);
 
-        /* koibnal_service_get_callback gets my conn ref */
-        rc = ib_service_get (koibnal_data.koib_device, 
-                             koibnal_data.koib_port,
+        /* kibnal_service_get_callback gets my conn ref */
+        rc = ib_service_get (kibnal_data.kib_device, 
+                             kibnal_data.kib_port,
                              &conn->ibc_connreq->cr_service,
-                             KOIBNAL_SERVICE_KEY_MASK,
-                             koibnal_tunables.koib_io_timeout * HZ,
-                             koibnal_service_get_callback, conn, 
+                             KIBNAL_SERVICE_KEY_MASK,
+                             kibnal_tunables.kib_io_timeout * HZ,
+                             kibnal_service_get_callback, conn, 
                              &conn->ibc_connreq->cr_tid);
         
         if (rc == 0)
                 return;
 
         CERROR ("ib_service_get: %d\n", rc);
-        koibnal_connreq_done (conn, 1, rc);
+        kibnal_connreq_done (conn, 1, rc);
 }
 
 int
-koibnal_conn_timed_out (koib_conn_t *conn)
+kibnal_conn_timed_out (kib_conn_t *conn)
 {
-        koib_tx_t         *tx;
+        kib_tx_t          *tx;
         struct list_head  *ttmp;
         unsigned long      flags;
-        int                rc = 0;
 
         spin_lock_irqsave (&conn->ibc_lock, flags);
 
-        list_for_each (ttmp, &conn->ibc_rdma_queue) {
-                tx = list_entry (ttmp, koib_tx_t, tx_list);
+        list_for_each (ttmp, &conn->ibc_tx_queue) {
+                tx = list_entry (ttmp, kib_tx_t, tx_list);
 
-                LASSERT (tx->tx_passive_rdma);
-                LASSERT (tx->tx_passive_rdma_wait);
+                LASSERT (!tx->tx_passive_rdma_wait);
+                LASSERT (tx->tx_sending == 0);
 
-                if (time_after_eq (jiffies, tx->tx_passive_rdma_deadline)) {
-                        rc = 1;
-                        break;
+                if (time_after_eq (jiffies, tx->tx_deadline)) {
+                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        return 1;
                 }
         }
+
+        list_for_each (ttmp, &conn->ibc_active_txs) {
+                tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+                LASSERT (tx->tx_passive_rdma ||
+                         !tx->tx_passive_rdma_wait);
+
+                LASSERT (tx->tx_passive_rdma_wait ||
+                         tx->tx_sending != 0);
+
+                if (time_after_eq (jiffies, tx->tx_deadline)) {
+                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        return 1;
+                }
+        }
+
         spin_unlock_irqrestore (&conn->ibc_lock, flags);
 
-        return rc;
+        return 0;
 }
 
 void
-koibnal_check_conns (int idx)
+kibnal_check_conns (int idx)
 {
-        struct list_head  *peers = &koibnal_data.koib_peers[idx];
+        struct list_head  *peers = &kibnal_data.kib_peers[idx];
         struct list_head  *ptmp;
-        koib_peer_t       *peer;
-        koib_conn_t       *conn;
+        kib_peer_t        *peer;
+        kib_conn_t        *conn;
         struct list_head  *ctmp;
 
  again:
         /* NB. We expect to have a look at all the peers and not find any
          * rdmas to time out, so we just use a shared lock while we
          * take a look... */
-        read_lock (&koibnal_data.koib_global_lock);
+        read_lock (&kibnal_data.kib_global_lock);
 
         list_for_each (ptmp, peers) {
-                peer = list_entry (ptmp, koib_peer_t, ibp_list);
+                peer = list_entry (ptmp, kib_peer_t, ibp_list);
 
                 list_for_each (ctmp, &peer->ibp_conns) {
-                        conn = list_entry (ctmp, koib_conn_t, ibc_list);
+                        conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+                        LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
 
-                        LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED);
 
                         /* In case we have enough credits to return via a
                          * NOOP, but there were no non-blocking tx descs
                          * free to do it last time... */
-                        koibnal_check_sends(conn);
+                        kibnal_check_sends(conn);
 
-                        if (!koibnal_conn_timed_out(conn))
+                        if (!kibnal_conn_timed_out(conn))
                                 continue;
                         
                         CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
@@ -2276,108 +2353,76 @@ koibnal_check_conns (int idx)
                                atomic_read (&conn->ibc_refcount));
 
                         atomic_inc (&conn->ibc_refcount);
-                        read_unlock (&koibnal_data.koib_global_lock);
+                        read_unlock (&kibnal_data.kib_global_lock);
 
                         CERROR("Timed out RDMA with "LPX64"\n",
                                peer->ibp_nid);
 
-                        koibnal_close_conn (conn, -ETIMEDOUT);
-                        koibnal_put_conn (conn);
+                        kibnal_close_conn (conn, -ETIMEDOUT);
+                        kibnal_put_conn (conn);
 
                         /* start again now I've dropped the lock */
                         goto again;
                 }
         }
 
-        read_unlock (&koibnal_data.koib_global_lock);
+        read_unlock (&kibnal_data.kib_global_lock);
 }
 
 void
-koibnal_terminate_conn (koib_conn_t *conn)
+kibnal_terminate_conn (kib_conn_t *conn)
 {
-        unsigned long flags;
         int           rc;
-        int           done;
 
         CDEBUG(D_NET, "conn %p\n", conn);
-        LASSERT (conn->ibc_state == OPENIBNAL_CONN_DEATHROW);
-        conn->ibc_state = OPENIBNAL_CONN_ZOMBIE;
+        LASSERT (conn->ibc_state == IBNAL_CONN_DEATHROW);
+        conn->ibc_state = IBNAL_CONN_ZOMBIE;
 
         rc = ib_cm_disconnect (conn->ibc_comm_id);
         if (rc != 0)
                 CERROR ("Error %d disconnecting conn %p -> "LPX64"\n",
                         rc, conn, conn->ibc_peer->ibp_nid);
-
-        /* complete blocked passive RDMAs */
-        spin_lock_irqsave (&conn->ibc_lock, flags);
-        
-        while (!list_empty (&conn->ibc_rdma_queue)) {
-                koib_tx_t *tx = list_entry (conn->ibc_rdma_queue.next,
-                                            koib_tx_t, tx_list);
-
-                LASSERT (tx->tx_passive_rdma);
-                LASSERT (tx->tx_passive_rdma_wait);
-                
-                list_del (&tx->tx_list);
-
-                tx->tx_passive_rdma_wait = 0;
-                done = (tx->tx_sending == 0);
-                
-                tx->tx_status = -ECONNABORTED;
-
-                spin_unlock_irqrestore (&conn->ibc_lock, flags);
-
-                if (done)
-                        koibnal_tx_done (tx);
-
-                spin_lock_irqsave (&conn->ibc_lock, flags);
-        }
-        
-        spin_unlock_irqrestore (&conn->ibc_lock, flags);
-
-        /* Complete all blocked transmits */
-        koibnal_check_sends(conn);
 }
 
 int
-koibnal_connd (void *arg)
+kibnal_connd (void *arg)
 {
         wait_queue_t       wait;
         unsigned long      flags;
-        koib_conn_t       *conn;
-        koib_peer_t       *peer;
+        kib_conn_t        *conn;
+        kib_peer_t        *peer;
         int                timeout;
         int                i;
         int                peer_index = 0;
         unsigned long      deadline = jiffies;
         
-        kportal_daemonize ("koibnal_connd");
+        kportal_daemonize ("kibnal_connd");
         kportal_blockallsigs ();
 
         init_waitqueue_entry (&wait, current);
 
-        spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
 
         for (;;) {
-                if (!list_empty (&koibnal_data.koib_connd_conns)) {
-                        conn = list_entry (koibnal_data.koib_connd_conns.next,
-                                           koib_conn_t, ibc_list);
+                if (!list_empty (&kibnal_data.kib_connd_conns)) {
+                        conn = list_entry (kibnal_data.kib_connd_conns.next,
+                                           kib_conn_t, ibc_list);
                         list_del (&conn->ibc_list);
                         
-                        spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 
                         switch (conn->ibc_state) {
-                        case OPENIBNAL_CONN_DEATHROW:
+                        case IBNAL_CONN_DEATHROW:
                                 LASSERT (conn->ibc_comm_id != TS_IB_CM_COMM_ID_INVALID);
                                 /* Disconnect: conn becomes a zombie in the
                                  * callback and last ref reschedules it
                                  * here... */
-                                koibnal_terminate_conn(conn);
-                                koibnal_put_conn (conn);
+                                kibnal_terminate_conn(conn);
+                                kibnal_put_conn (conn);
                                 break;
                                 
-                        case OPENIBNAL_CONN_ZOMBIE:
-                                koibnal_destroy_conn (conn);
+                        case IBNAL_CONN_ZOMBIE:
+                                kibnal_destroy_conn (conn);
                                 break;
                                 
                         default:
@@ -2386,35 +2431,35 @@ koibnal_connd (void *arg)
                                 LBUG();
                         }
 
-                        spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
                         continue;
                 }
 
-                if (!list_empty (&koibnal_data.koib_connd_peers)) {
-                        peer = list_entry (koibnal_data.koib_connd_peers.next,
-                                           koib_peer_t, ibp_connd_list);
+                if (!list_empty (&kibnal_data.kib_connd_peers)) {
+                        peer = list_entry (kibnal_data.kib_connd_peers.next,
+                                           kib_peer_t, ibp_connd_list);
                         
                         list_del_init (&peer->ibp_connd_list);
-                        spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 
-                        koibnal_connect_peer (peer);
-                        koibnal_put_peer (peer);
+                        kibnal_connect_peer (peer);
+                        kibnal_put_peer (peer);
 
-                        spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
                 }
 
                 /* shut down and nobody left to reap... */
-                if (koibnal_data.koib_shutdown &&
-                    atomic_read(&koibnal_data.koib_nconns) == 0)
+                if (kibnal_data.kib_shutdown &&
+                    atomic_read(&kibnal_data.kib_nconns) == 0)
                         break;
 
-                spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+                spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 
                 /* careful with the jiffy wrap... */
                 while ((timeout = (int)(deadline - jiffies)) <= 0) {
                         const int n = 4;
                         const int p = 1;
-                        int       chunk = koibnal_data.koib_peer_hash_size;
+                        int       chunk = kibnal_data.kib_peer_hash_size;
                         
                         /* Time to check for RDMA timeouts on a few more
                          * peers: I do checks every 'p' seconds on a
@@ -2424,129 +2469,129 @@ koibnal_connd (void *arg)
                          * connection within (n+1)/n times the timeout
                          * interval. */
 
-                        if (koibnal_tunables.koib_io_timeout > n * p)
+                        if (kibnal_tunables.kib_io_timeout > n * p)
                                 chunk = (chunk * n * p) / 
-                                        koibnal_tunables.koib_io_timeout;
+                                        kibnal_tunables.kib_io_timeout;
                         if (chunk == 0)
                                 chunk = 1;
 
                         for (i = 0; i < chunk; i++) {
-                                koibnal_check_conns (peer_index);
+                                kibnal_check_conns (peer_index);
                                 peer_index = (peer_index + 1) % 
-                                             koibnal_data.koib_peer_hash_size;
+                                             kibnal_data.kib_peer_hash_size;
                         }
 
                         deadline += p * HZ;
                 }
 
-                koibnal_data.koib_connd_waketime = jiffies + timeout;
+                kibnal_data.kib_connd_waketime = jiffies + timeout;
 
                 set_current_state (TASK_INTERRUPTIBLE);
-                add_wait_queue (&koibnal_data.koib_connd_waitq, &wait);
+                add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
 
-                if (!koibnal_data.koib_shutdown &&
-                    list_empty (&koibnal_data.koib_connd_conns) &&
-                    list_empty (&koibnal_data.koib_connd_peers))
+                if (!kibnal_data.kib_shutdown &&
+                    list_empty (&kibnal_data.kib_connd_conns) &&
+                    list_empty (&kibnal_data.kib_connd_peers))
                         schedule_timeout (timeout);
 
                 set_current_state (TASK_RUNNING);
-                remove_wait_queue (&koibnal_data.koib_connd_waitq, &wait);
+                remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
 
-                spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+                spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
         }
 
-        spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 
-        koibnal_thread_fini ();
+        kibnal_thread_fini ();
         return (0);
 }
 
 int
-koibnal_scheduler(void *arg)
+kibnal_scheduler(void *arg)
 {
         long            id = (long)arg;
         char            name[16];
-        koib_rx_t      *rx;
-        koib_tx_t      *tx;
+        kib_rx_t       *rx;
+        kib_tx_t       *tx;
         unsigned long   flags;
         int             rc;
         int             counter = 0;
         int             did_something;
 
-        snprintf(name, sizeof(name), "koibnal_sd_%02ld", id);
+        snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
         kportal_daemonize(name);
         kportal_blockallsigs();
 
-        spin_lock_irqsave(&koibnal_data.koib_sched_lock, flags);
+        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
 
         for (;;) {
                 did_something = 0;
 
-                while (!list_empty(&koibnal_data.koib_sched_txq)) {
-                        tx = list_entry(koibnal_data.koib_sched_txq.next,
-                                        koib_tx_t, tx_list);
+                while (!list_empty(&kibnal_data.kib_sched_txq)) {
+                        tx = list_entry(kibnal_data.kib_sched_txq.next,
+                                        kib_tx_t, tx_list);
                         list_del(&tx->tx_list);
-                        spin_unlock_irqrestore(&koibnal_data.koib_sched_lock,
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                                flags);
-                        koibnal_tx_done(tx);
+                        kibnal_tx_done(tx);
 
-                        spin_lock_irqsave(&koibnal_data.koib_sched_lock,
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
                                           flags);
                 }
 
-                if (!list_empty(&koibnal_data.koib_sched_rxq)) {
-                        rx = list_entry(koibnal_data.koib_sched_rxq.next,
-                                        koib_rx_t, rx_list);
+                if (!list_empty(&kibnal_data.kib_sched_rxq)) {
+                        rx = list_entry(kibnal_data.kib_sched_rxq.next,
+                                        kib_rx_t, rx_list);
                         list_del(&rx->rx_list);
-                        spin_unlock_irqrestore(&koibnal_data.koib_sched_lock,
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                                flags);
 
-                        koibnal_rx(rx);
+                        kibnal_rx(rx);
 
                         did_something = 1;
-                        spin_lock_irqsave(&koibnal_data.koib_sched_lock,
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
                                           flags);
                 }
 
                 /* shut down and no receives to complete... */
-                if (koibnal_data.koib_shutdown &&
-                    atomic_read(&koibnal_data.koib_nconns) == 0)
+                if (kibnal_data.kib_shutdown &&
+                    atomic_read(&kibnal_data.kib_nconns) == 0)
                         break;
 
                 /* nothing to do or hogging CPU */
-                if (!did_something || counter++ == OPENIBNAL_RESCHED) {
-                        spin_unlock_irqrestore(&koibnal_data.koib_sched_lock,
+                if (!did_something || counter++ == IBNAL_RESCHED) {
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                                flags);
                         counter = 0;
 
                         if (!did_something) {
                                 rc = wait_event_interruptible(
-                                        koibnal_data.koib_sched_waitq,
-                                        !list_empty(&koibnal_data.koib_sched_txq) || 
-                                        !list_empty(&koibnal_data.koib_sched_rxq) || 
-                                        (koibnal_data.koib_shutdown &&
-                                         atomic_read (&koibnal_data.koib_nconns) == 0));
+                                        kibnal_data.kib_sched_waitq,
+                                        !list_empty(&kibnal_data.kib_sched_txq) || 
+                                        !list_empty(&kibnal_data.kib_sched_rxq) || 
+                                        (kibnal_data.kib_shutdown &&
+                                         atomic_read (&kibnal_data.kib_nconns) == 0));
                         } else {
                                 our_cond_resched();
                         }
 
-                        spin_lock_irqsave(&koibnal_data.koib_sched_lock,
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
                                           flags);
                 }
         }
 
-        spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags);
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
 
-        koibnal_thread_fini();
+        kibnal_thread_fini();
         return (0);
 }
 
 
-lib_nal_t koibnal_lib = {
-        libnal_data:        &koibnal_data,      /* NAL private data */
-        libnal_send:         koibnal_send,
-        libnal_send_pages:   koibnal_send_pages,
-        libnal_recv:         koibnal_recv,
-        libnal_recv_pages:   koibnal_recv_pages,
-        libnal_dist:         koibnal_dist
+lib_nal_t kibnal_lib = {
+        libnal_data:        &kibnal_data,      /* NAL private data */
+        libnal_send:         kibnal_send,
+        libnal_send_pages:   kibnal_send_pages,
+        libnal_recv:         kibnal_recv,
+        libnal_recv_pages:   kibnal_recv_pages,
+        libnal_dist:         kibnal_dist
 };
index 60d09c8..d27240c 100644 (file)
@@ -1,6 +1,6 @@
 MODULES := kqswnal
 kqswnal-objs := qswnal.o qswnal_cb.o
 
-EXTRA_PRE_CFLAGS := @QSWCPPFLAGS@ -I/usr/include
+EXTRA_POST_CFLAGS := @QSWCPPFLAGS@ -I/usr/include
 
 @INCLUDE_RULES@
index e7691a0..5aff4e9 100644 (file)
 
 #include "qswnal.h"
 
-ptl_handle_ni_t                kqswnal_ni;
 nal_t                  kqswnal_api;
 kqswnal_data_t         kqswnal_data;
+ptl_handle_ni_t         kqswnal_ni;
+kqswnal_tunables_t      kqswnal_tunables;
 
 kpr_nal_interface_t kqswnal_router_interface = {
        kprni_nalid:    QSWNAL,
@@ -39,14 +40,14 @@ kpr_nal_interface_t kqswnal_router_interface = {
 #define QSWNAL_SYSCTL  201
 
 #define QSWNAL_SYSCTL_OPTIMIZED_GETS     1
-#define QSWNAL_SYSCTL_COPY_SMALL_FWD     2
+#define QSWNAL_SYSCTL_OPTIMIZED_PUTS     2
 
 static ctl_table kqswnal_ctl_table[] = {
-       {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_gets",
-        &kqswnal_data.kqn_optimized_gets, sizeof (int),
+       {QSWNAL_SYSCTL_OPTIMIZED_PUTS, "optimized_puts",
+        &kqswnal_tunables.kqn_optimized_puts, sizeof (int),
         0644, NULL, &proc_dointvec},
-       {QSWNAL_SYSCTL_COPY_SMALL_FWD, "copy_small_fwd",
-        &kqswnal_data.kqn_copy_small_fwd, sizeof (int),
+       {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_gets",
+        &kqswnal_tunables.kqn_optimized_gets, sizeof (int),
         0644, NULL, &proc_dointvec},
        {0}
 };
@@ -57,88 +58,13 @@ static ctl_table kqswnal_top_ctl_table[] = {
 };
 #endif
 
-static int
-kqswnal_forward(nal_t   *nal,
-               int     id,
-               void    *args,  size_t args_len,
-               void    *ret,   size_t ret_len)
-{
-       kqswnal_data_t *k = nal->nal_data;
-       nal_cb_t       *nal_cb = k->kqn_cb;
-
-       LASSERT (nal == &kqswnal_api);
-       LASSERT (k == &kqswnal_data);
-       LASSERT (nal_cb == &kqswnal_lib);
-
-       lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */
-       return (PTL_OK);
-}
-
-static void
-kqswnal_lock (nal_t *nal, unsigned long *flags)
-{
-       kqswnal_data_t *k = nal->nal_data;
-       nal_cb_t       *nal_cb = k->kqn_cb;
-
-       LASSERT (nal == &kqswnal_api);
-       LASSERT (k == &kqswnal_data);
-       LASSERT (nal_cb == &kqswnal_lib);
-
-       nal_cb->cb_cli(nal_cb,flags);
-}
-
-static void
-kqswnal_unlock(nal_t *nal, unsigned long *flags)
-{
-       kqswnal_data_t *k = nal->nal_data;
-       nal_cb_t       *nal_cb = k->kqn_cb;
-
-       LASSERT (nal == &kqswnal_api);
-       LASSERT (k == &kqswnal_data);
-       LASSERT (nal_cb == &kqswnal_lib);
-
-       nal_cb->cb_sti(nal_cb,flags);
-}
-
-static int
-kqswnal_shutdown(nal_t *nal, int ni)
-{
-       CDEBUG (D_NET, "shutdown\n");
-
-       LASSERT (nal == &kqswnal_api);
-       return (0);
-}
-
-static void
-kqswnal_yield( nal_t *nal )
-{
-       CDEBUG (D_NET, "yield\n");
-
-       if (need_resched())
-               schedule();
-       return;
-}
-
-static nal_t *
-kqswnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size,
-            ptl_pid_t requested_pid)
-{
-       ptl_nid_t mynid = kqswnal_elanid2nid (kqswnal_data.kqn_elanid);
-       int       nnids = kqswnal_data.kqn_nnodes;
-
-        CDEBUG(D_NET, "calling lib_init with nid "LPX64" of %d\n", mynid, nnids);
-
-       lib_init(&kqswnal_lib, mynid, 0, nnids, ptl_size, ac_size);
-
-       return (&kqswnal_api);
-}
-
 int
 kqswnal_get_tx_desc (struct portals_cfg *pcfg)
 {
        unsigned long      flags;
        struct list_head  *tmp;
        kqswnal_tx_t      *ktx;
+       ptl_hdr_t         *hdr;
        int                index = pcfg->pcfg_count;
        int                rc = -ENOENT;
 
@@ -149,11 +75,12 @@ kqswnal_get_tx_desc (struct portals_cfg *pcfg)
                        continue;
 
                ktx = list_entry (tmp, kqswnal_tx_t, ktx_list);
+               hdr = (ptl_hdr_t *)ktx->ktx_buffer;
 
                pcfg->pcfg_pbuf1 = (char *)ktx;
-               pcfg->pcfg_count = NTOH__u32(ktx->ktx_wire_hdr->type);
-               pcfg->pcfg_size  = NTOH__u32(ktx->ktx_wire_hdr->payload_length);
-               pcfg->pcfg_nid   = NTOH__u64(ktx->ktx_wire_hdr->dest_nid);
+               pcfg->pcfg_count = le32_to_cpu(hdr->type);
+               pcfg->pcfg_size  = le32_to_cpu(hdr->payload_length);
+               pcfg->pcfg_nid   = le64_to_cpu(hdr->dest_nid);
                pcfg->pcfg_nid2  = ktx->ktx_nid;
                pcfg->pcfg_misc  = ktx->ktx_launcher;
                pcfg->pcfg_flags = (list_empty (&ktx->ktx_delayed_list) ? 0 : 1) |
@@ -182,7 +109,7 @@ kqswnal_cmd (struct portals_cfg *pcfg, void *private)
                        kqswnal_data.kqn_nid_offset);
                kqswnal_data.kqn_nid_offset =
                        pcfg->pcfg_nid - kqswnal_data.kqn_elanid;
-               kqswnal_lib.ni.nid = pcfg->pcfg_nid;
+               kqswnal_lib.libnal_ni.ni_pid.nid = pcfg->pcfg_nid;
                return (0);
                
        default:
@@ -190,11 +117,22 @@ kqswnal_cmd (struct portals_cfg *pcfg, void *private)
        }
 }
 
-void __exit
-kqswnal_finalise (void)
+static void
+kqswnal_shutdown(nal_t *nal)
 {
+       unsigned long flags;
        kqswnal_tx_t *ktx;
        kqswnal_rx_t *krx;
+       int           do_lib_fini = 0;
+
+       /* NB The first ref was this module! */
+       if (nal->nal_refct != 0) {
+               PORTAL_MODULE_UNUSE;
+               return;
+       }
+
+       CDEBUG (D_NET, "shutdown\n");
+       LASSERT (nal == &kqswnal_api);
 
        switch (kqswnal_data.kqn_init)
        {
@@ -202,17 +140,11 @@ kqswnal_finalise (void)
                LASSERT (0);
 
        case KQN_INIT_ALL:
-#if CONFIG_SYSCTL
-                if (kqswnal_data.kqn_sysctl != NULL)
-                        unregister_sysctl_table (kqswnal_data.kqn_sysctl);
-#endif         
-               PORTAL_SYMBOL_UNREGISTER (kqswnal_ni);
-                kportal_nal_unregister(QSWNAL);
+                libcfs_nal_cmd_unregister(QSWNAL);
                /* fall through */
 
-       case KQN_INIT_PTL:
-               PtlNIFini (kqswnal_ni);
-               lib_fini (&kqswnal_lib);
+       case KQN_INIT_LIB:
+               do_lib_fini = 1;
                /* fall through */
 
        case KQN_INIT_DATA:
@@ -223,18 +155,24 @@ kqswnal_finalise (void)
        }
 
        /**********************************************************************/
-       /* Make router stop her calling me and fail any more call-ins */
+       /* Tell router we're shutting down.  Any router calls my threads
+        * make will now fail immediately and the router will stop calling
+        * into me. */
        kpr_shutdown (&kqswnal_data.kqn_router);
-
+       
        /**********************************************************************/
-       /* flag threads we've started to terminate and wait for all to ack */
-
+       /* Signal the start of shutdown... */
+       spin_lock_irqsave(&kqswnal_data.kqn_idletxd_lock, flags);
        kqswnal_data.kqn_shuttingdown = 1;
-       wake_up_all (&kqswnal_data.kqn_sched_waitq);
+       spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags);
 
-       while (atomic_read (&kqswnal_data.kqn_nthreads_running) != 0) {
-               CDEBUG(D_NET, "waiting for %d threads to start shutting down\n",
-                      atomic_read (&kqswnal_data.kqn_nthreads_running));
+       wake_up_all(&kqswnal_data.kqn_idletxd_waitq);
+
+       /**********************************************************************/
+       /* wait for sends that have allocated a tx desc to launch or give up */
+       while (atomic_read (&kqswnal_data.kqn_pending_txs) != 0) {
+               CDEBUG(D_NET, "waiting for %d pending sends\n",
+                      atomic_read (&kqswnal_data.kqn_pending_txs));
                set_current_state (TASK_UNINTERRUPTIBLE);
                schedule_timeout (HZ);
        }
@@ -242,18 +180,27 @@ kqswnal_finalise (void)
        /**********************************************************************/
        /* close elan comms */
 #if MULTIRAIL_EKC
+       /* Shut down receivers first; rx callbacks might try sending... */
        if (kqswnal_data.kqn_eprx_small != NULL)
                ep_free_rcvr (kqswnal_data.kqn_eprx_small);
 
        if (kqswnal_data.kqn_eprx_large != NULL)
                ep_free_rcvr (kqswnal_data.kqn_eprx_large);
 
+       /* NB ep_free_rcvr() returns only after we've freed off all receive
+        * buffers (see shutdown handling in kqswnal_requeue_rx()).  This
+        * means we must have completed any messages we passed to
+        * lib_parse() or kpr_fwd_start(). */
+
        if (kqswnal_data.kqn_eptx != NULL)
                ep_free_xmtr (kqswnal_data.kqn_eptx);
 
-       /* freeing the xmtr completes all txs pdq */
+       /* NB ep_free_xmtr() returns only after all outstanding transmits
+        * have called their callback... */
        LASSERT(list_empty(&kqswnal_data.kqn_activetxds));
 #else
+       /* "Old" EKC just pretends to shutdown cleanly but actually
+        * provides no guarantees */
        if (kqswnal_data.kqn_eprx_small != NULL)
                ep_remove_large_rcvr (kqswnal_data.kqn_eprx_small);
 
@@ -272,7 +219,6 @@ kqswnal_finalise (void)
 #endif
        /**********************************************************************/
        /* flag threads to terminate, wake them and wait for them to die */
-
        kqswnal_data.kqn_shuttingdown = 2;
        wake_up_all (&kqswnal_data.kqn_sched_waitq);
 
@@ -290,10 +236,12 @@ kqswnal_finalise (void)
 
 #if MULTIRAIL_EKC
        LASSERT (list_empty (&kqswnal_data.kqn_readyrxds));
+       LASSERT (list_empty (&kqswnal_data.kqn_delayedtxds));
+       LASSERT (list_empty (&kqswnal_data.kqn_delayedfwds));
 #endif
 
        /**********************************************************************/
-       /* Complete any blocked forwarding packets with error
+       /* Complete any blocked forwarding packets, with error
         */
 
        while (!list_empty (&kqswnal_data.kqn_idletxd_fwdq))
@@ -301,23 +249,16 @@ kqswnal_finalise (void)
                kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next,
                                                  kpr_fwd_desc_t, kprfd_list);
                list_del (&fwd->kprfd_list);
-               kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH);
-       }
-
-       while (!list_empty (&kqswnal_data.kqn_delayedfwds))
-       {
-               kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_delayedfwds.next,
-                                                 kpr_fwd_desc_t, kprfd_list);
-               list_del (&fwd->kprfd_list);
-               kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH);
+               kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -ESHUTDOWN);
        }
 
        /**********************************************************************/
-       /* Wait for router to complete any packets I sent her
-        */
+       /* finalise router and portals lib */
 
        kpr_deregister (&kqswnal_data.kqn_router);
 
+       if (do_lib_fini)
+               lib_fini (&kqswnal_lib);
 
        /**********************************************************************/
        /* Unmap message buffers and free all descriptors and buffers
@@ -328,7 +269,7 @@ kqswnal_finalise (void)
         * ep_dvma_release() get fixed (and releases any mappings in the
         * region), we can delete all the code from here -------->  */
 
-       for (ktx = kqswnal_data.kqn_txds; ktx != NULL; ktx =ktx->ktx_alloclist){
+       for (ktx = kqswnal_data.kqn_txds; ktx != NULL; ktx = ktx->ktx_alloclist) {
                /* If ktx has a buffer, it got mapped; unmap now.  NB only
                 * the pre-mapped stuff is still mapped since all tx descs
                 * must be idle */
@@ -339,8 +280,8 @@ kqswnal_finalise (void)
                                       &ktx->ktx_ebuffer);
        }
 
-       for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx =krx->krx_alloclist){
-               /* If krx_kiov[0].kiov_page got allocated, it got mapped.
+       for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
+               /* If krx_kiov[0].kiov_page got allocated, it got mapped.  
                 * NB subsequent pages get merged */
 
                if (krx->krx_kiov[0].kiov_page != NULL)
@@ -351,10 +292,10 @@ kqswnal_finalise (void)
        /* <----------- to here */
 
        if (kqswnal_data.kqn_ep_rx_nmh != NULL)
-               ep_dvma_release(kqswnal_data.kqn_ep,kqswnal_data.kqn_ep_rx_nmh);
+               ep_dvma_release(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_rx_nmh);
 
        if (kqswnal_data.kqn_ep_tx_nmh != NULL)
-               ep_dvma_release(kqswnal_data.kqn_ep,kqswnal_data.kqn_ep_tx_nmh);
+               ep_dvma_release(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_tx_nmh);
 #else
        if (kqswnal_data.kqn_eprxdmahandle != NULL)
        {
@@ -410,8 +351,10 @@ kqswnal_finalise (void)
                 atomic_read(&portal_kmemory));
 }
 
-static int __init
-kqswnal_initialise (void)
+static int
+kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid,
+                ptl_ni_limits_t *requested_limits, 
+                ptl_ni_limits_t *actual_limits)
 {
 #if MULTIRAIL_EKC
        EP_RAILMASK       all_rails = EP_RAILMASK_ALL;
@@ -423,37 +366,26 @@ kqswnal_initialise (void)
        kqswnal_rx_t     *krx;
        kqswnal_tx_t     *ktx;
        int               elan_page_idx;
+       ptl_process_id_t  my_process_id;
        int               pkmem = atomic_read(&portal_kmemory);
 
-       LASSERT (kqswnal_data.kqn_init == KQN_INIT_NOTHING);
+       LASSERT (nal == &kqswnal_api);
 
-       CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory));
+       if (nal->nal_refct != 0) {
+               if (actual_limits != NULL)
+                       *actual_limits = kqswnal_lib.libnal_ni.ni_actual_limits;
+               /* This module got the first ref */
+               PORTAL_MODULE_USE;
+               return (PTL_OK);
+       }
 
-       kqswnal_api.forward  = kqswnal_forward;
-       kqswnal_api.shutdown = kqswnal_shutdown;
-       kqswnal_api.yield    = kqswnal_yield;
-       kqswnal_api.validate = NULL;            /* our api validate is a NOOP */
-       kqswnal_api.lock     = kqswnal_lock;
-       kqswnal_api.unlock   = kqswnal_unlock;
-       kqswnal_api.nal_data = &kqswnal_data;
+       LASSERT (kqswnal_data.kqn_init == KQN_INIT_NOTHING);
 
-       kqswnal_lib.nal_data = &kqswnal_data;
+       CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory));
 
-       memset(&kqswnal_rpc_success, 0, sizeof(kqswnal_rpc_success));
-       memset(&kqswnal_rpc_failed, 0, sizeof(kqswnal_rpc_failed));
-#if MULTIRAIL_EKC
-       kqswnal_rpc_failed.Data[0] = -ECONNREFUSED;
-#else
-       kqswnal_rpc_failed.Status = -ECONNREFUSED;
-#endif
        /* ensure all pointers NULL etc */
        memset (&kqswnal_data, 0, sizeof (kqswnal_data));
 
-       kqswnal_data.kqn_optimized_gets = KQSW_OPTIMIZED_GETS;
-       kqswnal_data.kqn_copy_small_fwd = KQSW_COPY_SMALL_FWD;
-
-       kqswnal_data.kqn_cb = &kqswnal_lib;
-
        INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds);
        INIT_LIST_HEAD (&kqswnal_data.kqn_nblk_idletxds);
        INIT_LIST_HEAD (&kqswnal_data.kqn_activetxds);
@@ -468,22 +400,28 @@ kqswnal_initialise (void)
        spin_lock_init (&kqswnal_data.kqn_sched_lock);
        init_waitqueue_head (&kqswnal_data.kqn_sched_waitq);
 
-       spin_lock_init (&kqswnal_data.kqn_statelock);
+       /* Leave kqn_rpc_success zeroed */
+#if MULTIRAIL_EKC
+       kqswnal_data.kqn_rpc_failed.Data[0] = -ECONNREFUSED;
+#else
+       kqswnal_data.kqn_rpc_failed.Status = -ECONNREFUSED;
+#endif
 
        /* pointers/lists/locks initialised */
        kqswnal_data.kqn_init = KQN_INIT_DATA;
-
+       
 #if MULTIRAIL_EKC
        kqswnal_data.kqn_ep = ep_system();
        if (kqswnal_data.kqn_ep == NULL) {
                CERROR("Can't initialise EKC\n");
-               return (-ENODEV);
+               kqswnal_shutdown(nal);
+               return (PTL_IFACE_INVALID);
        }
 
        if (ep_waitfor_nodeid(kqswnal_data.kqn_ep) == ELAN_INVALID_NODE) {
                CERROR("Can't get elan ID\n");
-               kqswnal_finalise();
-               return (-ENODEV);
+               kqswnal_shutdown(nal);
+               return (PTL_IFACE_INVALID);
        }
 #else
        /**********************************************************************/
@@ -493,7 +431,8 @@ kqswnal_initialise (void)
        if (kqswnal_data.kqn_ep == NULL)
        {
                CERROR ("Can't get elan device 0\n");
-               return (-ENODEV);
+               kqswnal_shutdown(nal);
+               return (PTL_IFACE_INVALID);
        }
 #endif
 
@@ -508,8 +447,8 @@ kqswnal_initialise (void)
        if (kqswnal_data.kqn_eptx == NULL)
        {
                CERROR ("Can't allocate transmitter\n");
-               kqswnal_finalise ();
-               return (-ENOMEM);
+               kqswnal_shutdown (nal);
+               return (PTL_NO_SPACE);
        }
 
        /**********************************************************************/
@@ -521,8 +460,8 @@ kqswnal_initialise (void)
        if (kqswnal_data.kqn_eprx_small == NULL)
        {
                CERROR ("Can't install small msg receiver\n");
-               kqswnal_finalise ();
-               return (-ENOMEM);
+               kqswnal_shutdown (nal);
+               return (PTL_NO_SPACE);
        }
 
        kqswnal_data.kqn_eprx_large = ep_alloc_rcvr (kqswnal_data.kqn_ep,
@@ -531,8 +470,8 @@ kqswnal_initialise (void)
        if (kqswnal_data.kqn_eprx_large == NULL)
        {
                CERROR ("Can't install large msg receiver\n");
-               kqswnal_finalise ();
-               return (-ENOMEM);
+               kqswnal_shutdown (nal);
+               return (PTL_NO_SPACE);
        }
 
        /**********************************************************************/
@@ -546,8 +485,8 @@ kqswnal_initialise (void)
                                EP_PERM_WRITE);
        if (kqswnal_data.kqn_ep_tx_nmh == NULL) {
                CERROR("Can't reserve tx dma space\n");
-               kqswnal_finalise();
-               return (-ENOMEM);
+               kqswnal_shutdown(nal);
+               return (PTL_NO_SPACE);
        }
 #else
         dmareq.Waitfn   = DDI_DMA_SLEEP;
@@ -561,8 +500,8 @@ kqswnal_initialise (void)
        if (rc != DDI_SUCCESS)
        {
                CERROR ("Can't reserve rx dma space\n");
-               kqswnal_finalise ();
-               return (-ENOMEM);
+               kqswnal_shutdown (nal);
+               return (PTL_NO_SPACE);
        }
 #endif
        /**********************************************************************/
@@ -575,8 +514,8 @@ kqswnal_initialise (void)
                                EP_PERM_WRITE);
        if (kqswnal_data.kqn_ep_tx_nmh == NULL) {
                CERROR("Can't reserve rx dma space\n");
-               kqswnal_finalise();
-               return (-ENOMEM);
+               kqswnal_shutdown(nal);
+               return (PTL_NO_SPACE);
        }
 #else
         dmareq.Waitfn   = DDI_DMA_SLEEP;
@@ -591,8 +530,8 @@ kqswnal_initialise (void)
        if (rc != DDI_SUCCESS)
        {
                CERROR ("Can't reserve rx dma space\n");
-               kqswnal_finalise ();
-               return (-ENOMEM);
+               kqswnal_shutdown (nal);
+               return (PTL_NO_SPACE);
        }
 #endif
        /**********************************************************************/
@@ -606,18 +545,19 @@ kqswnal_initialise (void)
 
                PORTAL_ALLOC (ktx, sizeof(*ktx));
                if (ktx == NULL) {
-                       kqswnal_finalise ();
-                       return (-ENOMEM);
+                       kqswnal_shutdown (nal);
+                       return (PTL_NO_SPACE);
                }
 
+               memset(ktx, 0, sizeof(*ktx));   /* NULL pointers; zero flags */
                ktx->ktx_alloclist = kqswnal_data.kqn_txds;
                kqswnal_data.kqn_txds = ktx;
 
                PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
                if (ktx->ktx_buffer == NULL)
                {
-                       kqswnal_finalise ();
-                       return (-ENOMEM);
+                       kqswnal_shutdown (nal);
+                       return (PTL_NO_SPACE);
                }
 
                /* Map pre-allocated buffer NOW, to save latency on transmit */
@@ -640,6 +580,9 @@ kqswnal_initialise (void)
                INIT_LIST_HEAD (&ktx->ktx_delayed_list);
 
                ktx->ktx_state = KTX_IDLE;
+#if MULTIRAIL_EKC
+               ktx->ktx_rail = -1;             /* unset rail */
+#endif
                ktx->ktx_isnblk = (i >= KQSW_NTXMSGS);
                list_add_tail (&ktx->ktx_list, 
                               ktx->ktx_isnblk ? &kqswnal_data.kqn_nblk_idletxds :
@@ -648,7 +591,6 @@ kqswnal_initialise (void)
 
        /**********************************************************************/
        /* Allocate/Initialise receive descriptors */
-
        kqswnal_data.kqn_rxds = NULL;
        elan_page_idx = 0;
        for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
@@ -662,10 +604,11 @@ kqswnal_initialise (void)
 
                PORTAL_ALLOC(krx, sizeof(*krx));
                if (krx == NULL) {
-                       kqswnal_finalise();
-                       return (-ENOSPC);
+                       kqswnal_shutdown(nal);
+                       return (PTL_NO_SPACE);
                }
 
+               memset(krx, 0, sizeof(*krx)); /* clear flags, null pointers etc */
                krx->krx_alloclist = kqswnal_data.kqn_rxds;
                kqswnal_data.kqn_rxds = krx;
 
@@ -686,8 +629,8 @@ kqswnal_initialise (void)
                        struct page *page = alloc_page(GFP_KERNEL);
                        
                        if (page == NULL) {
-                               kqswnal_finalise ();
-                               return (-ENOMEM);
+                               kqswnal_shutdown (nal);
+                               return (PTL_NO_SPACE);
                        }
 
                        krx->krx_kiov[j].kiov_page = page;
@@ -731,21 +674,26 @@ kqswnal_initialise (void)
        /**********************************************************************/
        /* Network interface ready to initialise */
 
-        rc = PtlNIInit(kqswnal_init, 32, 4, 0, &kqswnal_ni);
-        if (rc != 0)
+       my_process_id.nid = kqswnal_elanid2nid(kqswnal_data.kqn_elanid);
+       my_process_id.pid = requested_pid;
+
+       rc = lib_init(&kqswnal_lib, nal, my_process_id,
+                     requested_limits, actual_limits);
+        if (rc != PTL_OK)
        {
-               CERROR ("PtlNIInit failed %d\n", rc);
-               kqswnal_finalise ();
-               return (-ENOMEM);
+               CERROR ("lib_init failed %d\n", rc);
+               kqswnal_shutdown (nal);
+               return (rc);
        }
 
-       kqswnal_data.kqn_init = KQN_INIT_PTL;
+       kqswnal_data.kqn_init = KQN_INIT_LIB;
 
        /**********************************************************************/
        /* Queue receives, now that it's OK to run their completion callbacks */
 
-       for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx =krx->krx_alloclist){
+       for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
                /* NB this enqueue can allocate/sleep (attr == 0) */
+               krx->krx_state = KRX_POSTED;
 #if MULTIRAIL_EKC
                rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx,
                                      &krx->krx_elanbuffer, 0);
@@ -757,8 +705,8 @@ kqswnal_initialise (void)
                if (rc != EP_SUCCESS)
                {
                        CERROR ("failed ep_queue_receive %d\n", rc);
-                       kqswnal_finalise ();
-                       return (-ENOMEM);
+                       kqswnal_shutdown (nal);
+                       return (PTL_FAIL);
                }
        }
 
@@ -769,8 +717,8 @@ kqswnal_initialise (void)
                if (rc != 0)
                {
                        CERROR ("failed to spawn scheduling thread: %d\n", rc);
-                       kqswnal_finalise ();
-                       return (rc);
+                       kqswnal_shutdown (nal);
+                       return (PTL_FAIL);
                }
        }
 
@@ -779,19 +727,13 @@ kqswnal_initialise (void)
        rc = kpr_register (&kqswnal_data.kqn_router, &kqswnal_router_interface);
        CDEBUG(D_NET, "Can't initialise routing interface (rc = %d): not routing\n",rc);
 
-       rc = kportal_nal_register (QSWNAL, &kqswnal_cmd, NULL);
+       rc = libcfs_nal_cmd_register (QSWNAL, &kqswnal_cmd, NULL);
        if (rc != 0) {
                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
-               kqswnal_finalise ();
-               return (rc);
+               kqswnal_shutdown (nal);
+               return (PTL_FAIL);
        }
 
-#if CONFIG_SYSCTL
-        /* Press on regardless even if registering sysctl doesn't work */
-        kqswnal_data.kqn_sysctl = register_sysctl_table (kqswnal_top_ctl_table, 0);
-#endif
-
-       PORTAL_SYMBOL_REGISTER(kqswnal_ni);
        kqswnal_data.kqn_init = KQN_INIT_ALL;
 
        printk(KERN_INFO "Lustre: Routing QSW NAL loaded on node %d of %d "
@@ -800,9 +742,55 @@ kqswnal_initialise (void)
               kpr_routing (&kqswnal_data.kqn_router) ? "enabled" : "disabled",
               pkmem);
 
-       return (0);
+       return (PTL_OK);
+}
+
+void __exit
+kqswnal_finalise (void)
+{
+#if CONFIG_SYSCTL
+       if (kqswnal_tunables.kqn_sysctl != NULL)
+               unregister_sysctl_table (kqswnal_tunables.kqn_sysctl);
+#endif
+       PtlNIFini(kqswnal_ni);
+
+       ptl_unregister_nal(QSWNAL);
 }
 
+static int __init
+kqswnal_initialise (void)
+{
+       int   rc;
+
+       kqswnal_api.nal_ni_init = kqswnal_startup;
+       kqswnal_api.nal_ni_fini = kqswnal_shutdown;
+
+       /* Initialise dynamic tunables to defaults once only */
+       kqswnal_tunables.kqn_optimized_puts = KQSW_OPTIMIZED_PUTS;
+       kqswnal_tunables.kqn_optimized_gets = KQSW_OPTIMIZED_GETS;
+       
+       rc = ptl_register_nal(QSWNAL, &kqswnal_api);
+       if (rc != PTL_OK) {
+               CERROR("Can't register QSWNAL: %d\n", rc);
+               return (-ENOMEM);               /* or something... */
+       }
+
+       /* Pure gateways, and the workaround for 'EKC blocks forever until
+        * the service is active' want the NAL started up at module load
+        * time... */
+       rc = PtlNIInit(QSWNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kqswnal_ni);
+       if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
+               ptl_unregister_nal(QSWNAL);
+               return (-ENODEV);
+       }
+
+#if CONFIG_SYSCTL
+        /* Press on regardless even if registering sysctl doesn't work */
+        kqswnal_tunables.kqn_sysctl = 
+               register_sysctl_table (kqswnal_top_ctl_table, 0);
+#endif
+       return (0);
+}
 
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
 MODULE_DESCRIPTION("Kernel Quadrics/Elan NAL v1.01");
@@ -810,5 +798,3 @@ MODULE_LICENSE("GPL");
 
 module_init (kqswnal_initialise);
 module_exit (kqswnal_finalise);
-
-EXPORT_SYMBOL (kqswnal_ni);
index f96893f..b08d710 100644 (file)
@@ -18,7 +18,7 @@
  *   along with Lustre; if not, write to the Free Software
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  *
- * Basic library routines.
+ * Basic library routines. 
  *
  */
 
@@ -99,17 +99,18 @@ typedef unsigned long kqsw_csum_t;
 #define KQSW_TX_MAXCONTIG               (1<<10) /* largest payload that gets made contiguous on transmit */
 
 #define KQSW_NTXMSGS                    8       /* # normal transmit messages */
-#define KQSW_NNBLK_TXMSGS               256     /* # reserved transmit messages if can't block */
+#define KQSW_NNBLK_TXMSGS               512     /* # reserved transmit messages if can't block */
 
 #define KQSW_NRXMSGS_LARGE              64      /* # large receive buffers */
-#define KQSW_EP_ENVELOPES_LARGE         128     /* # large ep envelopes */
+#define KQSW_EP_ENVELOPES_LARGE         256     /* # large ep envelopes */
 
 #define KQSW_NRXMSGS_SMALL              256     /* # small receive buffers */
 #define KQSW_EP_ENVELOPES_SMALL         2048    /* # small ep envelopes */
 
 #define KQSW_RESCHED                    100     /* # busy loops that forces scheduler to yield */
 
-#define KQSW_OPTIMIZED_GETS             1       /* optimized gets? */
+#define KQSW_OPTIMIZED_GETS             1       /* optimize gets >= this size */
+#define KQSW_OPTIMIZED_PUTS            (32<<10) /* optimize puts >= this size */
 #define KQSW_COPY_SMALL_FWD             0       /* copy small fwd messages to pre-mapped buffer? */
 
 /*
@@ -157,12 +158,18 @@ typedef struct kqswnal_rx
         int              krx_npages;            /* # pages in receive buffer */
         int              krx_nob;               /* Number Of Bytes received into buffer */
         int              krx_rpc_reply_needed;  /* peer waiting for EKC RPC reply */
-        int              krx_rpc_reply_sent;    /* rpc reply sent */
+        int              krx_rpc_reply_status;  /* what status to send */
+        int              krx_state;             /* what this RX is doing */
         atomic_t         krx_refcount;          /* how to tell when rpc is done */
         kpr_fwd_desc_t   krx_fwd;               /* embedded forwarding descriptor */
         ptl_kiov_t       krx_kiov[KQSW_NRXMSGPAGES_LARGE]; /* buffer frags */
 }  kqswnal_rx_t;
 
+#define KRX_POSTED       1                      /* receiving */
+#define KRX_PARSE        2                      /* ready to be parsed */
+#define KRX_COMPLETING   3                      /* waiting to be completed */
+
+
 typedef struct kqswnal_tx
 {
         struct list_head  ktx_list;             /* enqueue idle/active */
@@ -176,16 +183,16 @@ typedef struct kqswnal_tx
         int               ktx_nmappedpages;     /* # pages mapped for current message */
         int               ktx_port;             /* destination ep port */
         ptl_nid_t         ktx_nid;              /* destination node */
-        void             *ktx_args[2];          /* completion passthru */
+        void             *ktx_args[3];          /* completion passthru */
         char             *ktx_buffer;           /* pre-allocated contiguous buffer for hdr + small payloads */
         unsigned long     ktx_launchtime;       /* when (in jiffies) the transmit was launched */
 
         /* debug/info fields */
         pid_t             ktx_launcher;         /* pid of launching process */
-        ptl_hdr_t        *ktx_wire_hdr;         /* portals header (wire endian) */
 
         int               ktx_nfrag;            /* # message frags */
 #if MULTIRAIL_EKC
+        int               ktx_rail;             /* preferred rail */
         EP_NMD            ktx_ebuffer;          /* elan mapping of ktx_buffer */
         EP_NMD            ktx_frags[EP_MAXFRAG];/* elan mapping of msg frags */
 #else
@@ -195,23 +202,28 @@ typedef struct kqswnal_tx
 } kqswnal_tx_t;
 
 #define KTX_IDLE        0                       /* on kqn_(nblk_)idletxds */
-#define KTX_SENDING     1                       /* local send */
-#define KTX_FORWARDING  2                       /* routing a packet */
-#define KTX_GETTING     3                       /* local optimised get */
+#define KTX_FORWARDING  1                       /* sending a forwarded packet */
+#define KTX_SENDING     2                       /* normal send */
+#define KTX_GETTING     3                       /* sending optimised get */
+#define KTX_PUTTING     4                       /* sending optimised put */
+#define KTX_RDMAING     5                       /* handling optimised put/get */
+
+typedef struct
+{
+        /* dynamic tunables... */
+        int                      kqn_optimized_puts;  /* optimized PUTs? */
+        int                      kqn_optimized_gets;  /* optimized GETs? */
+#if CONFIG_SYSCTL
+        struct ctl_table_header *kqn_sysctl;          /* sysctl interface */
+#endif        
+} kqswnal_tunables_t;
 
 typedef struct
 {
         char               kqn_init;            /* what's been initialised */
         char               kqn_shuttingdown;    /* I'm trying to shut down */
-        atomic_t           kqn_nthreads;        /* # threads not terminated */
-        atomic_t           kqn_nthreads_running;/* # threads still running */
-
-        int                kqn_optimized_gets;  /* optimized GETs? */
-        int                kqn_copy_small_fwd;  /* fwd small msgs from pre-allocated buffer? */
+        atomic_t           kqn_nthreads;        /* # threads running */
 
-#if CONFIG_SYSCTL
-        struct ctl_table_header *kqn_sysctl;    /* sysctl interface */
-#endif        
         kqswnal_rx_t      *kqn_rxds;            /* stack of all the receive descriptors */
         kqswnal_tx_t      *kqn_txds;            /* stack of all the transmit descriptors */
 
@@ -221,6 +233,7 @@ typedef struct
         spinlock_t         kqn_idletxd_lock;    /* serialise idle txd access */
         wait_queue_head_t  kqn_idletxd_waitq;   /* sender blocks here waiting for idle txd */
         struct list_head   kqn_idletxd_fwdq;    /* forwarded packets block here waiting for idle txd */
+        atomic_t           kqn_pending_txs;     /* # transmits being prepped */
         
         spinlock_t         kqn_sched_lock;      /* serialise packet schedulers */
         wait_queue_head_t  kqn_sched_waitq;     /* scheduler blocks here */
@@ -229,8 +242,6 @@ typedef struct
         struct list_head   kqn_delayedfwds;     /* delayed forwards */
         struct list_head   kqn_delayedtxds;     /* delayed transmits */
 
-        spinlock_t         kqn_statelock;       /* cb_cli/cb_sti */
-        nal_cb_t          *kqn_cb;              /* -> kqswnal_lib */
 #if MULTIRAIL_EKC
         EP_SYS            *kqn_ep;              /* elan system */
         EP_NMH            *kqn_ep_tx_nmh;       /* elan reserved tx vaddrs */
@@ -248,28 +259,27 @@ typedef struct
         ptl_nid_t          kqn_nid_offset;      /* this cluster's NID offset */
         int                kqn_nnodes;          /* this cluster's size */
         int                kqn_elanid;          /* this nodes's elan ID */
+
+        EP_STATUSBLK       kqn_rpc_success;     /* preset RPC reply status blocks */
+        EP_STATUSBLK       kqn_rpc_failed;
 }  kqswnal_data_t;
 
 /* kqn_init state */
 #define KQN_INIT_NOTHING        0               /* MUST BE ZERO so zeroed state is initialised OK */
 #define KQN_INIT_DATA           1
-#define KQN_INIT_PTL            2
+#define KQN_INIT_LIB            2
 #define KQN_INIT_ALL            3
 
-extern nal_cb_t        kqswnal_lib;
-extern nal_t           kqswnal_api;
-extern kqswnal_data_t  kqswnal_data;
-
-/* global pre-prepared replies to keep off the stack */
-extern EP_STATUSBLK    kqswnal_rpc_success;
-extern EP_STATUSBLK    kqswnal_rpc_failed;
+extern lib_nal_t           kqswnal_lib;
+extern nal_t               kqswnal_api;
+extern kqswnal_tunables_t  kqswnal_tunables;
+extern kqswnal_data_t      kqswnal_data;
 
 extern int kqswnal_thread_start (int (*fn)(void *arg), void *arg);
 extern void kqswnal_rxhandler(EP_RXD *rxd);
 extern int kqswnal_scheduler (void *);
 extern void kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
-extern void kqswnal_dma_reply_complete (EP_RXD *rxd);
-extern void kqswnal_requeue_rx (kqswnal_rx_t *krx);
+extern void kqswnal_rx_done (kqswnal_rx_t *krx);
 
 static inline ptl_nid_t
 kqswnal_elanid2nid (int elanid) 
@@ -288,6 +298,12 @@ kqswnal_nid2elanid (ptl_nid_t nid)
         return (nid - kqswnal_data.kqn_nid_offset);
 }
 
+static inline ptl_nid_t
+kqswnal_rx_nid(kqswnal_rx_t *krx) 
+{
+        return (kqswnal_elanid2nid(ep_rxd_node(krx->krx_rxd)));
+}
+
 static inline int
 kqswnal_pages_spanned (void *base, int nob)
 {
@@ -310,11 +326,11 @@ static inline kqsw_csum_t kqsw_csum (kqsw_csum_t sum, void *base, int nob)
 }
 #endif
 
-static inline void kqswnal_rx_done (kqswnal_rx_t *krx)
+static inline void kqswnal_rx_decref (kqswnal_rx_t *krx)
 {
         LASSERT (atomic_read (&krx->krx_refcount) > 0);
         if (atomic_dec_and_test (&krx->krx_refcount))
-                kqswnal_requeue_rx(krx);
+                kqswnal_rx_done(krx);
 }
 
 #if MULTIRAIL_EKC
index 08453a0..97b5a26 100644 (file)
 
 #include "qswnal.h"
 
-EP_STATUSBLK  kqswnal_rpc_success;
-EP_STATUSBLK  kqswnal_rpc_failed;
-
 /*
  *  LIB functions follow
  *
  */
-static ptl_err_t
-kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr,
-             size_t len)
-{
-        CDEBUG (D_NET, LPX64": reading "LPSZ" bytes from %p -> %p\n",
-                nal->ni.nid, len, src_addr, dst_addr );
-        memcpy( dst_addr, src_addr, len );
-
-        return (PTL_OK);
-}
-
-static ptl_err_t
-kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr,
-              size_t len)
-{
-        CDEBUG (D_NET, LPX64": writing "LPSZ" bytes from %p -> %p\n",
-                nal->ni.nid, len, src_addr, dst_addr );
-        memcpy( dst_addr, src_addr, len );
-
-        return (PTL_OK);
-}
-
-static void *
-kqswnal_malloc(nal_cb_t *nal, size_t len)
-{
-        void *buf;
-
-        PORTAL_ALLOC(buf, len);
-        return (buf);
-}
-
-static void
-kqswnal_free(nal_cb_t *nal, void *buf, size_t len)
-{
-        PORTAL_FREE(buf, len);
-}
-
-static void
-kqswnal_printf (nal_cb_t * nal, const char *fmt, ...)
-{
-        va_list ap;
-        char msg[256];
-
-        va_start (ap, fmt);
-        vsnprintf (msg, sizeof (msg), fmt, ap);        /* sprint safely */
-        va_end (ap);
-
-        msg[sizeof (msg) - 1] = 0;                /* ensure terminated */
-
-        CDEBUG (D_NET, "%s", msg);
-}
-
-#if (defined(CONFIG_SPARC32) || defined(CONFIG_SPARC64))
-# error "Can't save/restore irq contexts in different procedures"
-#endif
-
-static void
-kqswnal_cli(nal_cb_t *nal, unsigned long *flags)
-{
-        kqswnal_data_t *data= nal->nal_data;
-
-        spin_lock_irqsave(&data->kqn_statelock, *flags);
-}
-
-
-static void
-kqswnal_sti(nal_cb_t *nal, unsigned long *flags)
-{
-        kqswnal_data_t *data= nal->nal_data;
-
-        spin_unlock_irqrestore(&data->kqn_statelock, *flags);
-}
-
-
 static int
-kqswnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+kqswnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
 {
-        if (nid == nal->ni.nid)
+        if (nid == nal->libnal_ni.ni_pid.nid)
                 *dist = 0;                      /* it's me */
         else if (kqswnal_nid2elanid (nid) >= 0)
                 *dist = 1;                      /* it's my peer */
@@ -136,6 +59,8 @@ kqswnal_unmap_tx (kqswnal_tx_t *ktx)
 {
 #if MULTIRAIL_EKC
         int      i;
+
+        ktx->ktx_rail = -1;                     /* unset rail */
 #endif
 
         if (ktx->ktx_nmappedpages == 0)
@@ -174,10 +99,13 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_
         char     *ptr;
 #if MULTIRAIL_EKC
         EP_RAILMASK railmask;
-        int         rail = ep_xmtr_prefrail(kqswnal_data.kqn_eptx,
-                                            EP_RAILMASK_ALL,
-                                            kqswnal_nid2elanid(ktx->ktx_nid));
-        
+        int         rail;
+
+        if (ktx->ktx_rail < 0)
+                ktx->ktx_rail = ep_xmtr_prefrail(kqswnal_data.kqn_eptx,
+                                                 EP_RAILMASK_ALL,
+                                                 kqswnal_nid2elanid(ktx->ktx_nid));
+        rail = ktx->ktx_rail;
         if (rail < 0) {
                 CERROR("No rails available for "LPX64"\n", ktx->ktx_nid);
                 return (-ENETDOWN);
@@ -201,11 +129,12 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_
         do {
                 int  fraglen = kiov->kiov_len - offset;
 
-                /* nob exactly spans the iovs */
-                LASSERT (fraglen <= nob);
-                /* each frag fits in a page */
+                /* each page frag is contained in one page */
                 LASSERT (kiov->kiov_offset + kiov->kiov_len <= PAGE_SIZE);
 
+                if (fraglen > nob)
+                        fraglen = nob;
+
                 nmapped++;
                 if (nmapped > maxmapped) {
                         CERROR("Can't map message in %d pages (max %d)\n",
@@ -291,10 +220,13 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob,
         uint32_t  basepage  = ktx->ktx_basepage + nmapped;
 #if MULTIRAIL_EKC
         EP_RAILMASK railmask;
-        int         rail = ep_xmtr_prefrail(kqswnal_data.kqn_eptx,
-                                            EP_RAILMASK_ALL,
-                                            kqswnal_nid2elanid(ktx->ktx_nid));
+        int         rail;
         
+        if (ktx->ktx_rail < 0)
+                ktx->ktx_rail = ep_xmtr_prefrail(kqswnal_data.kqn_eptx,
+                                                 EP_RAILMASK_ALL,
+                                                 kqswnal_nid2elanid(ktx->ktx_nid));
+        rail = ktx->ktx_rail;
         if (rail < 0) {
                 CERROR("No rails available for "LPX64"\n", ktx->ktx_nid);
                 return (-ENETDOWN);
@@ -317,11 +249,12 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob,
         
         do {
                 int  fraglen = iov->iov_len - offset;
-                long npages  = kqswnal_pages_spanned (iov->iov_base, fraglen);
-
-                /* nob exactly spans the iovs */
-                LASSERT (fraglen <= nob);
+                long npages;
                 
+                if (fraglen > nob)
+                        fraglen = nob;
+                npages = kqswnal_pages_spanned (iov->iov_base, fraglen);
+
                 nmapped += npages;
                 if (nmapped > maxmapped) {
                         CERROR("Can't map message in %d pages (max %d)\n",
@@ -415,7 +348,8 @@ kqswnal_put_idle_tx (kqswnal_tx_t *ktx)
         list_add (&ktx->ktx_list, &kqswnal_data.kqn_idletxds);
 
         /* anything blocking for a tx descriptor? */
-        if (!list_empty(&kqswnal_data.kqn_idletxd_fwdq)) /* forwarded packet? */
+        if (!kqswnal_data.kqn_shuttingdown &&
+            !list_empty(&kqswnal_data.kqn_idletxd_fwdq)) /* forwarded packet? */
         {
                 CDEBUG(D_NET,"wakeup fwd\n");
 
@@ -449,6 +383,9 @@ kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block)
         for (;;) {
                 spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags);
 
+                if (kqswnal_data.kqn_shuttingdown)
+                        break;
+
                 /* "normal" descriptor is free */
                 if (!list_empty (&kqswnal_data.kqn_idletxds)) {
                         ktx = list_entry (kqswnal_data.kqn_idletxds.next,
@@ -456,14 +393,8 @@ kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block)
                         break;
                 }
 
-                /* "normal" descriptor pool is empty */
-
-                if (fwd != NULL) { /* forwarded packet => queue for idle txd */
-                        CDEBUG (D_NET, "blocked fwd [%p]\n", fwd);
-                        list_add_tail (&fwd->kprfd_list,
-                                       &kqswnal_data.kqn_idletxd_fwdq);
+                if (fwd != NULL)                /* forwarded packet? */
                         break;
-                }
 
                 /* doing a local transmit */
                 if (!may_block) {
@@ -483,13 +414,20 @@ kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block)
 
                 CDEBUG (D_NET, "blocking for tx desc\n");
                 wait_event (kqswnal_data.kqn_idletxd_waitq,
-                            !list_empty (&kqswnal_data.kqn_idletxds));
+                            !list_empty (&kqswnal_data.kqn_idletxds) ||
+                            kqswnal_data.kqn_shuttingdown);
         }
 
         if (ktx != NULL) {
                 list_del (&ktx->ktx_list);
                 list_add (&ktx->ktx_list, &kqswnal_data.kqn_activetxds);
                 ktx->ktx_launcher = current->pid;
+                atomic_inc(&kqswnal_data.kqn_pending_txs);
+        } else if (fwd != NULL) {
+                /* queue forwarded packet until idle txd available */
+                CDEBUG (D_NET, "blocked fwd [%p]\n", fwd);
+                list_add_tail (&fwd->kprfd_list,
+                               &kqswnal_data.kqn_idletxd_fwdq);
         }
 
         spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
@@ -503,40 +441,29 @@ kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block)
 void
 kqswnal_tx_done (kqswnal_tx_t *ktx, int error)
 {
-        lib_msg_t     *msg;
-        lib_msg_t     *repmsg = NULL;
-
         switch (ktx->ktx_state) {
         case KTX_FORWARDING:       /* router asked me to forward this packet */
                 kpr_fwd_done (&kqswnal_data.kqn_router,
                               (kpr_fwd_desc_t *)ktx->ktx_args[0], error);
                 break;
 
-        case KTX_SENDING:          /* packet sourced locally */
-                lib_finalize (&kqswnal_lib, ktx->ktx_args[0],
+        case KTX_RDMAING:          /* optimized GET/PUT handled */
+        case KTX_PUTTING:          /* optimized PUT sent */
+        case KTX_SENDING:          /* normal send */
+                lib_finalize (&kqswnal_lib, NULL,
                               (lib_msg_t *)ktx->ktx_args[1],
-                              (error == 0) ? PTL_OK : 
-                              (error == -ENOMEM) ? PTL_NOSPACE : PTL_FAIL);
+                              (error == 0) ? PTL_OK : PTL_FAIL);
                 break;
 
-        case KTX_GETTING:          /* Peer has DMA-ed direct? */
-                msg = (lib_msg_t *)ktx->ktx_args[1];
-
-                if (error == 0) {
-                        repmsg = lib_fake_reply_msg (&kqswnal_lib, 
-                                                     ktx->ktx_nid, msg->md);
-                        if (repmsg == NULL)
-                                error = -ENOMEM;
-                }
-                
-                if (error == 0) {
-                        lib_finalize (&kqswnal_lib, ktx->ktx_args[0], 
-                                      msg, PTL_OK);
-                        lib_finalize (&kqswnal_lib, NULL, repmsg, PTL_OK);
-                } else {
-                        lib_finalize (&kqswnal_lib, ktx->ktx_args[0], msg,
-                                      (error == -ENOMEM) ? PTL_NOSPACE : PTL_FAIL);
-                }
+        case KTX_GETTING:          /* optimized GET sent & REPLY received */
+                /* Complete the GET with success since we can't avoid
+                 * delivering a REPLY event; we committed to it when we
+                 * launched the GET */
+                lib_finalize (&kqswnal_lib, NULL, 
+                              (lib_msg_t *)ktx->ktx_args[1], PTL_OK);
+                lib_finalize (&kqswnal_lib, NULL,
+                              (lib_msg_t *)ktx->ktx_args[2],
+                              (error == 0) ? PTL_OK : PTL_FAIL);
                 break;
 
         default:
@@ -564,16 +491,27 @@ kqswnal_txhandler(EP_TXD *txd, void *arg, int status)
                 kqswnal_notify_peer_down(ktx);
                 status = -EHOSTDOWN;
 
-        } else if (ktx->ktx_state == KTX_GETTING) {
-                /* RPC completed OK; what did our peer put in the status
+        } else switch (ktx->ktx_state) {
+
+        case KTX_GETTING:
+        case KTX_PUTTING:
+                /* RPC completed OK; but what did our peer put in the status
                  * block? */
 #if MULTIRAIL_EKC
                 status = ep_txd_statusblk(txd)->Data[0];
 #else
                 status = ep_txd_statusblk(txd)->Status;
 #endif
-        } else {
+                break;
+                
+        case KTX_FORWARDING:
+        case KTX_SENDING:
                 status = 0;
+                break;
+                
+        default:
+                LBUG();
+                break;
         }
 
         kqswnal_tx_done (ktx, status);
@@ -590,22 +528,29 @@ kqswnal_launch (kqswnal_tx_t *ktx)
 
         ktx->ktx_launchtime = jiffies;
 
+        if (kqswnal_data.kqn_shuttingdown)
+                return (-ESHUTDOWN);
+
         LASSERT (dest >= 0);                    /* must be a peer */
-        if (ktx->ktx_state == KTX_GETTING) {
-                /* NB ktx_frag[0] is the GET hdr + kqswnal_remotemd_t.  The
-                 * other frags are the GET sink which we obviously don't
-                 * send here :) */
+
 #if MULTIRAIL_EKC
+        if (ktx->ktx_nmappedpages != 0)
+                attr = EP_SET_PREFRAIL(attr, ktx->ktx_rail);
+#endif
+
+        switch (ktx->ktx_state) {
+        case KTX_GETTING:
+        case KTX_PUTTING:
+                /* NB ktx_frag[0] is the GET/PUT hdr + kqswnal_remotemd_t.
+                 * The other frags are the payload, awaiting RDMA */
                 rc = ep_transmit_rpc(kqswnal_data.kqn_eptx, dest,
                                      ktx->ktx_port, attr,
                                      kqswnal_txhandler, ktx,
                                      NULL, ktx->ktx_frags, 1);
-#else
-                rc = ep_transmit_rpc(kqswnal_data.kqn_eptx, dest,
-                                     ktx->ktx_port, attr, kqswnal_txhandler,
-                                     ktx, NULL, ktx->ktx_frags, 1);
-#endif
-        } else {
+                break;
+
+        case KTX_FORWARDING:
+        case KTX_SENDING:
 #if MULTIRAIL_EKC
                 rc = ep_transmit_message(kqswnal_data.kqn_eptx, dest,
                                          ktx->ktx_port, attr,
@@ -617,6 +562,12 @@ kqswnal_launch (kqswnal_tx_t *ktx)
                                        kqswnal_txhandler, ktx, 
                                        ktx->ktx_frags, ktx->ktx_nfrag);
 #endif
+                break;
+                
+        default:
+                LBUG();
+                rc = -EINVAL;                   /* no compiler warning please */
+                break;
         }
 
         switch (rc) {
@@ -624,8 +575,6 @@ kqswnal_launch (kqswnal_tx_t *ktx)
                 return (0);
 
         case EP_ENOMEM: /* can't allocate ep txd => queue for later */
-                LASSERT (in_interrupt());
-
                 spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
 
                 list_add_tail (&ktx->ktx_delayed_list, &kqswnal_data.kqn_delayedtxds);
@@ -641,6 +590,7 @@ kqswnal_launch (kqswnal_tx_t *ktx)
         }
 }
 
+#if 0
 static char *
 hdr_type_string (ptl_hdr_t *hdr)
 {
@@ -664,42 +614,42 @@ kqswnal_cerror_hdr(ptl_hdr_t * hdr)
         char *type_str = hdr_type_string (hdr);
 
         CERROR("P3 Header at %p of type %s length %d\n", hdr, type_str,
-               NTOH__u32(hdr->payload_length));
-        CERROR("    From nid/pid "LPU64"/%u\n", NTOH__u64(hdr->src_nid),
-               NTOH__u32(hdr->src_pid));
-        CERROR("    To nid/pid "LPU64"/%u\n", NTOH__u64(hdr->dest_nid),
-               NTOH__u32(hdr->dest_pid));
+               le32_to_cpu(hdr->payload_length));
+        CERROR("    From nid/pid "LPU64"/%u\n", le64_to_cpu(hdr->src_nid),
+               le32_to_cpu(hdr->src_pid));
+        CERROR("    To nid/pid "LPU64"/%u\n", le64_to_cpu(hdr->dest_nid),
+               le32_to_cpu(hdr->dest_pid));
 
-        switch (NTOH__u32(hdr->type)) {
+        switch (le32_to_cpu(hdr->type)) {
         case PTL_MSG_PUT:
                 CERROR("    Ptl index %d, ack md "LPX64"."LPX64", "
                        "match bits "LPX64"\n",
-                       NTOH__u32 (hdr->msg.put.ptl_index),
+                       le32_to_cpu(hdr->msg.put.ptl_index),
                        hdr->msg.put.ack_wmd.wh_interface_cookie,
                        hdr->msg.put.ack_wmd.wh_object_cookie,
-                       NTOH__u64 (hdr->msg.put.match_bits));
+                       le64_to_cpu(hdr->msg.put.match_bits));
                 CERROR("    offset %d, hdr data "LPX64"\n",
-                       NTOH__u32(hdr->msg.put.offset),
+                       le32_to_cpu(hdr->msg.put.offset),
                        hdr->msg.put.hdr_data);
                 break;
 
         case PTL_MSG_GET:
                 CERROR("    Ptl index %d, return md "LPX64"."LPX64", "
                        "match bits "LPX64"\n",
-                       NTOH__u32 (hdr->msg.get.ptl_index),
+                       le32_to_cpu(hdr->msg.get.ptl_index),
                        hdr->msg.get.return_wmd.wh_interface_cookie,
                        hdr->msg.get.return_wmd.wh_object_cookie,
                        hdr->msg.get.match_bits);
                 CERROR("    Length %d, src offset %d\n",
-                       NTOH__u32 (hdr->msg.get.sink_length),
-                       NTOH__u32 (hdr->msg.get.src_offset));
+                       le32_to_cpu(hdr->msg.get.sink_length),
+                       le32_to_cpu(hdr->msg.get.src_offset));
                 break;
 
         case PTL_MSG_ACK:
                 CERROR("    dst md "LPX64"."LPX64", manipulated length %d\n",
                        hdr->msg.ack.dst_wmd.wh_interface_cookie,
                        hdr->msg.ack.dst_wmd.wh_object_cookie,
-                       NTOH__u32 (hdr->msg.ack.mlength));
+                       le32_to_cpu(hdr->msg.ack.mlength));
                 break;
 
         case PTL_MSG_REPLY:
@@ -709,6 +659,7 @@ kqswnal_cerror_hdr(ptl_hdr_t * hdr)
         }
 
 }                               /* end of print_hdr() */
+#endif
 
 #if !MULTIRAIL_EKC
 void
@@ -770,114 +721,297 @@ kqswnal_eiovs2datav (int ndv, EP_DATAVEC *dv,
         CERROR ("DATAVEC too small\n");
         return (-E2BIG);
 }
+#else
+int
+kqswnal_check_rdma (int nlfrag, EP_NMD *lfrag,
+                    int nrfrag, EP_NMD *rfrag)
+{
+        int  i;
+
+        if (nlfrag != nrfrag) {
+                CERROR("Can't cope with unequal # frags: %d local %d remote\n",
+                       nlfrag, nrfrag);
+                return (-EINVAL);
+        }
+        
+        for (i = 0; i < nlfrag; i++)
+                if (lfrag[i].nmd_len != rfrag[i].nmd_len) {
+                        CERROR("Can't cope with unequal frags %d(%d):"
+                               " %d local %d remote\n",
+                               i, nlfrag, lfrag[i].nmd_len, rfrag[i].nmd_len);
+                        return (-EINVAL);
+                }
+        
+        return (0);
+}
 #endif
 
-int
-kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag, 
-                   struct iovec *iov, ptl_kiov_t *kiov, 
-                   int offset, int nob)
+kqswnal_remotemd_t *
+kqswnal_parse_rmd (kqswnal_rx_t *krx, int type, ptl_nid_t expected_nid)
 {
-        kqswnal_rx_t       *krx = (kqswnal_rx_t *)ktx->ktx_args[0];
         char               *buffer = (char *)page_address(krx->krx_kiov[0].kiov_page);
+        ptl_hdr_t          *hdr = (ptl_hdr_t *)buffer;
         kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(buffer + KQSW_HDR_SIZE);
-        int                 rc;
-#if MULTIRAIL_EKC
-        int                 i;
-#else
-        EP_DATAVEC          datav[EP_MAXFRAG];
-        int                 ndatav;
-#endif
-        LASSERT (krx->krx_rpc_reply_needed);
-        LASSERT ((iov == NULL) != (kiov == NULL));
+        ptl_nid_t           nid = kqswnal_rx_nid(krx);
+
+        /* Note (1) lib_parse has already flipped hdr.
+         *      (2) RDMA addresses are sent in native endian-ness.  When
+         *      EKC copes with different endian nodes, I'll fix this (and
+         *      eat my hat :) */
+
+        LASSERT (krx->krx_nob >= sizeof(*hdr));
+
+        if (hdr->type != type) {
+                CERROR ("Unexpected optimized get/put type %d (%d expected)"
+                        "from "LPX64"\n", hdr->type, type, nid);
+                return (NULL);
+        }
+        
+        if (hdr->src_nid != nid) {
+                CERROR ("Unexpected optimized get/put source NID "
+                        LPX64" from "LPX64"\n", hdr->src_nid, nid);
+                return (NULL);
+        }
+
+        LASSERT (nid == expected_nid);
 
-        /* see kqswnal_sendmsg comment regarding endian-ness */
         if (buffer + krx->krx_nob < (char *)(rmd + 1)) {
                 /* msg too small to discover rmd size */
                 CERROR ("Incoming message [%d] too small for RMD (%d needed)\n",
                         krx->krx_nob, (int)(((char *)(rmd + 1)) - buffer));
-                return (-EINVAL);
+                return (NULL);
         }
-        
+
         if (buffer + krx->krx_nob < (char *)&rmd->kqrmd_frag[rmd->kqrmd_nfrag]) {
                 /* rmd doesn't fit in the incoming message */
                 CERROR ("Incoming message [%d] too small for RMD[%d] (%d needed)\n",
                         krx->krx_nob, rmd->kqrmd_nfrag,
                         (int)(((char *)&rmd->kqrmd_frag[rmd->kqrmd_nfrag]) - buffer));
-                return (-EINVAL);
+                return (NULL);
         }
 
-        /* Map the source data... */
+        return (rmd);
+}
+
+void
+kqswnal_rdma_store_complete (EP_RXD *rxd) 
+{
+        int           status = ep_rxd_status(rxd);
+        kqswnal_tx_t *ktx = (kqswnal_tx_t *)ep_rxd_arg(rxd);
+        kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0];
+        
+        CDEBUG((status == EP_SUCCESS) ? D_NET : D_ERROR,
+               "rxd %p, ktx %p, status %d\n", rxd, ktx, status);
+
+        LASSERT (ktx->ktx_state == KTX_RDMAING);
+        LASSERT (krx->krx_rxd == rxd);
+        LASSERT (krx->krx_rpc_reply_needed);
+
+        krx->krx_rpc_reply_needed = 0;
+        kqswnal_rx_decref (krx);
+
+        /* free ktx & finalize() its lib_msg_t */
+        kqswnal_tx_done(ktx, (status == EP_SUCCESS) ? 0 : -ECONNABORTED);
+}
+
+void
+kqswnal_rdma_fetch_complete (EP_RXD *rxd) 
+{
+        /* Completed fetching the PUT data */
+        int           status = ep_rxd_status(rxd);
+        kqswnal_tx_t *ktx = (kqswnal_tx_t *)ep_rxd_arg(rxd);
+        kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0];
+        unsigned long flags;
+        
+        CDEBUG((status == EP_SUCCESS) ? D_NET : D_ERROR,
+               "rxd %p, ktx %p, status %d\n", rxd, ktx, status);
+
+        LASSERT (ktx->ktx_state == KTX_RDMAING);
+        LASSERT (krx->krx_rxd == rxd);
+        LASSERT (krx->krx_rpc_reply_needed);
+
+        /* Set the RPC completion status */
+        status = (status == EP_SUCCESS) ? 0 : -ECONNABORTED;
+        krx->krx_rpc_reply_status = status;
+
+        /* free ktx & finalize() its lib_msg_t */
+        kqswnal_tx_done(ktx, status);
+
+        if (!in_interrupt()) {
+                /* OK to complete the RPC now (iff I had the last ref) */
+                kqswnal_rx_decref (krx);
+                return;
+        }
+
+        LASSERT (krx->krx_state == KRX_PARSE);
+        krx->krx_state = KRX_COMPLETING;
+
+        /* Complete the RPC in thread context */
+        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+
+        list_add_tail (&krx->krx_list, &kqswnal_data.kqn_readyrxds);
+        wake_up (&kqswnal_data.kqn_sched_waitq);
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+}
+
+int
+kqswnal_rdma (kqswnal_rx_t *krx, lib_msg_t *libmsg, int type,
+              int niov, struct iovec *iov, ptl_kiov_t *kiov,
+              size_t offset, size_t len)
+{
+        kqswnal_remotemd_t *rmd;
+        kqswnal_tx_t       *ktx;
+        int                 eprc;
+        int                 rc;
+#if !MULTIRAIL_EKC
+        EP_DATAVEC          datav[EP_MAXFRAG];
+        int                 ndatav;
+#endif
+
+        LASSERT (type == PTL_MSG_GET || type == PTL_MSG_PUT);
+        /* Not both mapped and paged payload */
+        LASSERT (iov == NULL || kiov == NULL);
+        /* RPC completes with failure by default */
+        LASSERT (krx->krx_rpc_reply_needed);
+        LASSERT (krx->krx_rpc_reply_status != 0);
+
+        rmd = kqswnal_parse_rmd(krx, type, libmsg->ev.initiator.nid);
+        if (rmd == NULL)
+                return (-EPROTO);
+
+        if (len == 0) {
+                /* data got truncated to nothing. */
+                lib_finalize(&kqswnal_lib, krx, libmsg, PTL_OK);
+                /* Let kqswnal_rx_done() complete the RPC with success */
+                krx->krx_rpc_reply_status = 0;
+                return (0);
+        }
+        
+        /* NB I'm using 'ktx' just to map the local RDMA buffers; I'm not
+           actually sending a portals message with it */
+        ktx = kqswnal_get_idle_tx(NULL, 0);
+        if (ktx == NULL) {
+                CERROR ("Can't get txd for RDMA with "LPX64"\n",
+                        libmsg->ev.initiator.nid);
+                return (-ENOMEM);
+        }
+
+        ktx->ktx_state   = KTX_RDMAING;
+        ktx->ktx_nid     = libmsg->ev.initiator.nid;
+        ktx->ktx_args[0] = krx;
+        ktx->ktx_args[1] = libmsg;
+
+#if MULTIRAIL_EKC
+        /* Map on the rail the RPC prefers */
+        ktx->ktx_rail = ep_rcvr_prefrail(krx->krx_eprx,
+                                         ep_rxd_railmask(krx->krx_rxd));
+#endif
+
+        /* Start mapping at offset 0 (we're not mapping any headers) */
         ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0;
+        
         if (kiov != NULL)
-                rc = kqswnal_map_tx_kiov (ktx, offset, nob, nfrag, kiov);
+                rc = kqswnal_map_tx_kiov(ktx, offset, len, niov, kiov);
         else
-                rc = kqswnal_map_tx_iov (ktx, offset, nob, nfrag, iov);
+                rc = kqswnal_map_tx_iov(ktx, offset, len, niov, iov);
 
         if (rc != 0) {
-                CERROR ("Can't map source data: %d\n", rc);
-                return (rc);
+                CERROR ("Can't map local RDMA data: %d\n", rc);
+                goto out;
         }
 
 #if MULTIRAIL_EKC
-        if (ktx->ktx_nfrag != rmd->kqrmd_nfrag) {
-                CERROR("Can't cope with unequal # frags: %d local %d remote\n",
-                       ktx->ktx_nfrag, rmd->kqrmd_nfrag);
-                return (-EINVAL);
+        rc = kqswnal_check_rdma (ktx->ktx_nfrag, ktx->ktx_frags,
+                                 rmd->kqrmd_nfrag, rmd->kqrmd_frag);
+        if (rc != 0) {
+                CERROR ("Incompatible RDMA descriptors\n");
+                goto out;
         }
-        
-        for (i = 0; i < rmd->kqrmd_nfrag; i++)
-                if (ktx->ktx_frags[i].nmd_len != rmd->kqrmd_frag[i].nmd_len) {
-                        CERROR("Can't cope with unequal frags %d(%d):"
-                               " %d local %d remote\n",
-                               i, rmd->kqrmd_nfrag, 
-                               ktx->ktx_frags[i].nmd_len, 
-                               rmd->kqrmd_frag[i].nmd_len);
-                        return (-EINVAL);
-                }
 #else
-        ndatav = kqswnal_eiovs2datav (EP_MAXFRAG, datav,
-                                      ktx->ktx_nfrag, ktx->ktx_frags,
-                                      rmd->kqrmd_nfrag, rmd->kqrmd_frag);
+        switch (type) {
+        default:
+                LBUG();
+
+        case PTL_MSG_GET:
+                ndatav = kqswnal_eiovs2datav(EP_MAXFRAG, datav,
+                                             ktx->ktx_nfrag, ktx->ktx_frags,
+                                             rmd->kqrmd_nfrag, rmd->kqrmd_frag);
+                break;
+
+        case PTL_MSG_PUT:
+                ndatav = kqswnal_eiovs2datav(EP_MAXFRAG, datav,
+                                             rmd->kqrmd_nfrag, rmd->kqrmd_frag,
+                                             ktx->ktx_nfrag, ktx->ktx_frags);
+                break;
+        }
+                
         if (ndatav < 0) {
                 CERROR ("Can't create datavec: %d\n", ndatav);
-                return (ndatav);
+                rc = ndatav;
+                goto out;
         }
 #endif
 
-        /* Our caller will start to race with kqswnal_dma_reply_complete... */
-        LASSERT (atomic_read (&krx->krx_refcount) == 1);
-        atomic_set (&krx->krx_refcount, 2);
+        LASSERT (atomic_read(&krx->krx_refcount) > 0);
+        /* Take an extra ref for the completion callback */
+        atomic_inc(&krx->krx_refcount);
 
-#if MULTIRAIL_EKC
-        rc = ep_complete_rpc(krx->krx_rxd, kqswnal_dma_reply_complete, ktx, 
-                             &kqswnal_rpc_success,
-                             ktx->ktx_frags, rmd->kqrmd_frag, rmd->kqrmd_nfrag);
-        if (rc == EP_SUCCESS)
-                return (0);
+        switch (type) {
+        default:
+                LBUG();
 
-        /* Well we tried... */
-        krx->krx_rpc_reply_needed = 0;
+        case PTL_MSG_GET:
+#if MULTIRAIL_EKC
+                eprc = ep_complete_rpc(krx->krx_rxd, 
+                                       kqswnal_rdma_store_complete, ktx, 
+                                       &kqswnal_data.kqn_rpc_success,
+                                       ktx->ktx_frags, rmd->kqrmd_frag, rmd->kqrmd_nfrag);
 #else
-        rc = ep_complete_rpc (krx->krx_rxd, kqswnal_dma_reply_complete, ktx,
-                              &kqswnal_rpc_success, datav, ndatav);
-        if (rc == EP_SUCCESS)
-                return (0);
-
-        /* "old" EKC destroys rxd on failed completion */
-        krx->krx_rxd = NULL;
+                eprc = ep_complete_rpc (krx->krx_rxd, 
+                                        kqswnal_rdma_store_complete, ktx,
+                                        &kqswnal_data.kqn_rpc_success, 
+                                        datav, ndatav);
+                if (eprc != EP_SUCCESS) /* "old" EKC destroys rxd on failed completion */
+                        krx->krx_rxd = NULL;
 #endif
+                if (eprc != EP_SUCCESS) {
+                        CERROR("can't complete RPC: %d\n", eprc);
+                        /* don't re-attempt RPC completion */
+                        krx->krx_rpc_reply_needed = 0;
+                        rc = -ECONNABORTED;
+                }
+                break;
+                
+        case PTL_MSG_PUT:
+#if MULTIRAIL_EKC
+                eprc = ep_rpc_get (krx->krx_rxd, 
+                                   kqswnal_rdma_fetch_complete, ktx,
+                                   rmd->kqrmd_frag, ktx->ktx_frags, ktx->ktx_nfrag);
+#else
+                eprc = ep_rpc_get (krx->krx_rxd,
+                                   kqswnal_rdma_fetch_complete, ktx,
+                                   datav, ndatav);
+#endif
+                if (eprc != EP_SUCCESS) {
+                        CERROR("ep_rpc_get failed: %d\n", eprc);
+                        rc = -ECONNABORTED;
+                }
+                break;
+        }
 
-        CERROR("can't complete RPC: %d\n", rc);
-
-        /* reset refcount back to 1: we're not going to be racing with
-         * kqswnal_dma_reply_complete. */
-        atomic_set (&krx->krx_refcount, 1);
+ out:
+        if (rc != 0) {
+                kqswnal_rx_decref(krx);                 /* drop callback's ref */
+                kqswnal_put_idle_tx (ktx);
+        }
 
-        return (-ECONNABORTED);
+        atomic_dec(&kqswnal_data.kqn_pending_txs);
+        return (rc);
 }
 
 static ptl_err_t
-kqswnal_sendmsg (nal_cb_t     *nal,
+kqswnal_sendmsg (lib_nal_t    *nal,
                  void         *private,
                  lib_msg_t    *libmsg,
                  ptl_hdr_t    *hdr,
@@ -899,6 +1033,8 @@ kqswnal_sendmsg (nal_cb_t     *nal,
         int                sumoff;
         int                sumnob;
 #endif
+        /* NB 1. hdr is in network byte order */
+        /*    2. 'private' depends on the message type */
         
         CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid: "LPX64
                " pid %u\n", payload_nob, payload_niov, nid, pid);
@@ -910,13 +1046,22 @@ kqswnal_sendmsg (nal_cb_t     *nal,
         LASSERT (payload_kiov == NULL || !in_interrupt ());
         /* payload is either all vaddrs or all pages */
         LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
-        
+
         if (payload_nob > KQSW_MAXPAYLOAD) {
                 CERROR ("request exceeds MTU size "LPSZ" (max %u).\n",
                         payload_nob, KQSW_MAXPAYLOAD);
                 return (PTL_FAIL);
         }
 
+        if (type == PTL_MSG_REPLY &&            /* can I look in 'private' */
+            ((kqswnal_rx_t *)private)->krx_rpc_reply_needed) { /* is it an RPC */
+                /* Must be a REPLY for an optimized GET */
+                rc = kqswnal_rdma ((kqswnal_rx_t *)private, libmsg, PTL_MSG_GET,
+                                   payload_niov, payload_iov, payload_kiov, 
+                                   payload_offset, payload_nob);
+                return ((rc == 0) ? PTL_OK : PTL_FAIL);
+        }
+
         targetnid = nid;
         if (kqswnal_nid2elanid (nid) < 0) {     /* Can't send direct: find gateway? */
                 rc = kpr_lookup (&kqswnal_data.kqn_router, nid, 
@@ -939,40 +1084,18 @@ kqswnal_sendmsg (nal_cb_t     *nal,
                                           type == PTL_MSG_REPLY ||
                                           in_interrupt()));
         if (ktx == NULL) {
-                kqswnal_cerror_hdr (hdr);
-                return (PTL_NOSPACE);
+                CERROR ("Can't get txd for msg type %d for "LPX64"\n",
+                        type, libmsg->ev.initiator.nid);
+                return (PTL_NO_SPACE);
         }
 
+        ktx->ktx_state   = KTX_SENDING;
         ktx->ktx_nid     = targetnid;
         ktx->ktx_args[0] = private;
         ktx->ktx_args[1] = libmsg;
-
-        if (type == PTL_MSG_REPLY &&
-            ((kqswnal_rx_t *)private)->krx_rpc_reply_needed) {
-                if (nid != targetnid ||
-                    kqswnal_nid2elanid(nid) != 
-                    ep_rxd_node(((kqswnal_rx_t *)private)->krx_rxd)) {
-                        CERROR("Optimized reply nid conflict: "
-                               "nid "LPX64" via "LPX64" elanID %d\n",
-                               nid, targetnid,
-                               ep_rxd_node(((kqswnal_rx_t *)private)->krx_rxd));
-                        return (PTL_FAIL);
-                }
-
-                /* peer expects RPC completion with GET data */
-                rc = kqswnal_dma_reply (ktx, payload_niov, 
-                                        payload_iov, payload_kiov, 
-                                        payload_offset, payload_nob);
-                if (rc == 0)
-                        return (PTL_OK);
-                
-                CERROR ("Can't DMA reply to "LPX64": %d\n", nid, rc);
-                kqswnal_put_idle_tx (ktx);
-                return (PTL_FAIL);
-        }
+        ktx->ktx_args[2] = NULL;    /* set when a GET commits to REPLY */
 
         memcpy (ktx->ktx_buffer, hdr, sizeof (*hdr)); /* copy hdr from caller's stack */
-        ktx->ktx_wire_hdr = (ptl_hdr_t *)ktx->ktx_buffer;
 
 #if KQSW_CHECKSUM
         csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr));
@@ -1012,28 +1135,31 @@ kqswnal_sendmsg (nal_cb_t     *nal,
         memcpy(ktx->ktx_buffer + sizeof(*hdr) + sizeof(csum), &csum, sizeof(csum));
 #endif
 
-        if (kqswnal_data.kqn_optimized_gets &&
-            type == PTL_MSG_GET &&              /* doing a GET */
-            nid == targetnid) {                 /* not forwarding */
+        /* The first frag will be the pre-mapped buffer for (at least) the
+         * portals header. */
+        ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1;
+
+        if (nid == targetnid &&                 /* not forwarding */
+            ((type == PTL_MSG_GET &&            /* optimize GET? */
+              kqswnal_tunables.kqn_optimized_gets != 0 &&
+              le32_to_cpu(hdr->msg.get.sink_length) >= kqswnal_tunables.kqn_optimized_gets) ||
+             (type == PTL_MSG_PUT &&            /* optimize PUT? */
+              kqswnal_tunables.kqn_optimized_puts != 0 &&
+              payload_nob >= kqswnal_tunables.kqn_optimized_puts))) {
                 lib_md_t           *md = libmsg->md;
                 kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(ktx->ktx_buffer + KQSW_HDR_SIZE);
                 
-                /* Optimised path: I send over the Elan vaddrs of the get
-                 * sink buffers, and my peer DMAs directly into them.
+                /* Optimised path: I send over the Elan vaddrs of the local
+                 * buffers, and my peer DMAs directly to/from them.
                  *
                  * First I set up ktx as if it was going to send this
                  * payload, (it needs to map it anyway).  This fills
                  * ktx_frags[1] and onward with the network addresses
                  * of the GET sink frags.  I copy these into ktx_buffer,
-                 * immediately after the header, and send that as my GET
-                 * message.
-                 *
-                 * Note that the addresses are sent in native endian-ness.
-                 * When EKC copes with different endian nodes, I'll fix
-                 * this (and eat my hat :) */
+                 * immediately after the header, and send that as my
+                 * message. */
 
-                ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1;
-                ktx->ktx_state = KTX_GETTING;
+                ktx->ktx_state = (type == PTL_MSG_PUT) ? KTX_PUTTING : KTX_GETTING;
 
                 if ((libmsg->md->options & PTL_MD_KIOV) != 0) 
                         rc = kqswnal_map_tx_kiov (ktx, 0, md->length,
@@ -1041,11 +1167,8 @@ kqswnal_sendmsg (nal_cb_t     *nal,
                 else
                         rc = kqswnal_map_tx_iov (ktx, 0, md->length,
                                                  md->md_niov, md->md_iov.iov);
-
-                if (rc < 0) {
-                        kqswnal_put_idle_tx (ktx);
-                        return (PTL_FAIL);
-                }
+                if (rc != 0)
+                        goto out;
 
                 rmd->kqrmd_nfrag = ktx->ktx_nfrag - 1;
 
@@ -1066,12 +1189,21 @@ kqswnal_sendmsg (nal_cb_t     *nal,
                 ktx->ktx_frags[0].Base = ktx->ktx_ebuffer;
                 ktx->ktx_frags[0].Len = KQSW_HDR_SIZE + payload_nob;
 #endif
+                if (type == PTL_MSG_GET) {
+                        /* Allocate reply message now while I'm in thread context */
+                        ktx->ktx_args[2] = lib_create_reply_msg (&kqswnal_lib,
+                                                                 nid, libmsg);
+                        if (ktx->ktx_args[2] == NULL)
+                                goto out;
+
+                        /* NB finalizing the REPLY message is my
+                         * responsibility now, whatever happens. */
+                }
+                
         } else if (payload_nob <= KQSW_TX_MAXCONTIG) {
 
                 /* small message: single frag copied into the pre-mapped buffer */
 
-                ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1;
-                ktx->ktx_state = KTX_SENDING;
 #if MULTIRAIL_EKC
                 ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer,
                               0, KQSW_HDR_SIZE + payload_nob);
@@ -1093,8 +1225,6 @@ kqswnal_sendmsg (nal_cb_t     *nal,
 
                 /* large message: multiple frags: first is hdr in pre-mapped buffer */
 
-                ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1;
-                ktx->ktx_state = KTX_SENDING;
 #if MULTIRAIL_EKC
                 ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer,
                               0, KQSW_HDR_SIZE);
@@ -1108,29 +1238,44 @@ kqswnal_sendmsg (nal_cb_t     *nal,
                 else
                         rc = kqswnal_map_tx_iov (ktx, payload_offset, payload_nob,
                                                  payload_niov, payload_iov);
-                if (rc != 0) {
-                        kqswnal_put_idle_tx (ktx);
-                        return (PTL_FAIL);
-                }
+                if (rc != 0)
+                        goto out;
         }
         
         ktx->ktx_port = (payload_nob <= KQSW_SMALLPAYLOAD) ?
                         EP_MSG_SVC_PORTALS_SMALL : EP_MSG_SVC_PORTALS_LARGE;
 
         rc = kqswnal_launch (ktx);
-        if (rc != 0) {                    /* failed? */
-                CERROR ("Failed to send packet to "LPX64": %d\n", targetnid, rc);
+
+ out:
+        CDEBUG(rc == 0 ? D_NET : D_ERROR, 
+               "%s "LPSZ" bytes to "LPX64" via "LPX64": rc %d\n", 
+               rc == 0 ? "Sent" : "Failed to send",
+               payload_nob, nid, targetnid, rc);
+
+        if (rc != 0) {
+                if (ktx->ktx_state == KTX_GETTING &&
+                    ktx->ktx_args[2] != NULL) {
+                        /* We committed to reply, but there was a problem
+                         * launching the GET.  We can't avoid delivering a
+                         * REPLY event since we committed above, so we
+                         * pretend the GET succeeded but the REPLY
+                         * failed. */
+                        rc = 0;
+                        lib_finalize (&kqswnal_lib, private, libmsg, PTL_OK);
+                        lib_finalize (&kqswnal_lib, private,
+                                      (lib_msg_t *)ktx->ktx_args[2], PTL_FAIL);
+                }
+                
                 kqswnal_put_idle_tx (ktx);
-                return (PTL_FAIL);
         }
-
-        CDEBUG(D_NET, "sent "LPSZ" bytes to "LPX64" via "LPX64"\n", 
-               payload_nob, nid, targetnid);
-        return (PTL_OK);
+        
+        atomic_dec(&kqswnal_data.kqn_pending_txs);
+        return (rc == 0 ? PTL_OK : PTL_FAIL);
 }
 
 static ptl_err_t
-kqswnal_send (nal_cb_t     *nal,
+kqswnal_send (lib_nal_t    *nal,
               void         *private,
               lib_msg_t    *libmsg,
               ptl_hdr_t    *hdr,
@@ -1148,7 +1293,7 @@ kqswnal_send (nal_cb_t     *nal,
 }
 
 static ptl_err_t
-kqswnal_send_pages (nal_cb_t     *nal,
+kqswnal_send_pages (lib_nal_t    *nal,
                     void         *private,
                     lib_msg_t    *libmsg,
                     ptl_hdr_t    *hdr,
@@ -1187,18 +1332,17 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
         if (ktx == NULL)        /* can't get txd right now */
                 return;         /* fwd will be scheduled when tx desc freed */
 
-        if (nid == kqswnal_lib.ni.nid)          /* gateway is me */
+        if (nid == kqswnal_lib.libnal_ni.ni_pid.nid) /* gateway is me */
                 nid = fwd->kprfd_target_nid;    /* target is final dest */
 
         if (kqswnal_nid2elanid (nid) < 0) {
                 CERROR("Can't forward [%p] to "LPX64": not a peer\n", fwd, nid);
                 rc = -EHOSTUNREACH;
-                goto failed;
+                goto out;
         }
 
         /* copy hdr into pre-mapped buffer */
         memcpy(ktx->ktx_buffer, fwd->kprfd_hdr, sizeof(ptl_hdr_t));
-        ktx->ktx_wire_hdr = (ptl_hdr_t *)ktx->ktx_buffer;
 
         ktx->ktx_port    = (nob <= KQSW_SMALLPAYLOAD) ?
                            EP_MSG_SVC_PORTALS_SMALL : EP_MSG_SVC_PORTALS_LARGE;
@@ -1233,20 +1377,19 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
 #endif
                 rc = kqswnal_map_tx_kiov (ktx, 0, nob, niov, kiov);
                 if (rc != 0)
-                        goto failed;
+                        goto out;
         }
 
         rc = kqswnal_launch (ktx);
-        if (rc == 0)
-                return;
+ out:
+        if (rc != 0) {
+                CERROR ("Failed to forward [%p] to "LPX64": %d\n", fwd, nid, rc);
 
- failed:
-        LASSERT (rc != 0);
-        CERROR ("Failed to forward [%p] to "LPX64": %d\n", fwd, nid, rc);
+                /* complete now (with failure) */
+                kqswnal_tx_done (ktx, rc);
+        }
 
-        kqswnal_put_idle_tx (ktx);
-        /* complete now (with failure) */
-        kpr_fwd_done (&kqswnal_data.kqn_router, fwd, rc);
+        atomic_dec(&kqswnal_data.kqn_pending_txs);
 }
 
 void
@@ -1261,32 +1404,51 @@ kqswnal_fwd_callback (void *arg, int error)
                 ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_kiov[0].kiov_page);
 
                 CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n",
-                       NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid),error);
+                       le64_to_cpu(hdr->src_nid), le64_to_cpu(hdr->dest_nid),error);
         }
 
-        kqswnal_requeue_rx (krx);
+        LASSERT (atomic_read(&krx->krx_refcount) == 1);
+        kqswnal_rx_decref (krx);
 }
 
 void
-kqswnal_dma_reply_complete (EP_RXD *rxd) 
+kqswnal_requeue_rx (kqswnal_rx_t *krx)
 {
-        int           status = ep_rxd_status(rxd);
-        kqswnal_tx_t *ktx = (kqswnal_tx_t *)ep_rxd_arg(rxd);
-        kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0];
-        lib_msg_t    *msg = (lib_msg_t *)ktx->ktx_args[1];
-        
-        CDEBUG((status == EP_SUCCESS) ? D_NET : D_ERROR,
-               "rxd %p, ktx %p, status %d\n", rxd, ktx, status);
+        LASSERT (atomic_read(&krx->krx_refcount) == 0);
+        LASSERT (!krx->krx_rpc_reply_needed);
 
-        LASSERT (krx->krx_rxd == rxd);
-        LASSERT (krx->krx_rpc_reply_needed);
+        krx->krx_state = KRX_POSTED;
 
-        krx->krx_rpc_reply_needed = 0;
-        kqswnal_rx_done (krx);
+#if MULTIRAIL_EKC
+        if (kqswnal_data.kqn_shuttingdown) {
+                /* free EKC rxd on shutdown */
+                ep_complete_receive(krx->krx_rxd);
+        } else {
+                /* repost receive */
+                ep_requeue_receive(krx->krx_rxd, 
+                                   kqswnal_rxhandler, krx,
+                                   &krx->krx_elanbuffer, 0);
+        }
+#else                
+        if (kqswnal_data.kqn_shuttingdown)
+                return;
 
-        lib_finalize (&kqswnal_lib, NULL, msg,
-                      (status == EP_SUCCESS) ? PTL_OK : PTL_FAIL);
-        kqswnal_put_idle_tx (ktx);
+        if (krx->krx_rxd == NULL) {
+                /* We had a failed ep_complete_rpc() which nukes the
+                 * descriptor in "old" EKC */
+                int eprc = ep_queue_receive(krx->krx_eprx, 
+                                            kqswnal_rxhandler, krx,
+                                            krx->krx_elanbuffer, 
+                                            krx->krx_npages * PAGE_SIZE, 0);
+                LASSERT (eprc == EP_SUCCESS);
+                /* We don't handle failure here; it's incredibly rare
+                 * (never reported?) and only happens with "old" EKC */
+        } else {
+                ep_requeue_receive(krx->krx_rxd, kqswnal_rxhandler, krx,
+                                   krx->krx_elanbuffer, 
+                                   krx->krx_npages * PAGE_SIZE);
+        }
+#endif
 }
 
 void
@@ -1306,97 +1468,74 @@ kqswnal_rpc_complete (EP_RXD *rxd)
 }
 
 void
-kqswnal_requeue_rx (kqswnal_rx_t *krx) 
+kqswnal_rx_done (kqswnal_rx_t *krx) 
 {
-        int   rc;
+        int           rc;
+        EP_STATUSBLK *sblk;
 
         LASSERT (atomic_read(&krx->krx_refcount) == 0);
 
         if (krx->krx_rpc_reply_needed) {
+                /* We've not completed the peer's RPC yet... */
+                sblk = (krx->krx_rpc_reply_status == 0) ? 
+                       &kqswnal_data.kqn_rpc_success : 
+                       &kqswnal_data.kqn_rpc_failed;
 
-                /* We failed to complete the peer's optimized GET (e.g. we
-                 * couldn't map the source buffers).  We complete the
-                 * peer's EKC rpc now with failure. */
+                LASSERT (!in_interrupt());
 #if MULTIRAIL_EKC
-                rc = ep_complete_rpc(krx->krx_rxd, kqswnal_rpc_complete, krx,
-                                     &kqswnal_rpc_failed, NULL, NULL, 0);
+                rc = ep_complete_rpc(krx->krx_rxd, 
+                                     kqswnal_rpc_complete, krx,
+                                     sblk, NULL, NULL, 0);
                 if (rc == EP_SUCCESS)
                         return;
-                
-                CERROR("can't complete RPC: %d\n", rc);
 #else
-                if (krx->krx_rxd != NULL) {
-                        /* We didn't try (and fail) to complete earlier... */
-                        rc = ep_complete_rpc(krx->krx_rxd, 
-                                             kqswnal_rpc_complete, krx,
-                                             &kqswnal_rpc_failed, NULL, 0);
-                        if (rc == EP_SUCCESS)
-                                return;
-
-                        CERROR("can't complete RPC: %d\n", rc);
-                }
-                
-                /* NB the old ep_complete_rpc() frees rxd on failure, so we
-                 * have to requeue from scratch here, unless we're shutting
-                 * down */
-                if (kqswnal_data.kqn_shuttingdown)
+                rc = ep_complete_rpc(krx->krx_rxd, 
+                                     kqswnal_rpc_complete, krx,
+                                     sblk, NULL, 0);
+                if (rc == EP_SUCCESS)
                         return;
 
-                rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx,
-                                      krx->krx_elanbuffer, 
-                                      krx->krx_npages * PAGE_SIZE, 0);
-                LASSERT (rc == EP_SUCCESS);
-                /* We don't handle failure here; it's incredibly rare
-                 * (never reported?) and only happens with "old" EKC */
-                return;
+                /* "old" EKC destroys rxd on failed completion */
+                krx->krx_rxd = NULL;
 #endif
+                CERROR("can't complete RPC: %d\n", rc);
+                krx->krx_rpc_reply_needed = 0;
         }
 
-#if MULTIRAIL_EKC
-        if (kqswnal_data.kqn_shuttingdown) {
-                /* free EKC rxd on shutdown */
-                ep_complete_receive(krx->krx_rxd);
-        } else {
-                /* repost receive */
-                ep_requeue_receive(krx->krx_rxd, kqswnal_rxhandler, krx,
-                                   &krx->krx_elanbuffer, 0);
-        }
-#else                
-        /* don't actually requeue on shutdown */
-        if (!kqswnal_data.kqn_shuttingdown) 
-                ep_requeue_receive(krx->krx_rxd, kqswnal_rxhandler, krx,
-                                   krx->krx_elanbuffer, krx->krx_npages * PAGE_SIZE);
-#endif
+        kqswnal_requeue_rx(krx);
 }
         
 void
-kqswnal_rx (kqswnal_rx_t *krx)
+kqswnal_parse (kqswnal_rx_t *krx)
 {
         ptl_hdr_t      *hdr = (ptl_hdr_t *) page_address(krx->krx_kiov[0].kiov_page);
-        ptl_nid_t       dest_nid = NTOH__u64 (hdr->dest_nid);
+        ptl_nid_t       dest_nid = le64_to_cpu(hdr->dest_nid);
         int             payload_nob;
         int             nob;
         int             niov;
 
-        LASSERT (atomic_read(&krx->krx_refcount) == 0);
+        LASSERT (atomic_read(&krx->krx_refcount) == 1);
+
+        if (dest_nid == kqswnal_lib.libnal_ni.ni_pid.nid) { /* It's for me :) */
+                /* I ignore parse errors since I'm not consuming a byte
+                 * stream */
+                (void)lib_parse (&kqswnal_lib, hdr, krx);
 
-        if (dest_nid == kqswnal_lib.ni.nid) { /* It's for me :) */
-                atomic_set(&krx->krx_refcount, 1);
-                lib_parse (&kqswnal_lib, hdr, krx);
-                kqswnal_rx_done(krx);
+                /* Drop my ref; any RDMA activity takes an additional ref */
+                kqswnal_rx_decref(krx);
                 return;
         }
 
 #if KQSW_CHECKSUM
-        CERROR ("checksums for forwarded packets not implemented\n");
-        LBUG ();
+        LASSERTF (0, "checksums for forwarded packets not implemented\n");
 #endif
+
         if (kqswnal_nid2elanid (dest_nid) >= 0)  /* should have gone direct to peer */
         {
                 CERROR("dropping packet from "LPX64" for "LPX64
-                       ": target is peer\n", NTOH__u64(hdr->src_nid), dest_nid);
+                       ": target is peer\n", le64_to_cpu(hdr->src_nid), dest_nid);
 
-                kqswnal_requeue_rx (krx);
+                kqswnal_rx_decref (krx);
                 return;
         }
 
@@ -1438,7 +1577,9 @@ kqswnal_rxhandler(EP_RXD *rxd)
                rxd, krx, nob, status);
 
         LASSERT (krx != NULL);
-
+        LASSERT (krx->krx_state = KRX_POSTED);
+        
+        krx->krx_state = KRX_PARSE;
         krx->krx_rxd = rxd;
         krx->krx_nob = nob;
 #if MULTIRAIL_EKC
@@ -1446,7 +1587,10 @@ kqswnal_rxhandler(EP_RXD *rxd)
 #else
         krx->krx_rpc_reply_needed = ep_rxd_isrpc(rxd);
 #endif
-        
+        /* Default to failure if an RPC reply is requested but not handled */
+        krx->krx_rpc_reply_status = -EPROTO;
+        atomic_set (&krx->krx_refcount, 1);
+
         /* must receive a whole header to be able to parse */
         if (status != EP_SUCCESS || nob < sizeof (ptl_hdr_t))
         {
@@ -1462,12 +1606,12 @@ kqswnal_rxhandler(EP_RXD *rxd)
                         CERROR("receive status failed with status %d nob %d\n",
                                ep_rxd_status(rxd), nob);
 #endif
-                kqswnal_requeue_rx (krx);
+                kqswnal_rx_decref(krx);
                 return;
         }
 
         if (!in_interrupt()) {
-                kqswnal_rx (krx);
+                kqswnal_parse(krx);
                 return;
         }
 
@@ -1488,30 +1632,30 @@ kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr)
         CERROR ("%s checksum mismatch %p: dnid "LPX64", snid "LPX64
                 ", dpid %d, spid %d, type %d\n",
                 ishdr ? "Header" : "Payload", krx,
-                NTOH__u64(hdr->dest_nid), NTOH__u64(hdr->src_nid)
-                NTOH__u32(hdr->dest_pid), NTOH__u32(hdr->src_pid),
-                NTOH__u32(hdr->type));
+                le64_to_cpu(hdr->dest_nid), le64_to_cpu(hdr->src_nid)
+                le32_to_cpu(hdr->dest_pid), le32_to_cpu(hdr->src_pid),
+                le32_to_cpu(hdr->type));
 
-        switch (NTOH__u32 (hdr->type))
+        switch (le32_to_cpu(hdr->type))
         {
         case PTL_MSG_ACK:
                 CERROR("ACK: mlen %d dmd "LPX64"."LPX64" match "LPX64
                        " len %u\n",
-                       NTOH__u32(hdr->msg.ack.mlength),
+                       le32_to_cpu(hdr->msg.ack.mlength),
                        hdr->msg.ack.dst_wmd.handle_cookie,
                        hdr->msg.ack.dst_wmd.handle_idx,
-                       NTOH__u64(hdr->msg.ack.match_bits),
-                       NTOH__u32(hdr->msg.ack.length));
+                       le64_to_cpu(hdr->msg.ack.match_bits),
+                       le32_to_cpu(hdr->msg.ack.length));
                 break;
         case PTL_MSG_PUT:
                 CERROR("PUT: ptl %d amd "LPX64"."LPX64" match "LPX64
                        " len %u off %u data "LPX64"\n",
-                       NTOH__u32(hdr->msg.put.ptl_index),
+                       le32_to_cpu(hdr->msg.put.ptl_index),
                        hdr->msg.put.ack_wmd.handle_cookie,
                        hdr->msg.put.ack_wmd.handle_idx,
-                       NTOH__u64(hdr->msg.put.match_bits),
-                       NTOH__u32(hdr->msg.put.length),
-                       NTOH__u32(hdr->msg.put.offset),
+                       le64_to_cpu(hdr->msg.put.match_bits),
+                       le32_to_cpu(hdr->msg.put.length),
+                       le32_to_cpu(hdr->msg.put.offset),
                        hdr->msg.put.hdr_data);
                 break;
         case PTL_MSG_GET:
@@ -1527,7 +1671,7 @@ kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr)
 #endif
 
 static ptl_err_t
-kqswnal_recvmsg (nal_cb_t     *nal,
+kqswnal_recvmsg (lib_nal_t    *nal,
                  void         *private,
                  lib_msg_t    *libmsg,
                  unsigned int  niov,
@@ -1539,16 +1683,18 @@ kqswnal_recvmsg (nal_cb_t     *nal,
 {
         kqswnal_rx_t *krx = (kqswnal_rx_t *)private;
         char         *buffer = page_address(krx->krx_kiov[0].kiov_page);
+        ptl_hdr_t    *hdr = (ptl_hdr_t *)buffer;
         int           page;
         char         *page_ptr;
         int           page_nob;
         char         *iov_ptr;
         int           iov_nob;
         int           frag;
+        int           rc;
 #if KQSW_CHECKSUM
         kqsw_csum_t   senders_csum;
         kqsw_csum_t   payload_csum = 0;
-        kqsw_csum_t   hdr_csum = kqsw_csum(0, buffer, sizeof(ptl_hdr_t));
+        kqsw_csum_t   hdr_csum = kqsw_csum(0, hdr, sizeof(*hdr));
         size_t        csum_len = mlen;
         int           csum_frags = 0;
         int           csum_nob = 0;
@@ -1561,8 +1707,18 @@ kqswnal_recvmsg (nal_cb_t     *nal,
         if (senders_csum != hdr_csum)
                 kqswnal_csum_error (krx, 1);
 #endif
+        /* NB lib_parse() has already flipped *hdr */
+
         CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", mlen, rlen);
 
+        if (krx->krx_rpc_reply_needed &&
+            hdr->type == PTL_MSG_PUT) {
+                /* This must be an optimized PUT */
+                rc = kqswnal_rdma (krx, libmsg, PTL_MSG_PUT,
+                                   niov, iov, kiov, offset, mlen);
+                return (rc == 0 ? PTL_OK : PTL_FAIL);
+        }
+
         /* What was actually received must be >= payload. */
         LASSERT (mlen <= rlen);
         if (krx->krx_nob < KQSW_HDR_SIZE + mlen) {
@@ -1678,7 +1834,7 @@ kqswnal_recvmsg (nal_cb_t     *nal,
 }
 
 static ptl_err_t
-kqswnal_recv(nal_cb_t     *nal,
+kqswnal_recv(lib_nal_t    *nal,
              void         *private,
              lib_msg_t    *libmsg,
              unsigned int  niov,
@@ -1693,7 +1849,7 @@ kqswnal_recv(nal_cb_t     *nal,
 }
 
 static ptl_err_t
-kqswnal_recv_pages (nal_cb_t     *nal,
+kqswnal_recv_pages (lib_nal_t    *nal,
                     void         *private,
                     lib_msg_t    *libmsg,
                     unsigned int  niov,
@@ -1716,7 +1872,6 @@ kqswnal_thread_start (int (*fn)(void *arg), void *arg)
                 return ((int)pid);
 
         atomic_inc (&kqswnal_data.kqn_nthreads);
-        atomic_inc (&kqswnal_data.kqn_nthreads_running);
         return (0);
 }
 
@@ -1735,7 +1890,6 @@ kqswnal_scheduler (void *arg)
         unsigned long    flags;
         int              rc;
         int              counter = 0;
-        int              shuttingdown = 0;
         int              did_something;
 
         kportal_daemonize ("kqswnal_sched");
@@ -1745,18 +1899,6 @@ kqswnal_scheduler (void *arg)
 
         for (;;)
         {
-                if (kqswnal_data.kqn_shuttingdown != shuttingdown) {
-
-                        if (kqswnal_data.kqn_shuttingdown == 2)
-                                break;
-                
-                        /* During stage 1 of shutdown we are still responsive
-                         * to receives */
-
-                        atomic_dec (&kqswnal_data.kqn_nthreads_running);
-                        shuttingdown = kqswnal_data.kqn_shuttingdown;
-                }
-
                 did_something = 0;
 
                 if (!list_empty (&kqswnal_data.kqn_readyrxds))
@@ -1767,14 +1909,24 @@ kqswnal_scheduler (void *arg)
                         spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
                                                flags);
 
-                        kqswnal_rx (krx);
+                        switch (krx->krx_state) {
+                        case KRX_PARSE:
+                                kqswnal_parse (krx);
+                                break;
+                        case KRX_COMPLETING:
+                                /* Drop last ref to reply to RPC and requeue */
+                                LASSERT (krx->krx_rpc_reply_needed);
+                                kqswnal_rx_decref (krx);
+                                break;
+                        default:
+                                LBUG();
+                        }
 
                         did_something = 1;
                         spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags);
                 }
 
-                if (!shuttingdown &&
-                    !list_empty (&kqswnal_data.kqn_delayedtxds))
+                if (!list_empty (&kqswnal_data.kqn_delayedtxds))
                 {
                         ktx = list_entry(kqswnal_data.kqn_delayedtxds.next,
                                          kqswnal_tx_t, ktx_list);
@@ -1783,31 +1935,31 @@ kqswnal_scheduler (void *arg)
                                                flags);
 
                         rc = kqswnal_launch (ktx);
-                        if (rc != 0)          /* failed: ktx_nid down? */
-                        {
+                        if (rc != 0) {
                                 CERROR("Failed delayed transmit to "LPX64
                                        ": %d\n", ktx->ktx_nid, rc);
                                 kqswnal_tx_done (ktx, rc);
                         }
+                        atomic_dec (&kqswnal_data.kqn_pending_txs);
 
                         did_something = 1;
                         spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
                 }
 
-                if (!shuttingdown &
-                    !list_empty (&kqswnal_data.kqn_delayedfwds))
+                if (!list_empty (&kqswnal_data.kqn_delayedfwds))
                 {
                         fwd = list_entry (kqswnal_data.kqn_delayedfwds.next, kpr_fwd_desc_t, kprfd_list);
                         list_del (&fwd->kprfd_list);
                         spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
 
+                        /* If we're shutting down, this will just requeue fwd on kqn_idletxd_fwdq */
                         kqswnal_fwd_packet (NULL, fwd);
 
                         did_something = 1;
                         spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
                 }
 
-                    /* nothing to do or hogging CPU */
+                /* nothing to do or hogging CPU */
                 if (!did_something || counter++ == KQSW_RESCHED) {
                         spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
                                                flags);
@@ -1815,8 +1967,13 @@ kqswnal_scheduler (void *arg)
                         counter = 0;
 
                         if (!did_something) {
+                                if (kqswnal_data.kqn_shuttingdown == 2) {
+                                        /* We only exit in stage 2 of shutdown when 
+                                         * there's nothing left to do */
+                                        break;
+                                }
                                 rc = wait_event_interruptible (kqswnal_data.kqn_sched_waitq,
-                                                               kqswnal_data.kqn_shuttingdown != shuttingdown ||
+                                                               kqswnal_data.kqn_shuttingdown == 2 ||
                                                                !list_empty(&kqswnal_data.kqn_readyrxds) ||
                                                                !list_empty(&kqswnal_data.kqn_delayedtxds) ||
                                                                !list_empty(&kqswnal_data.kqn_delayedfwds));
@@ -1828,25 +1985,16 @@ kqswnal_scheduler (void *arg)
                 }
         }
 
-        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
-
         kqswnal_thread_fini ();
         return (0);
 }
 
-nal_cb_t kqswnal_lib =
+lib_nal_t kqswnal_lib =
 {
-        nal_data:       &kqswnal_data,         /* NAL private data */
-        cb_send:        kqswnal_send,
-        cb_send_pages:  kqswnal_send_pages,
-        cb_recv:        kqswnal_recv,
-        cb_recv_pages:  kqswnal_recv_pages,
-        cb_read:        kqswnal_read,
-        cb_write:       kqswnal_write,
-        cb_malloc:      kqswnal_malloc,
-        cb_free:        kqswnal_free,
-        cb_printf:      kqswnal_printf,
-        cb_cli:         kqswnal_cli,
-        cb_sti:         kqswnal_sti,
-        cb_dist:        kqswnal_dist
+        libnal_data:       &kqswnal_data,         /* NAL private data */
+        libnal_send:        kqswnal_send,
+        libnal_send_pages:  kqswnal_send_pages,
+        libnal_recv:        kqswnal_recv,
+        libnal_recv_pages:  kqswnal_recv_pages,
+        libnal_dist:        kqswnal_dist
 };
index bbe19cf..7642770 100644 (file)
 
 #include "socknal.h"
 
+nal_t                   ksocknal_api;
+ksock_nal_data_t        ksocknal_data;
 ptl_handle_ni_t         ksocknal_ni;
-static nal_t            ksocknal_api;
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-ksock_nal_data_t ksocknal_data;
-#else
-static ksock_nal_data_t ksocknal_data;
-#endif
+ksock_tunables_t        ksocknal_tunables;
 
 kpr_nal_interface_t ksocknal_router_interface = {
         kprni_nalid:      SOCKNAL,
@@ -40,31 +37,58 @@ kpr_nal_interface_t ksocknal_router_interface = {
         kprni_notify:     ksocknal_notify,
 };
 
+#ifdef CONFIG_SYSCTL
 #define SOCKNAL_SYSCTL 200
 
-#define SOCKNAL_SYSCTL_TIMEOUT     1
-#define SOCKNAL_SYSCTL_EAGER_ACK   2
-#define SOCKNAL_SYSCTL_ZERO_COPY   3
-#define SOCKNAL_SYSCTL_TYPED       4
-#define SOCKNAL_SYSCTL_MIN_BULK    5
+#define SOCKNAL_SYSCTL_TIMEOUT          1
+#define SOCKNAL_SYSCTL_EAGER_ACK        2
+#define SOCKNAL_SYSCTL_ZERO_COPY        3
+#define SOCKNAL_SYSCTL_TYPED            4
+#define SOCKNAL_SYSCTL_MIN_BULK         5
+#define SOCKNAL_SYSCTL_BUFFER_SIZE      6
+#define SOCKNAL_SYSCTL_NAGLE            7
+#define SOCKNAL_SYSCTL_IRQ_AFFINITY     8
+#define SOCKNAL_SYSCTL_KEEPALIVE_IDLE   9
+#define SOCKNAL_SYSCTL_KEEPALIVE_COUNT 10
+#define SOCKNAL_SYSCTL_KEEPALIVE_INTVL 11
 
 static ctl_table ksocknal_ctl_table[] = {
         {SOCKNAL_SYSCTL_TIMEOUT, "timeout", 
-         &ksocknal_data.ksnd_io_timeout, sizeof (int),
+         &ksocknal_tunables.ksnd_io_timeout, sizeof (int),
          0644, NULL, &proc_dointvec},
         {SOCKNAL_SYSCTL_EAGER_ACK, "eager_ack", 
-         &ksocknal_data.ksnd_eager_ack, sizeof (int),
+         &ksocknal_tunables.ksnd_eager_ack, sizeof (int),
          0644, NULL, &proc_dointvec},
 #if SOCKNAL_ZC
         {SOCKNAL_SYSCTL_ZERO_COPY, "zero_copy", 
-         &ksocknal_data.ksnd_zc_min_frag, sizeof (int),
+         &ksocknal_tunables.ksnd_zc_min_frag, sizeof (int),
          0644, NULL, &proc_dointvec},
 #endif
         {SOCKNAL_SYSCTL_TYPED, "typed", 
-         &ksocknal_data.ksnd_typed_conns, sizeof (int),
+         &ksocknal_tunables.ksnd_typed_conns, sizeof (int),
          0644, NULL, &proc_dointvec},
         {SOCKNAL_SYSCTL_MIN_BULK, "min_bulk", 
-         &ksocknal_data.ksnd_min_bulk, sizeof (int),
+         &ksocknal_tunables.ksnd_min_bulk, sizeof (int),
+         0644, NULL, &proc_dointvec},
+        {SOCKNAL_SYSCTL_BUFFER_SIZE, "buffer_size",
+         &ksocknal_tunables.ksnd_buffer_size, sizeof(int),
+         0644, NULL, &proc_dointvec},
+        {SOCKNAL_SYSCTL_NAGLE, "nagle",
+         &ksocknal_tunables.ksnd_nagle, sizeof(int),
+         0644, NULL, &proc_dointvec},
+#if CPU_AFFINITY
+        {SOCKNAL_SYSCTL_IRQ_AFFINITY, "irq_affinity",
+         &ksocknal_tunables.ksnd_irq_affinity, sizeof(int),
+         0644, NULL, &proc_dointvec},
+#endif
+        {SOCKNAL_SYSCTL_KEEPALIVE_IDLE, "keepalive_idle",
+         &ksocknal_tunables.ksnd_keepalive_idle, sizeof(int),
+         0644, NULL, &proc_dointvec},
+        {SOCKNAL_SYSCTL_KEEPALIVE_COUNT, "keepalive_count",
+         &ksocknal_tunables.ksnd_keepalive_count, sizeof(int),
+         0644, NULL, &proc_dointvec},
+        {SOCKNAL_SYSCTL_KEEPALIVE_INTVL, "keepalive_intvl",
+         &ksocknal_tunables.ksnd_keepalive_intvl, sizeof(int),
          0644, NULL, &proc_dointvec},
         { 0 }
 };
@@ -73,73 +97,12 @@ static ctl_table ksocknal_top_ctl_table[] = {
         {SOCKNAL_SYSCTL, "socknal", NULL, 0, 0555, ksocknal_ctl_table},
         { 0 }
 };
-
-int
-ksocknal_api_forward(nal_t *nal, int id, void *args, size_t args_len,
-                       void *ret, size_t ret_len)
-{
-        ksock_nal_data_t *k;
-        nal_cb_t *nal_cb;
-
-        k = nal->nal_data;
-        nal_cb = k->ksnd_nal_cb;
-
-        lib_dispatch(nal_cb, k, id, args, ret); /* ksocknal_send needs k */
-        return PTL_OK;
-}
-
-int
-ksocknal_api_shutdown(nal_t *nal, int ni)
-{
-        return PTL_OK;
-}
-
-void
-ksocknal_api_yield(nal_t *nal)
-{
-        our_cond_resched();
-        return;
-}
-
-void
-ksocknal_api_lock(nal_t *nal, unsigned long *flags)
-{
-        ksock_nal_data_t *k;
-        nal_cb_t *nal_cb;
-
-        k = nal->nal_data;
-        nal_cb = k->ksnd_nal_cb;
-        nal_cb->cb_cli(nal_cb,flags);
-}
-
-void
-ksocknal_api_unlock(nal_t *nal, unsigned long *flags)
-{
-        ksock_nal_data_t *k;
-        nal_cb_t *nal_cb;
-
-        k = nal->nal_data;
-        nal_cb = k->ksnd_nal_cb;
-        nal_cb->cb_sti(nal_cb,flags);
-}
-
-nal_t *
-ksocknal_init(int interface, ptl_pt_index_t ptl_size,
-              ptl_ac_index_t ac_size, ptl_pid_t requested_pid)
-{
-        CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n", (ptl_nid_t)0);
-        lib_init(&ksocknal_lib, (ptl_nid_t)0, 0, 10, ptl_size, ac_size);
-        return (&ksocknal_api);
-}
-
-/*
- *  EXTRA functions follow
- */
+#endif
 
 int
 ksocknal_set_mynid(ptl_nid_t nid)
 {
-        lib_ni_t *ni = &ksocknal_lib.ni;
+        lib_ni_t *ni = &ksocknal_lib.libnal_ni;
 
         /* FIXME: we have to do this because we call lib_init() at module
          * insertion time, which is before we have 'mynid' available.  lib_init
@@ -148,9 +111,9 @@ ksocknal_set_mynid(ptl_nid_t nid)
          * problem. */
 
         CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
-               nid, ni->nid);
+               nid, ni->ni_pid.nid);
 
-        ni->nid = nid;
+        ni->ni_pid.nid = nid;
         return (0);
 }
 
@@ -202,9 +165,25 @@ ksocknal_bind_irq (unsigned int irq)
 #endif
 }
 
+ksock_interface_t *
+ksocknal_ip2iface(__u32 ip)
+{
+        int                i;
+        ksock_interface_t *iface;
+
+        for (i = 0; i < ksocknal_data.ksnd_ninterfaces; i++) {
+                LASSERT(i < SOCKNAL_MAX_INTERFACES);
+                iface = &ksocknal_data.ksnd_interfaces[i];
+                
+                if (iface->ksni_ipaddr == ip)
+                        return (iface);
+        }
+        
+        return (NULL);
+}
+
 ksock_route_t *
-ksocknal_create_route (__u32 ipaddr, int port, int buffer_size,
-                       int irq_affinity, int eager)
+ksocknal_create_route (__u32 ipaddr, int port)
 {
         ksock_route_t *route;
 
@@ -213,19 +192,16 @@ ksocknal_create_route (__u32 ipaddr, int port, int buffer_size,
                 return (NULL);
 
         atomic_set (&route->ksnr_refcount, 1);
-        route->ksnr_sharecount = 0;
         route->ksnr_peer = NULL;
         route->ksnr_timeout = jiffies;
         route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL;
         route->ksnr_ipaddr = ipaddr;
         route->ksnr_port = port;
-        route->ksnr_buffer_size = buffer_size;
-        route->ksnr_irq_affinity = irq_affinity;
-        route->ksnr_eager = eager;
         route->ksnr_connecting = 0;
         route->ksnr_connected = 0;
         route->ksnr_deleted = 0;
         route->ksnr_conn_count = 0;
+        route->ksnr_share_count = 0;
 
         return (route);
 }
@@ -233,8 +209,6 @@ ksocknal_create_route (__u32 ipaddr, int port, int buffer_size,
 void
 ksocknal_destroy_route (ksock_route_t *route)
 {
-        LASSERT (route->ksnr_sharecount == 0);
-
         if (route->ksnr_peer != NULL)
                 ksocknal_put_peer (route->ksnr_peer);
 
@@ -265,7 +239,7 @@ ksocknal_create_peer (ptl_nid_t nid)
         if (peer == NULL)
                 return (NULL);
 
-        memset (peer, 0, sizeof (*peer));
+        memset (peer, 0, sizeof (*peer));       /* NULL pointers/clear flags etc */
 
         peer->ksnp_nid = nid;
         atomic_set (&peer->ksnp_refcount, 1);   /* 1 ref for caller */
@@ -323,8 +297,6 @@ ksocknal_find_peer_locked (ptl_nid_t nid)
                 peer = list_entry (tmp, ksock_peer_t, ksnp_list);
 
                 LASSERT (!peer->ksnp_closing);
-                LASSERT (!(list_empty (&peer->ksnp_routes) &&
-                           list_empty (&peer->ksnp_conns)));
 
                 if (peer->ksnp_nid != nid)
                         continue;
@@ -353,6 +325,18 @@ ksocknal_get_peer (ptl_nid_t nid)
 void
 ksocknal_unlink_peer_locked (ksock_peer_t *peer)
 {
+        int                i;
+        __u32              ip;
+
+        for (i = 0; i < peer->ksnp_n_passive_ips; i++) {
+                LASSERT (i < SOCKNAL_MAX_INTERFACES);
+                ip = peer->ksnp_passive_ips[i];
+
+                ksocknal_ip2iface(ip)->ksni_npeers--;
+        }
+
+        LASSERT (list_empty(&peer->ksnp_conns));
+        LASSERT (list_empty(&peer->ksnp_routes));
         LASSERT (!peer->ksnp_closing);
         peer->ksnp_closing = 1;
         list_del (&peer->ksnp_list);
@@ -360,49 +344,210 @@ ksocknal_unlink_peer_locked (ksock_peer_t *peer)
         ksocknal_put_peer (peer);
 }
 
-ksock_route_t *
-ksocknal_get_route_by_idx (int index)
+int
+ksocknal_get_peer_info (int index, ptl_nid_t *nid,
+                        __u32 *myip, __u32 *peer_ip, int *port, 
+                        int *conn_count, int *share_count)
 {
         ksock_peer_t      *peer;
         struct list_head  *ptmp;
         ksock_route_t     *route;
         struct list_head  *rtmp;
         int                i;
+        int                j;
+        int                rc = -ENOENT;
 
         read_lock (&ksocknal_data.ksnd_global_lock);
 
         for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+                
                 list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) {
                         peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
 
-                        LASSERT (!(list_empty (&peer->ksnp_routes) &&
-                                   list_empty (&peer->ksnp_conns)));
+                        if (peer->ksnp_n_passive_ips == 0 &&
+                            list_empty(&peer->ksnp_routes)) {
+                                if (index-- > 0)
+                                        continue;
+                                
+                                *nid = peer->ksnp_nid;
+                                *myip = 0;
+                                *peer_ip = 0;
+                                *port = 0;
+                                *conn_count = 0;
+                                *share_count = 0;
+                                rc = 0;
+                                goto out;
+                        }
 
+                        for (j = 0; j < peer->ksnp_n_passive_ips; j++) {
+                                if (index-- > 0)
+                                        continue;
+                                
+                                *nid = peer->ksnp_nid;
+                                *myip = peer->ksnp_passive_ips[j];
+                                *peer_ip = 0;
+                                *port = 0;
+                                *conn_count = 0;
+                                *share_count = 0;
+                                rc = 0;
+                                goto out;
+                        }
+                        
                         list_for_each (rtmp, &peer->ksnp_routes) {
                                 if (index-- > 0)
                                         continue;
 
-                                route = list_entry (rtmp, ksock_route_t, ksnr_list);
-                                atomic_inc (&route->ksnr_refcount);
-                                read_unlock (&ksocknal_data.ksnd_global_lock);
-                                return (route);
+                                route = list_entry(rtmp, ksock_route_t,
+                                                   ksnr_list);
+
+                                *nid = peer->ksnp_nid;
+                                *myip = route->ksnr_myipaddr;
+                                *peer_ip = route->ksnr_ipaddr;
+                                *port = route->ksnr_port;
+                                *conn_count = route->ksnr_conn_count;
+                                *share_count = route->ksnr_share_count;
+                                rc = 0;
+                                goto out;
                         }
                 }
         }
-
+ out:
         read_unlock (&ksocknal_data.ksnd_global_lock);
-        return (NULL);
+        return (rc);
+}
+
+void
+ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn)
+{
+        ksock_peer_t      *peer = route->ksnr_peer;
+        int                type = conn->ksnc_type;
+        ksock_interface_t *iface;
+
+        conn->ksnc_route = route;
+        atomic_inc (&route->ksnr_refcount);
+
+        if (route->ksnr_myipaddr != conn->ksnc_myipaddr) {
+                if (route->ksnr_myipaddr == 0) {
+                        /* route wasn't bound locally yet (the initial route) */
+                        CWARN("Binding "LPX64" %u.%u.%u.%u to %u.%u.%u.%u\n",
+                              peer->ksnp_nid, 
+                              HIPQUAD(route->ksnr_ipaddr),
+                              HIPQUAD(conn->ksnc_myipaddr));
+                } else {
+                        CWARN("Rebinding "LPX64" %u.%u.%u.%u from "
+                              "%u.%u.%u.%u to %u.%u.%u.%u\n",
+                              peer->ksnp_nid, 
+                              HIPQUAD(route->ksnr_ipaddr),
+                              HIPQUAD(route->ksnr_myipaddr),
+                              HIPQUAD(conn->ksnc_myipaddr));
+                        
+                        iface = ksocknal_ip2iface(route->ksnr_myipaddr);
+                        if (iface != NULL) 
+                                iface->ksni_nroutes--;
+                }
+                route->ksnr_myipaddr = conn->ksnc_myipaddr;
+                iface = ksocknal_ip2iface(route->ksnr_myipaddr);
+                if (iface != NULL) 
+                        iface->ksni_nroutes++;
+        }
+
+        route->ksnr_connected |= (1<<type);
+        route->ksnr_connecting &= ~(1<<type);
+        route->ksnr_conn_count++;
+
+        /* Successful connection => further attempts can
+         * proceed immediately */
+        route->ksnr_timeout = jiffies;
+        route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL;
+}
+
+void
+ksocknal_add_route_locked (ksock_peer_t *peer, ksock_route_t *route)
+{
+        struct list_head  *tmp;
+        ksock_conn_t      *conn;
+        int                type;
+        ksock_route_t     *route2;
+
+        LASSERT (route->ksnr_peer == NULL);
+        LASSERT (route->ksnr_connecting == 0);
+        LASSERT (route->ksnr_connected == 0);
+
+        /* LASSERT(unique) */
+        list_for_each(tmp, &peer->ksnp_routes) {
+                route2 = list_entry(tmp, ksock_route_t, ksnr_list);
+
+                if (route2->ksnr_ipaddr == route->ksnr_ipaddr) {
+                        CERROR ("Duplicate route "LPX64" %u.%u.%u.%u\n",
+                                peer->ksnp_nid, HIPQUAD(route->ksnr_ipaddr));
+                        LBUG();
+                }
+        }
+
+        route->ksnr_peer = peer;
+        atomic_inc (&peer->ksnp_refcount);
+        /* peer's routelist takes over my ref on 'route' */
+        list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
+        
+        list_for_each(tmp, &peer->ksnp_conns) {
+                conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+                type = conn->ksnc_type;
+
+                if (conn->ksnc_ipaddr != route->ksnr_ipaddr)
+                        continue;
+
+                ksocknal_associate_route_conn_locked(route, conn);
+                /* keep going (typed routes) */
+        }
+}
+
+void
+ksocknal_del_route_locked (ksock_route_t *route)
+{
+        ksock_peer_t      *peer = route->ksnr_peer;
+        ksock_interface_t *iface;
+        ksock_conn_t      *conn;
+        struct list_head  *ctmp;
+        struct list_head  *cnxt;
+
+        LASSERT (!route->ksnr_deleted);
+
+        /* Close associated conns */
+        list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) {
+                conn = list_entry(ctmp, ksock_conn_t, ksnc_list);
+
+                if (conn->ksnc_route != route)
+                        continue;
+                
+                ksocknal_close_conn_locked (conn, 0);
+        }
+
+        if (route->ksnr_myipaddr != 0) {
+                iface = ksocknal_ip2iface(route->ksnr_myipaddr);
+                if (iface != NULL)
+                        iface->ksni_nroutes--;
+        }
+
+        route->ksnr_deleted = 1;
+        list_del (&route->ksnr_list);
+        ksocknal_put_route (route);             /* drop peer's ref */
+
+        if (list_empty (&peer->ksnp_routes) &&
+            list_empty (&peer->ksnp_conns)) {
+                /* I've just removed the last autoconnect route of a peer
+                 * with no active connections */
+                ksocknal_unlink_peer_locked (peer);
+        }
 }
 
 int
-ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob,
-                    int bind_irq, int share, int eager)
+ksocknal_add_peer (ptl_nid_t nid, __u32 ipaddr, int port)
 {
         unsigned long      flags;
+        struct list_head  *tmp;
         ksock_peer_t      *peer;
         ksock_peer_t      *peer2;
         ksock_route_t     *route;
-        struct list_head  *rtmp;
         ksock_route_t     *route2;
         
         if (nid == PTL_NID_ANY)
@@ -413,8 +558,7 @@ ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob,
         if (peer == NULL)
                 return (-ENOMEM);
 
-        route = ksocknal_create_route (ipaddr, port, bufnob, 
-                                       bind_irq, eager);
+        route = ksocknal_create_route (ipaddr, port);
         if (route == NULL) {
                 ksocknal_put_peer (peer);
                 return (-ENOMEM);
@@ -427,36 +571,27 @@ ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob,
                 ksocknal_put_peer (peer);
                 peer = peer2;
         } else {
-                /* peer table takes existing ref on peer */
-                list_add (&peer->ksnp_list,
-                          ksocknal_nid2peerlist (nid));
+                /* peer table takes my ref on peer */
+                list_add_tail (&peer->ksnp_list,
+                               ksocknal_nid2peerlist (nid));
         }
 
         route2 = NULL;
-        if (share) {
-                /* check for existing route to this NID via this ipaddr */
-                list_for_each (rtmp, &peer->ksnp_routes) {
-                        route2 = list_entry (rtmp, ksock_route_t, ksnr_list);
-                        
-                        if (route2->ksnr_ipaddr == ipaddr)
-                                break;
-
-                        route2 = NULL;
-                }
+        list_for_each (tmp, &peer->ksnp_routes) {
+                route2 = list_entry(tmp, ksock_route_t, ksnr_list);
+                
+                if (route2->ksnr_ipaddr == ipaddr)
+                        break;
+                
+                route2 = NULL;
         }
-
-        if (route2 != NULL) {
-                ksocknal_put_route (route);
-                route = route2;
+        if (route2 == NULL) {
+                ksocknal_add_route_locked(peer, route);
+                route->ksnr_share_count++;
         } else {
-                /* route takes a ref on peer */
-                route->ksnr_peer = peer;
-                atomic_inc (&peer->ksnp_refcount);
-                /* peer's route list takes existing ref on route */
-                list_add_tail (&route->ksnr_list, &peer->ksnp_routes);
+                ksocknal_put_route(route);
+                route2->ksnr_share_count++;
         }
-        
-        route->ksnr_sharecount++;
 
         write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
 
@@ -464,59 +599,75 @@ ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob,
 }
 
 void
-ksocknal_del_route_locked (ksock_route_t *route, int share, int keep_conn)
+ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip, int single_share)
 {
-        ksock_peer_t     *peer = route->ksnr_peer;
         ksock_conn_t     *conn;
-        struct list_head *ctmp;
-        struct list_head *cnxt;
+        ksock_route_t    *route;
+        struct list_head *tmp;
+        struct list_head *nxt;
+        int               nshared;
 
-        if (!share)
-                route->ksnr_sharecount = 0;
-        else {
-                route->ksnr_sharecount--;
-                if (route->ksnr_sharecount != 0)
-                        return;
-        }
+        LASSERT (!peer->ksnp_closing);
 
-        list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) {
-                conn = list_entry(ctmp, ksock_conn_t, ksnc_list);
+        list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
+                route = list_entry(tmp, ksock_route_t, ksnr_list);
 
-                if (conn->ksnc_route != route)
+                if (single_share && route->ksnr_share_count == 0)
                         continue;
-                
-                if (!keep_conn) {
-                        ksocknal_close_conn_locked (conn, 0);
+
+                /* no match */
+                if (!(ip == 0 || route->ksnr_ipaddr == ip))
                         continue;
+
+                if (!single_share)
+                        route->ksnr_share_count = 0;
+                else if (route->ksnr_share_count > 0)
+                        route->ksnr_share_count--;
+
+                if (route->ksnr_share_count == 0) {
+                        /* This deletes associated conns too */
+                        ksocknal_del_route_locked (route);
                 }
                 
-                /* keeping the conn; just dissociate it and route... */
-                conn->ksnc_route = NULL;
-                ksocknal_put_route (route); /* drop conn's ref on route */
+                if (single_share)
+                        break;
         }
-        
-        route->ksnr_deleted = 1;
-        list_del (&route->ksnr_list);
-        ksocknal_put_route (route);             /* drop peer's ref */
 
-        if (list_empty (&peer->ksnp_routes) &&
-            list_empty (&peer->ksnp_conns)) {
-                /* I've just removed the last autoconnect route of a peer
-                 * with no active connections */
-                ksocknal_unlink_peer_locked (peer);
+        nshared = 0;
+        list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
+                route = list_entry(tmp, ksock_route_t, ksnr_list);
+                nshared += route->ksnr_share_count;
+        }
+                        
+        if (nshared == 0) {
+                /* remove everything else if there are no explicit entries
+                 * left */
+
+                list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
+                        route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+                        /* we should only be removing auto-entries */
+                        LASSERT(route->ksnr_share_count == 0);
+                        ksocknal_del_route_locked (route);
+                }
+
+                list_for_each_safe (tmp, nxt, &peer->ksnp_conns) {
+                        conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+                        ksocknal_close_conn_locked(conn, 0);
+                }
         }
+                
+        /* NB peer unlinks itself when last conn/route is removed */
 }
 
 int
-ksocknal_del_route (ptl_nid_t nid, __u32 ipaddr, int share, int keep_conn)
+ksocknal_del_peer (ptl_nid_t nid, __u32 ip, int single_share)
 {
         unsigned long      flags;
         struct list_head  *ptmp;
         struct list_head  *pnxt;
         ksock_peer_t      *peer;
-        struct list_head  *rtmp;
-        struct list_head  *rnxt;
-        ksock_route_t     *route;
         int                lo;
         int                hi;
         int                i;
@@ -538,22 +689,14 @@ ksocknal_del_route (ptl_nid_t nid, __u32 ipaddr, int share, int keep_conn)
                         if (!(nid == PTL_NID_ANY || peer->ksnp_nid == nid))
                                 continue;
 
-                        list_for_each_safe (rtmp, rnxt, &peer->ksnp_routes) {
-                                route = list_entry (rtmp, ksock_route_t,
-                                                    ksnr_list);
-
-                                if (!(ipaddr == 0 ||
-                                      route->ksnr_ipaddr == ipaddr))
-                                        continue;
+                        ksocknal_del_peer_locked (peer, ip, single_share);
+                        rc = 0;                 /* matched! */
 
-                                ksocknal_del_route_locked (route, share, keep_conn);
-                                rc = 0;         /* matched something */
-                                if (share)
-                                        goto out;
-                        }
+                        if (single_share)
+                                break;
                 }
         }
- out:
+
         write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
 
         return (rc);
@@ -574,8 +717,7 @@ ksocknal_get_conn_by_idx (int index)
                 list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) {
                         peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
 
-                        LASSERT (!(list_empty (&peer->ksnp_routes) &&
-                                   list_empty (&peer->ksnp_conns)));
+                        LASSERT (!peer->ksnp_closing);
 
                         list_for_each (ctmp, &peer->ksnp_conns) {
                                 if (index-- > 0)
@@ -593,8 +735,8 @@ ksocknal_get_conn_by_idx (int index)
         return (NULL);
 }
 
-void
-ksocknal_get_peer_addr (ksock_conn_t *conn)
+int
+ksocknal_get_conn_addrs (ksock_conn_t *conn)
 {
         struct sockaddr_in sin;
         int                len = sizeof (sin);
@@ -604,24 +746,37 @@ ksocknal_get_peer_addr (ksock_conn_t *conn)
                                             (struct sockaddr *)&sin, &len, 2);
         /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
         LASSERT (!conn->ksnc_closing);
-        LASSERT (len <= sizeof (sin));
 
         if (rc != 0) {
                 CERROR ("Error %d getting sock peer IP\n", rc);
-                return;
+                return rc;
         }
 
         conn->ksnc_ipaddr = ntohl (sin.sin_addr.s_addr);
         conn->ksnc_port   = ntohs (sin.sin_port);
+
+        rc = conn->ksnc_sock->ops->getname (conn->ksnc_sock,
+                                            (struct sockaddr *)&sin, &len, 0);
+        if (rc != 0) {
+                CERROR ("Error %d getting sock local IP\n", rc);
+                return rc;
+        }
+
+        conn->ksnc_myipaddr = ntohl (sin.sin_addr.s_addr);
+
+        return 0;
 }
 
 unsigned int
-ksocknal_conn_irq (ksock_conn_t *conn)
+ksocknal_sock_irq (struct socket *sock)
 {
         int                irq = 0;
         struct dst_entry  *dst;
 
-        dst = sk_dst_get (conn->ksnc_sock->sk);
+        if (!ksocknal_tunables.ksnd_irq_affinity)
+                return 0;
+
+        dst = sk_dst_get (sock->sk);
         if (dst != NULL) {
                 if (dst->dev != NULL) {
                         irq = dst->dev->irq;
@@ -633,8 +788,6 @@ ksocknal_conn_irq (ksock_conn_t *conn)
                 dst_release (dst);
         }
         
-        /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
-        LASSERT (!conn->ksnc_closing);
         return (irq);
 }
 
@@ -656,7 +809,7 @@ ksocknal_choose_scheduler_locked (unsigned int irq)
         /* software NIC (irq == 0) || not associated with a scheduler yet.
          * Choose the CPU with the fewest connections... */
         sched = &ksocknal_data.ksnd_schedulers[0];
-        for (i = 1; i < SOCKNAL_N_SCHED; i++)
+        for (i = 1; i < ksocknal_data.ksnd_nschedulers; i++)
                 if (sched->kss_nconns >
                     ksocknal_data.ksnd_schedulers[i].kss_nconns)
                         sched = &ksocknal_data.ksnd_schedulers[i];
@@ -665,22 +818,286 @@ ksocknal_choose_scheduler_locked (unsigned int irq)
                 info->ksni_valid = 1;
                 info->ksni_sched = sched - ksocknal_data.ksnd_schedulers;
 
-                /* no overflow... */
-                LASSERT (info->ksni_sched == sched - ksocknal_data.ksnd_schedulers);
+                /* no overflow... */
+                LASSERT (info->ksni_sched == sched - ksocknal_data.ksnd_schedulers);
+        }
+
+        return (sched);
+}
+
+int
+ksocknal_local_ipvec (__u32 *ipaddrs)
+{
+        int                i;
+        int                nip;
+
+        read_lock (&ksocknal_data.ksnd_global_lock);
+
+        nip = ksocknal_data.ksnd_ninterfaces;
+        for (i = 0; i < nip; i++) {
+                LASSERT (i < SOCKNAL_MAX_INTERFACES);
+
+                ipaddrs[i] = ksocknal_data.ksnd_interfaces[i].ksni_ipaddr;
+                LASSERT (ipaddrs[i] != 0);
+        }
+        
+        read_unlock (&ksocknal_data.ksnd_global_lock);
+        return (nip);
+}
+
+int
+ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips)
+{
+        int   best_netmatch = 0;
+        int   best_xor      = 0;
+        int   best          = -1;
+        int   this_xor;
+        int   this_netmatch;
+        int   i;
+        
+        for (i = 0; i < nips; i++) {
+                if (ips[i] == 0)
+                        continue;
+
+                this_xor = (ips[i] ^ iface->ksni_ipaddr);
+                this_netmatch = ((this_xor & iface->ksni_netmask) == 0) ? 1 : 0;
+                
+                if (!(best < 0 ||
+                      best_netmatch < this_netmatch ||
+                      (best_netmatch == this_netmatch && 
+                       best_xor > this_xor)))
+                        continue;
+                
+                best = i;
+                best_netmatch = this_netmatch;
+                best_xor = this_xor;
+        }
+        
+        LASSERT (best >= 0);
+        return (best);
+}
+
+int
+ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips)
+{
+        rwlock_t           *global_lock = &ksocknal_data.ksnd_global_lock;
+        unsigned long       flags;
+        ksock_interface_t  *iface;
+        ksock_interface_t  *best_iface;
+        int                 n_ips;
+        int                 i;
+        int                 j;
+        int                 k;
+        __u32               ip;
+        __u32               xor;
+        int                 this_netmatch;
+        int                 best_netmatch;
+        int                 best_npeers;
+
+        /* CAVEAT EMPTOR: We do all our interface matching with an
+         * exclusive hold of global lock at IRQ priority.  We're only
+         * expecting to be dealing with small numbers of interfaces, so the
+         * O(n**3)-ness shouldn't matter */
+
+        /* Also note that I'm not going to return more than n_peerips
+         * interfaces, even if I have more myself */
+        
+        write_lock_irqsave(global_lock, flags);
+
+        LASSERT (n_peerips <= SOCKNAL_MAX_INTERFACES);
+        LASSERT (ksocknal_data.ksnd_ninterfaces <= SOCKNAL_MAX_INTERFACES);
+
+        n_ips = MIN(n_peerips, ksocknal_data.ksnd_ninterfaces);
+
+        for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) {
+                /*              ^ yes really... */
+
+                /* If we have any new interfaces, first tick off all the
+                 * peer IPs that match old interfaces, then choose new
+                 * interfaces to match the remaining peer IPS. 
+                 * We don't forget interfaces we've stopped using; we might
+                 * start using them again... */
+                
+                if (i < peer->ksnp_n_passive_ips) {
+                        /* Old interface. */
+                        ip = peer->ksnp_passive_ips[i];
+                        best_iface = ksocknal_ip2iface(ip);
+
+                        /* peer passive ips are kept up to date */
+                        LASSERT(best_iface != NULL);
+                } else {
+                        /* choose a new interface */
+                        LASSERT (i == peer->ksnp_n_passive_ips);
+
+                        best_iface = NULL;
+                        best_netmatch = 0;
+                        best_npeers = 0;
+                        
+                        for (j = 0; j < ksocknal_data.ksnd_ninterfaces; j++) {
+                                iface = &ksocknal_data.ksnd_interfaces[j];
+                                ip = iface->ksni_ipaddr;
+
+                                for (k = 0; k < peer->ksnp_n_passive_ips; k++)
+                                        if (peer->ksnp_passive_ips[k] == ip)
+                                                break;
+                        
+                                if (k < peer->ksnp_n_passive_ips) /* using it already */
+                                        continue;
+
+                                k = ksocknal_match_peerip(iface, peerips, n_peerips);
+                                xor = (ip ^ peerips[k]);
+                                this_netmatch = ((xor & iface->ksni_netmask) == 0) ? 1 : 0;
+
+                                if (!(best_iface == NULL ||
+                                      best_netmatch < this_netmatch ||
+                                      (best_netmatch == this_netmatch &&
+                                       best_npeers > iface->ksni_npeers)))
+                                        continue;
+
+                                best_iface = iface;
+                                best_netmatch = this_netmatch;
+                                best_npeers = iface->ksni_npeers;
+                        }
+
+                        best_iface->ksni_npeers++;
+                        ip = best_iface->ksni_ipaddr;
+                        peer->ksnp_passive_ips[i] = ip;
+                        peer->ksnp_n_passive_ips = i+1;
+                }
+                
+                LASSERT (best_iface != NULL);
+
+                /* mark the best matching peer IP used */
+                j = ksocknal_match_peerip(best_iface, peerips, n_peerips);
+                peerips[j] = 0;
+        }
+        
+        /* Overwrite input peer IP addresses */
+        memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips));
+        
+        write_unlock_irqrestore(global_lock, flags);
+        
+        return (n_ips);
+}
+
+void
+ksocknal_create_routes(ksock_peer_t *peer, int port, 
+                       __u32 *peer_ipaddrs, int npeer_ipaddrs)
+{
+        ksock_route_t      *newroute = NULL;
+        rwlock_t           *global_lock = &ksocknal_data.ksnd_global_lock;
+        unsigned long       flags;
+        struct list_head   *rtmp;
+        ksock_route_t      *route;
+        ksock_interface_t  *iface;
+        ksock_interface_t  *best_iface;
+        int                 best_netmatch;
+        int                 this_netmatch;
+        int                 best_nroutes;
+        int                 i;
+        int                 j;
+
+        /* CAVEAT EMPTOR: We do all our interface matching with an
+         * exclusive hold of global lock at IRQ priority.  We're only
+         * expecting to be dealing with small numbers of interfaces, so the
+         * O(n**3)-ness here shouldn't matter */
+
+        write_lock_irqsave(global_lock, flags);
+
+        LASSERT (npeer_ipaddrs <= SOCKNAL_MAX_INTERFACES);
+        
+        for (i = 0; i < npeer_ipaddrs; i++) {
+                if (newroute != NULL) {
+                        newroute->ksnr_ipaddr = peer_ipaddrs[i];
+                } else {
+                        write_unlock_irqrestore(global_lock, flags);
+
+                        newroute = ksocknal_create_route(peer_ipaddrs[i], port);
+                        if (newroute == NULL)
+                                return;
+
+                        write_lock_irqsave(global_lock, flags);
+                }
+                
+                /* Already got a route? */
+                route = NULL;
+                list_for_each(rtmp, &peer->ksnp_routes) {
+                        route = list_entry(rtmp, ksock_route_t, ksnr_list);
+
+                        if (route->ksnr_ipaddr == newroute->ksnr_ipaddr)
+                                break;
+                        
+                        route = NULL;
+                }
+                if (route != NULL)
+                        continue;
+
+                best_iface = NULL;
+                best_nroutes = 0;
+                best_netmatch = 0;
+
+                LASSERT (ksocknal_data.ksnd_ninterfaces <= SOCKNAL_MAX_INTERFACES);
+
+                /* Select interface to connect from */
+                for (j = 0; j < ksocknal_data.ksnd_ninterfaces; j++) {
+                        iface = &ksocknal_data.ksnd_interfaces[j];
+
+                        /* Using this interface already? */
+                        list_for_each(rtmp, &peer->ksnp_routes) {
+                                route = list_entry(rtmp, ksock_route_t, ksnr_list);
+
+                                if (route->ksnr_myipaddr == iface->ksni_ipaddr)
+                                        break;
+
+                                route = NULL;
+                        }
+                        if (route != NULL)
+                                continue;
+
+                        this_netmatch = (((iface->ksni_ipaddr ^ 
+                                           newroute->ksnr_ipaddr) & 
+                                           iface->ksni_netmask) == 0) ? 1 : 0;
+                        
+                        if (!(best_iface == NULL ||
+                              best_netmatch < this_netmatch ||
+                              (best_netmatch == this_netmatch &&
+                               best_nroutes > iface->ksni_nroutes)))
+                                continue;
+                        
+                        best_iface = iface;
+                        best_netmatch = this_netmatch;
+                        best_nroutes = iface->ksni_nroutes;
+                }
+                
+                if (best_iface == NULL)
+                        continue;
+
+                newroute->ksnr_myipaddr = best_iface->ksni_ipaddr;
+                best_iface->ksni_nroutes++;
+
+                ksocknal_add_route_locked(peer, newroute);
+                newroute = NULL;
         }
-
-        return (sched);
+        
+        write_unlock_irqrestore(global_lock, flags);
+        if (newroute != NULL)
+                ksocknal_put_route(newroute);
 }
 
 int
-ksocknal_create_conn (ksock_route_t *route, struct socket *sock,
-                      int bind_irq, int type)
+ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
 {
+        int                passive = (type == SOCKNAL_CONN_NONE);
+        rwlock_t          *global_lock = &ksocknal_data.ksnd_global_lock;
+        __u32              ipaddrs[SOCKNAL_MAX_INTERFACES];
+        int                nipaddrs;
         ptl_nid_t          nid;
+        struct list_head  *tmp;
         __u64              incarnation;
         unsigned long      flags;
         ksock_conn_t      *conn;
-        ksock_peer_t      *peer;
+        ksock_conn_t      *conn2;
+        ksock_peer_t      *peer = NULL;
         ksock_peer_t      *peer2;
         ksock_sched_t     *sched;
         unsigned int       irq;
@@ -693,45 +1110,23 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock,
          * it, and sock->file has that pre-cooked... */
         LASSERT (sock->file != NULL);
         LASSERT (file_count(sock->file) > 0);
+        LASSERT (route == NULL || !passive);
 
         rc = ksocknal_setup_sock (sock);
         if (rc != 0)
                 return (rc);
 
-        if (route == NULL) {
-                /* acceptor or explicit connect */
-                nid = PTL_NID_ANY;
-        } else {
-                LASSERT (type != SOCKNAL_CONN_NONE);
-                /* autoconnect: expect this nid on exchange */
-                nid = route->ksnr_peer->ksnp_nid;
-        }
-
-        rc = ksocknal_hello (sock, &nid, &type, &incarnation);
-        if (rc != 0)
-                return (rc);
-        
-        peer = NULL;
-        if (route == NULL) {                    /* not autoconnect */
-                /* Assume this socket connects to a brand new peer */
-                peer = ksocknal_create_peer (nid);
-                if (peer == NULL)
-                        return (-ENOMEM);
-        }
+        irq = ksocknal_sock_irq (sock);
 
         PORTAL_ALLOC(conn, sizeof(*conn));
-        if (conn == NULL) {
-                if (peer != NULL)
-                        ksocknal_put_peer (peer);
+        if (conn == NULL)
                 return (-ENOMEM);
-        }
 
         memset (conn, 0, sizeof (*conn));
         conn->ksnc_peer = NULL;
         conn->ksnc_route = NULL;
         conn->ksnc_sock = sock;
         conn->ksnc_type = type;
-        conn->ksnc_incarnation = incarnation;
         conn->ksnc_saved_data_ready = sock->sk->sk_data_ready;
         conn->ksnc_saved_write_space = sock->sk->sk_write_space;
         atomic_set (&conn->ksnc_refcount, 1);    /* 1 ref for me */
@@ -745,73 +1140,147 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock,
         conn->ksnc_tx_scheduled = 0;
         atomic_set (&conn->ksnc_tx_nob, 0);
 
-        ksocknal_get_peer_addr (conn);
+        /* stash conn's local and remote addrs */
+        rc = ksocknal_get_conn_addrs (conn);
+        if (rc != 0)
+                goto failed_0;
 
-        irq = ksocknal_conn_irq (conn);
+        if (!passive) {
+                /* Active connection sends HELLO eagerly */
+                rc = ksocknal_local_ipvec(ipaddrs);
+                if (rc < 0)
+                        goto failed_0;
+                nipaddrs = rc;
 
-        write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
+                rc = ksocknal_send_hello (conn, ipaddrs, nipaddrs);
+                if (rc != 0)
+                        goto failed_0;
+        }
+
+        /* Find out/confirm peer's NID and connection type and get the
+         * vector of interfaces she's willing to let me connect to */
+        nid = (route == NULL) ? PTL_NID_ANY : route->ksnr_peer->ksnp_nid;
+        rc = ksocknal_recv_hello (conn, &nid, &incarnation, ipaddrs);
+        if (rc < 0)
+                goto failed_0;
+        nipaddrs = rc;
+        LASSERT (nid != PTL_NID_ANY);
 
         if (route != NULL) {
-                /* Autoconnected! */
-                LASSERT ((route->ksnr_connected & (1 << type)) == 0);
-                LASSERT ((route->ksnr_connecting & (1 << type)) != 0);
-
-                if (route->ksnr_deleted) {
-                        /* This conn was autoconnected, but the autoconnect
-                         * route got deleted while it was being
-                         * established! */
-                        write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock,
-                                                 flags);
-                        PORTAL_FREE (conn, sizeof (*conn));
-                        return (-ESTALE);
+                peer = route->ksnr_peer;
+                atomic_inc(&peer->ksnp_refcount);
+        } else {
+                peer = ksocknal_create_peer(nid);
+                if (peer == NULL) {
+                        rc = -ENOMEM;
+                        goto failed_0;
                 }
 
+                write_lock_irqsave(global_lock, flags);
 
-                /* associate conn/route */
-                conn->ksnc_route = route;
-                atomic_inc (&route->ksnr_refcount);
-
-                route->ksnr_connecting &= ~(1 << type);
-                route->ksnr_connected  |= (1 << type);
-                route->ksnr_conn_count++;
-                route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL;
+                peer2 = ksocknal_find_peer_locked(nid);
+                if (peer2 == NULL) {
+                        /* NB this puts an "empty" peer in the peer
+                         * table (which takes my ref) */
+                        list_add_tail(&peer->ksnp_list,
+                                      ksocknal_nid2peerlist(nid));
+                } else  {
+                        ksocknal_put_peer(peer);
+                        peer = peer2;
+                }
+                /* +1 ref for me */
+                atomic_inc(&peer->ksnp_refcount);
 
-                peer = route->ksnr_peer;
+                write_unlock_irqrestore(global_lock, flags);
+        }
+        
+        if (!passive) {
+                ksocknal_create_routes(peer, conn->ksnc_port, 
+                                       ipaddrs, nipaddrs);
+                rc = 0;
         } else {
-                /* Not an autoconnected connection; see if there is an
-                 * existing peer for this NID */
-                peer2 = ksocknal_find_peer_locked (nid);
-                if (peer2 != NULL) {
-                        ksocknal_put_peer (peer);
-                        peer = peer2;
-                } else {
-                        list_add (&peer->ksnp_list,
-                                  ksocknal_nid2peerlist (nid));
-                        /* peer list takes over existing ref */
+                rc = ksocknal_select_ips(peer, ipaddrs, nipaddrs);
+                LASSERT (rc >= 0);
+                rc = ksocknal_send_hello (conn, ipaddrs, rc);
+        }
+        if (rc < 0)
+                goto failed_1;
+        
+        write_lock_irqsave (global_lock, flags);
+
+        if (peer->ksnp_closing ||
+            (route != NULL && route->ksnr_deleted)) {
+                /* route/peer got closed under me */
+                rc = -ESTALE;
+                goto failed_2;
+        }
+
+        /* Refuse to duplicate an existing connection (both sides might
+         * autoconnect at once), unless this is a loopback connection */
+        if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) {
+                list_for_each(tmp, &peer->ksnp_conns) {
+                        conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+                        if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr ||
+                            conn2->ksnc_myipaddr != conn->ksnc_myipaddr ||
+                            conn2->ksnc_type != conn->ksnc_type ||
+                            conn2->ksnc_incarnation != incarnation)
+                                continue;
+
+                        CWARN("Not creating duplicate connection to "
+                              "%u.%u.%u.%u type %d\n",
+                              HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_type);
+                        rc = -EALREADY;
+                        goto failed_2;
                 }
         }
 
+        /* If the connection created by this route didn't bind to the IP
+         * address the route connected to, the connection/route matching
+         * code below probably isn't going to work. */
+        if (route != NULL &&
+            route->ksnr_ipaddr != conn->ksnc_ipaddr) {
+                CERROR("Route "LPX64" %u.%u.%u.%u connected to %u.%u.%u.%u\n",
+                       peer->ksnp_nid,
+                       HIPQUAD(route->ksnr_ipaddr),
+                       HIPQUAD(conn->ksnc_ipaddr));
+        }
+
+        /* Search for a route corresponding to the new connection and
+         * create an association.  This allows incoming connections created
+         * by routes in my peer to match my own route entries so I don't
+         * continually create duplicate routes. */
+        list_for_each (tmp, &peer->ksnp_routes) {
+                route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+                if (route->ksnr_ipaddr != conn->ksnc_ipaddr)
+                        continue;
+                
+                ksocknal_associate_route_conn_locked(route, conn);
+                break;
+        }
+
         /* Give conn a ref on sock->file since we're going to return success */
         get_file(sock->file);
 
-        LASSERT (!peer->ksnp_closing);
-
-        conn->ksnc_peer = peer;
-        atomic_inc (&peer->ksnp_refcount);
+        conn->ksnc_peer = peer;                 /* conn takes my ref on peer */
+        conn->ksnc_incarnation = incarnation;
         peer->ksnp_last_alive = jiffies;
         peer->ksnp_error = 0;
 
+        sched = ksocknal_choose_scheduler_locked (irq);
+        sched->kss_nconns++;
+        conn->ksnc_scheduler = sched;
+
         /* Set the deadline for the outgoing HELLO to drain */
+        conn->ksnc_tx_bufnob = sock->sk->sk_wmem_queued;
         conn->ksnc_tx_deadline = jiffies +
-                                 ksocknal_data.ksnd_io_timeout * HZ;
+                                 ksocknal_tunables.ksnd_io_timeout * HZ;
+        mb();       /* order with adding to peer's conn list */
 
         list_add (&conn->ksnc_list, &peer->ksnp_conns);
         atomic_inc (&conn->ksnc_refcount);
 
-        sched = ksocknal_choose_scheduler_locked (irq);
-        sched->kss_nconns++;
-        conn->ksnc_scheduler = sched;
-
         /* NB my callbacks block while I hold ksnd_global_lock */
         sock->sk->sk_user_data = conn;
         sock->sk->sk_data_ready = ksocknal_data_ready;
@@ -819,10 +1288,7 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock,
 
         /* Take all the packets blocking for a connection.
          * NB, it might be nicer to share these blocked packets among any
-         * other connections that are becoming established, however that
-         * confuses the normal packet launching operation, which selects a
-         * connection and queues the packet on it without needing an
-         * exclusive lock on ksnd_global_lock. */
+         * other connections that are becoming established. */
         while (!list_empty (&peer->ksnp_tx_queue)) {
                 tx = list_entry (peer->ksnp_tx_queue.next,
                                  ksock_tx_t, tx_list);
@@ -831,27 +1297,47 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock,
                 ksocknal_queue_tx_locked (tx, conn);
         }
 
-        rc = ksocknal_close_stale_conns_locked (peer, incarnation);
-
-        write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
-
+        rc = ksocknal_close_stale_conns_locked(peer, incarnation);
         if (rc != 0)
                 CERROR ("Closed %d stale conns to nid "LPX64" ip %d.%d.%d.%d\n",
                         rc, conn->ksnc_peer->ksnp_nid,
                         HIPQUAD(conn->ksnc_ipaddr));
 
-        if (bind_irq)                           /* irq binding required */
-                ksocknal_bind_irq (irq);
+        write_unlock_irqrestore (global_lock, flags);
+
+        ksocknal_bind_irq (irq);
 
         /* Call the callbacks right now to get things going. */
-        ksocknal_data_ready (sock->sk, 0);
-        ksocknal_write_space (sock->sk);
+        if (ksocknal_getconnsock(conn) == 0) {
+                ksocknal_data_ready (sock->sk, 0);
+                ksocknal_write_space (sock->sk);
+                ksocknal_putconnsock(conn);
+        }
 
-        CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64" ip %d.%d.%d.%d\n",
-               conn, conn->ksnc_peer->ksnp_nid, HIPQUAD(conn->ksnc_ipaddr));
+        CWARN("New conn nid:"LPX64" %u.%u.%u.%u -> %u.%u.%u.%u/%d"
+              " incarnation:"LPX64" sched[%d]/%d\n",
+              nid, HIPQUAD(conn->ksnc_myipaddr), 
+              HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, incarnation,
+              (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq);
 
         ksocknal_put_conn (conn);
         return (0);
+
+ failed_2:
+        if (!peer->ksnp_closing &&
+            list_empty (&peer->ksnp_conns) &&
+            list_empty (&peer->ksnp_routes))
+                ksocknal_unlink_peer_locked(peer);
+        write_unlock_irqrestore(global_lock, flags);
+
+ failed_1:
+        ksocknal_put_peer (peer);
+
+ failed_0:
+        PORTAL_FREE (conn, sizeof(*conn));
+
+        LASSERT (rc != 0);
+        return (rc);
 }
 
 void
@@ -860,14 +1346,19 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error)
         /* This just does the immmediate housekeeping, and queues the
          * connection for the reaper to terminate.
          * Caller holds ksnd_global_lock exclusively in irq context */
-        ksock_peer_t   *peer = conn->ksnc_peer;
-        ksock_route_t  *route;
+        ksock_peer_t      *peer = conn->ksnc_peer;
+        ksock_route_t     *route;
+        ksock_conn_t      *conn2;
+        struct list_head  *tmp;
 
         LASSERT (peer->ksnp_error == 0);
         LASSERT (!conn->ksnc_closing);
         conn->ksnc_closing = 1;
         atomic_inc (&ksocknal_data.ksnd_nclosing_conns);
         
+        /* ksnd_deathrow_conns takes over peer's ref */
+        list_del (&conn->ksnc_list);
+
         route = conn->ksnc_route;
         if (route != NULL) {
                 /* dissociate conn from route... */
@@ -875,18 +1366,28 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error)
                 LASSERT ((route->ksnr_connecting & (1 << conn->ksnc_type)) == 0);
                 LASSERT ((route->ksnr_connected & (1 << conn->ksnc_type)) != 0);
 
-                route->ksnr_connected &= ~(1 << conn->ksnc_type);
+                conn2 = NULL;
+                list_for_each(tmp, &peer->ksnp_conns) {
+                        conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
+                        
+                        if (conn2->ksnc_route == route &&
+                            conn2->ksnc_type == conn->ksnc_type)
+                                break;
+                        
+                        conn2 = NULL;
+                }
+                if (conn2 == NULL)
+                        route->ksnr_connected &= ~(1 << conn->ksnc_type);
+
                 conn->ksnc_route = NULL;
 
+#if 0           /* irrelevant with only eager routes */
                 list_del (&route->ksnr_list);   /* make route least favourite */
                 list_add_tail (&route->ksnr_list, &peer->ksnp_routes);
-                
+#endif
                 ksocknal_put_route (route);     /* drop conn's ref on route */
         }
 
-        /* ksnd_deathrow_conns takes over peer's ref */
-        list_del (&conn->ksnc_list);
-
         if (list_empty (&peer->ksnp_conns)) {
                 /* No more connections to this peer */
 
@@ -1076,6 +1577,11 @@ ksocknal_close_stale_conns_locked (ksock_peer_t *peer, __u64 incarnation)
 
                 if (conn->ksnc_incarnation == incarnation)
                         continue;
+
+                CWARN("Closing stale conn nid:"LPX64" ip:%08x/%d "
+                      "incarnation:"LPX64"("LPX64")\n",
+                      peer->ksnp_nid, conn->ksnc_ipaddr, conn->ksnc_port,
+                      conn->ksnc_incarnation, incarnation);
                 
                 count++;
                 ksocknal_close_conn_locked (conn, -ESTALE);
@@ -1296,44 +1802,213 @@ ksocknal_push (ptl_nid_t nid)
 }
 
 int
-ksocknal_cmd(struct portals_cfg *pcfg, void * private)
+ksocknal_add_interface(__u32 ipaddress, __u32 netmask)
+{
+        unsigned long      flags;
+        ksock_interface_t *iface;
+        int                rc;
+        int                i;
+        int                j;
+        struct list_head  *ptmp;
+        ksock_peer_t      *peer;
+        struct list_head  *rtmp;
+        ksock_route_t     *route;
+
+        if (ipaddress == 0 ||           /* reject wildcard address */
+            netmask == 0)               /* ...and empty netmask */
+                return (-EINVAL);
+
+        write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags);
+
+        iface = ksocknal_ip2iface(ipaddress);
+        if (iface != NULL) {
+                /* silently ignore dups */
+                rc = 0;
+        } else if (ksocknal_data.ksnd_ninterfaces == SOCKNAL_MAX_INTERFACES) {
+                rc = -ENOSPC;
+        } else {
+                iface = &ksocknal_data.ksnd_interfaces[ksocknal_data.ksnd_ninterfaces++];
+
+                iface->ksni_ipaddr = ipaddress;
+                iface->ksni_netmask = netmask;
+                iface->ksni_nroutes = 0;
+                iface->ksni_npeers = 0;
+
+                for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+                        list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) {
+                                peer = list_entry(ptmp, ksock_peer_t, ksnp_list);
+
+                                for (j = 0; j < peer->ksnp_n_passive_ips; j++) /* FIX: was "i <" -- wrong index ran loop unbounded / overran ksnp_passive_ips */
+                                        if (peer->ksnp_passive_ips[j] == ipaddress)
+                                                iface->ksni_npeers++;
+                                
+                                list_for_each(rtmp, &peer->ksnp_routes) {
+                                        route = list_entry(rtmp, ksock_route_t, ksnr_list);
+                                        
+                                        if (route->ksnr_myipaddr == ipaddress)
+                                                iface->ksni_nroutes++;
+                                }
+                        }
+                }
+
+                rc = 0;
+                /* NB only new connections will pay attention to the new interface! */
+        }
+        
+        write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags);
+
+        return (rc);
+}
+
+void
+ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr)
+{
+        struct list_head   *tmp;
+        struct list_head   *nxt;
+        ksock_route_t      *route;
+        ksock_conn_t       *conn;
+        int                 i;
+        int                 j;
+
+        for (i = 0; i < peer->ksnp_n_passive_ips; i++) /* drop ipaddr from the passive-IP array, shifting the tail down one slot */
+                if (peer->ksnp_passive_ips[i] == ipaddr) {
+                        for (j = i+1; j < peer->ksnp_n_passive_ips; j++)
+                                peer->ksnp_passive_ips[j-1] =
+                                        peer->ksnp_passive_ips[j];
+                        peer->ksnp_n_passive_ips--;
+                        break;
+                }
+
+        list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { /* _safe: entries may be unlinked while walking */
+                route = list_entry (tmp, ksock_route_t, ksnr_list);
+                
+                if (route->ksnr_myipaddr != ipaddr) /* only routes bound to the interface being removed */
+                        continue;
+                
+                if (route->ksnr_share_count != 0) {
+                        /* Manually created; keep, but unbind */
+                        route->ksnr_myipaddr = 0;
+                } else {
+                        ksocknal_del_route_locked(route);
+                }
+        }
+        
+        list_for_each_safe(tmp, nxt, &peer->ksnp_conns) { /* close every conn whose local end used this interface */
+                conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+                
+                if (conn->ksnc_myipaddr == ipaddr)
+                        ksocknal_close_conn_locked (conn, 0);
+        }
+}
+
+int
+ksocknal_del_interface(__u32 ipaddress)
 {
-        int rc = -EINVAL;
+        int                rc = -ENOENT; /* stays -ENOENT unless at least one interface matched */
+        unsigned long      flags;
+        struct list_head  *tmp;
+        struct list_head  *nxt;
+        ksock_peer_t      *peer;
+        __u32              this_ip;
+        int                i;
+        int                j;
+
+        write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags);
+
+        for (i = 0; i < ksocknal_data.ksnd_ninterfaces; i++) {
+                this_ip = ksocknal_data.ksnd_interfaces[i].ksni_ipaddr;
+
+                if (!(ipaddress == 0 ||  /* ipaddress==0 acts as a wildcard: delete all interfaces */
+                      ipaddress == this_ip))
+                        continue;
+
+                rc = 0;
+
+                for (j = i+1; j < ksocknal_data.ksnd_ninterfaces; j++) /* compact the array over the removed slot */
+                        ksocknal_data.ksnd_interfaces[j-1] =
+                                ksocknal_data.ksnd_interfaces[j];
+                
+                ksocknal_data.ksnd_ninterfaces--;
+
+                for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) { /* scrub every peer's state referring to this IP */
+                        list_for_each_safe(tmp, nxt, &ksocknal_data.ksnd_peers[j]) {
+                                peer = list_entry(tmp, ksock_peer_t, ksnp_list);
+                                
+                                ksocknal_peer_del_interface_locked(peer, this_ip);
+                        }
+                }
+        }
+        
+        write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags);
+        
+        return (rc);
+}
 
-        LASSERT (pcfg != NULL);
+int
+ksocknal_cmd(struct portals_cfg *pcfg, void * private)
+{
+        int rc;
 
         switch(pcfg->pcfg_command) {
-        case NAL_CMD_GET_AUTOCONN: {
-                ksock_route_t *route = ksocknal_get_route_by_idx (pcfg->pcfg_count);
+        case NAL_CMD_GET_INTERFACE: {
+                ksock_interface_t *iface;
+
+                read_lock (&ksocknal_data.ksnd_global_lock);
 
-                if (route == NULL)
+                if (pcfg->pcfg_count < 0 ||
+                    pcfg->pcfg_count >= ksocknal_data.ksnd_ninterfaces) {
                         rc = -ENOENT;
-                else {
+                } else { /* FIX: added "}" -- patch produced "if (...) { ...; else {", a compile error */
                         rc = 0;
-                        pcfg->pcfg_nid   = route->ksnr_peer->ksnp_nid;
-                        pcfg->pcfg_id    = route->ksnr_ipaddr;
-                        pcfg->pcfg_misc  = route->ksnr_port;
-                        pcfg->pcfg_count = route->ksnr_conn_count;
-                        pcfg->pcfg_size  = route->ksnr_buffer_size;
-                        pcfg->pcfg_wait  = route->ksnr_sharecount;
-                        pcfg->pcfg_flags = (route->ksnr_irq_affinity ? 2 : 0) |
-                                           (route->ksnr_eager        ? 4 : 0);
-                        ksocknal_put_route (route);
+                        iface = &ksocknal_data.ksnd_interfaces[pcfg->pcfg_count];
+
+                        pcfg->pcfg_id    = iface->ksni_ipaddr;
+                        pcfg->pcfg_misc  = iface->ksni_netmask;
+                        pcfg->pcfg_fd    = iface->ksni_npeers;
+                        pcfg->pcfg_count = iface->ksni_nroutes;
                 }
+                
+                read_unlock (&ksocknal_data.ksnd_global_lock);
+                break;
+        }
+        case NAL_CMD_ADD_INTERFACE: {
+                rc = ksocknal_add_interface(pcfg->pcfg_id, /* IP address */
+                                            pcfg->pcfg_misc); /* net mask */
+                break;
+        }
+        case NAL_CMD_DEL_INTERFACE: {
+                rc = ksocknal_del_interface(pcfg->pcfg_id); /* IP address */
+                break;
+        }
+        case NAL_CMD_GET_PEER: {
+                ptl_nid_t    nid = 0;
+                __u32        myip = 0;
+                __u32        ip = 0;
+                int          port = 0;
+                int          conn_count = 0;
+                int          share_count = 0;
+                
+                rc = ksocknal_get_peer_info(pcfg->pcfg_count, &nid,
+                                            &myip, &ip, &port,
+                                            &conn_count,  &share_count);
+                pcfg->pcfg_nid   = nid;
+                pcfg->pcfg_size  = myip;
+                pcfg->pcfg_id    = ip;
+                pcfg->pcfg_misc  = port;
+                pcfg->pcfg_count = conn_count;
+                pcfg->pcfg_wait  = share_count;
                 break;
         }
-        case NAL_CMD_ADD_AUTOCONN: {
-                rc = ksocknal_add_route (pcfg->pcfg_nid, pcfg->pcfg_id,
-                                         pcfg->pcfg_misc, pcfg->pcfg_size,
-                                         (pcfg->pcfg_flags & 0x02) != 0,
-                                         (pcfg->pcfg_flags & 0x04) != 0,
-                                         (pcfg->pcfg_flags & 0x08) != 0);
+        case NAL_CMD_ADD_PEER: {
+                rc = ksocknal_add_peer (pcfg->pcfg_nid, 
+                                        pcfg->pcfg_id, /* IP */
+                                        pcfg->pcfg_misc); /* port */
                 break;
         }
-        case NAL_CMD_DEL_AUTOCONN: {
-                rc = ksocknal_del_route (pcfg->pcfg_nid, pcfg->pcfg_id, 
-                                         (pcfg->pcfg_flags & 1) != 0,
-                                         (pcfg->pcfg_flags & 2) != 0);
+        case NAL_CMD_DEL_PEER: {
+                rc = ksocknal_del_peer (pcfg->pcfg_nid, 
+                                        pcfg->pcfg_id, /* IP */
+                                        pcfg->pcfg_flags); /* single_share? */
                 break;
         }
         case NAL_CMD_GET_CONN: {
@@ -1342,11 +2017,23 @@ ksocknal_cmd(struct portals_cfg *pcfg, void * private)
                 if (conn == NULL)
                         rc = -ENOENT;
                 else {
+                        int   txmem;
+                        int   rxmem;
+                        int   nagle;
+
+                        ksocknal_get_conn_tunables(conn, &txmem, &rxmem, &nagle);
+
                         rc = 0;
-                        pcfg->pcfg_nid   = conn->ksnc_peer->ksnp_nid;
-                        pcfg->pcfg_id    = conn->ksnc_ipaddr;
-                        pcfg->pcfg_misc  = conn->ksnc_port;
-                        pcfg->pcfg_flags = conn->ksnc_type;
+                        pcfg->pcfg_nid    = conn->ksnc_peer->ksnp_nid;
+                        pcfg->pcfg_id     = conn->ksnc_ipaddr;
+                        pcfg->pcfg_misc   = conn->ksnc_port;
+                        pcfg->pcfg_fd     = conn->ksnc_myipaddr;
+                        pcfg->pcfg_flags  = conn->ksnc_type;
+                        pcfg->pcfg_gw_nal = conn->ksnc_scheduler - 
+                                            ksocknal_data.ksnd_schedulers;
+                        pcfg->pcfg_count  = txmem;
+                        pcfg->pcfg_size   = rxmem;
+                        pcfg->pcfg_wait   = nagle;
                         ksocknal_put_conn (conn);
                 }
                 break;
@@ -1364,12 +2051,13 @@ ksocknal_cmd(struct portals_cfg *pcfg, void * private)
                 case SOCKNAL_CONN_CONTROL:
                 case SOCKNAL_CONN_BULK_IN:
                 case SOCKNAL_CONN_BULK_OUT:
-                        rc = ksocknal_create_conn(NULL, sock, pcfg->pcfg_flags, type);
+                        rc = ksocknal_create_conn(NULL, sock, type);
+                        break;
                 default:
+                        rc = -EINVAL;
                         break;
                 }
-                if (rc != 0)
-                        fput (sock->file);
+                fput (sock->file);
                 break;
         }
         case NAL_CMD_CLOSE_CONNECTION: {
@@ -1385,6 +2073,9 @@ ksocknal_cmd(struct portals_cfg *pcfg, void * private)
                 rc = ksocknal_push (pcfg->pcfg_nid);
                 break;
         }
+        default:
+                rc = -EINVAL;
+                break;
         }
 
         return rc;
@@ -1424,7 +2115,7 @@ ksocknal_free_buffers (void)
 
         if (ksocknal_data.ksnd_schedulers != NULL)
                 PORTAL_FREE (ksocknal_data.ksnd_schedulers,
-                             sizeof (ksock_sched_t) * SOCKNAL_N_SCHED);
+                             sizeof (ksock_sched_t) * ksocknal_data.ksnd_nschedulers);
 
         PORTAL_FREE (ksocknal_data.ksnd_peers,
                      sizeof (struct list_head) * 
@@ -1432,37 +2123,39 @@ ksocknal_free_buffers (void)
 }
 
 void
-ksocknal_module_fini (void)
+ksocknal_api_shutdown (nal_t *nal)
 {
-        int   i;
+        ksock_sched_t *sched;
+        int            i;
+
+        if (nal->nal_refct != 0) {
+                /* This module got the first ref */
+                PORTAL_MODULE_UNUSE;
+                return;
+        }
 
         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
                atomic_read (&portal_kmemory));
 
+        LASSERT(nal == &ksocknal_api);
+
         switch (ksocknal_data.ksnd_init) {
         default:
                 LASSERT (0);
 
         case SOCKNAL_INIT_ALL:
-#if CONFIG_SYSCTL
-                if (ksocknal_data.ksnd_sysctl != NULL)
-                        unregister_sysctl_table (ksocknal_data.ksnd_sysctl);
-#endif
-                kportal_nal_unregister(SOCKNAL);
-                PORTAL_SYMBOL_UNREGISTER (ksocknal_ni);
+                libcfs_nal_cmd_unregister(SOCKNAL);
+
+                ksocknal_data.ksnd_init = SOCKNAL_INIT_LIB;
                 /* fall through */
 
-        case SOCKNAL_INIT_PTL:
+        case SOCKNAL_INIT_LIB:
                 /* No more calls to ksocknal_cmd() to create new
                  * autoroutes/connections since we're being unloaded. */
-                PtlNIFini(ksocknal_ni);
 
-                /* Delete all autoroute entries */
-                ksocknal_del_route(PTL_NID_ANY, 0, 0, 0);
+                /* Delete all peers */
+                ksocknal_del_peer(PTL_NID_ANY, 0, 0);
 
-                /* Delete all connections */
-                ksocknal_close_matching_conns (PTL_NID_ANY, 0);
-                
                 /* Wait for all peer state to clean up */
                 i = 2;
                 while (atomic_read (&ksocknal_data.ksnd_npeers) != 0) {
@@ -1476,11 +2169,11 @@ ksocknal_module_fini (void)
 
                 /* Tell lib we've stopped calling into her. */
                 lib_fini(&ksocknal_lib);
+
+                ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
                 /* fall through */
 
         case SOCKNAL_INIT_DATA:
-                /* Module refcount only gets to zero when all peers
-                 * have been closed so all lists must be empty */
                 LASSERT (atomic_read (&ksocknal_data.ksnd_npeers) == 0);
                 LASSERT (ksocknal_data.ksnd_peers != NULL);
                 for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
@@ -1493,7 +2186,7 @@ ksocknal_module_fini (void)
                 LASSERT (list_empty (&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns));
 
                 if (ksocknal_data.ksnd_schedulers != NULL)
-                        for (i = 0; i < SOCKNAL_N_SCHED; i++) {
+                        for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
                                 ksock_sched_t *kss =
                                         &ksocknal_data.ksnd_schedulers[i];
 
@@ -1510,19 +2203,30 @@ ksocknal_module_fini (void)
                 wake_up_all (&ksocknal_data.ksnd_autoconnectd_waitq);
                 wake_up_all (&ksocknal_data.ksnd_reaper_waitq);
 
-                for (i = 0; i < SOCKNAL_N_SCHED; i++)
-                       wake_up_all(&ksocknal_data.ksnd_schedulers[i].kss_waitq);
+                for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
+                        sched = &ksocknal_data.ksnd_schedulers[i];
+                        wake_up_all(&sched->kss_waitq);
+                }
 
-                while (atomic_read (&ksocknal_data.ksnd_nthreads) != 0) {
-                        CDEBUG (D_NET, "waitinf for %d threads to terminate\n",
-                                atomic_read (&ksocknal_data.ksnd_nthreads));
+                i = 4;
+                read_lock(&ksocknal_data.ksnd_global_lock);
+                while (ksocknal_data.ksnd_nthreads != 0) {
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                               "waiting for %d threads to terminate\n",
+                                ksocknal_data.ksnd_nthreads);
+                        read_unlock(&ksocknal_data.ksnd_global_lock);
                         set_current_state (TASK_UNINTERRUPTIBLE);
                         schedule_timeout (HZ);
+                        read_lock(&ksocknal_data.ksnd_global_lock);
                 }
+                read_unlock(&ksocknal_data.ksnd_global_lock);
 
                 kpr_deregister (&ksocknal_data.ksnd_router);
 
                 ksocknal_free_buffers();
+
+                ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING;
                 /* fall through */
 
         case SOCKNAL_INIT_NOTHING:
@@ -1537,7 +2241,7 @@ ksocknal_module_fini (void)
 }
 
 
-void __init
+void
 ksocknal_init_incarnation (void)
 {
         struct timeval tv;
@@ -1553,43 +2257,31 @@ ksocknal_init_incarnation (void)
                 (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
 }
 
-int __init
-ksocknal_module_init (void)
+int
+ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
+                      ptl_ni_limits_t *requested_limits,
+                      ptl_ni_limits_t *actual_limits)
 {
-        int   pkmem = atomic_read(&portal_kmemory);
-        int   rc;
-        int   i;
-        int   j;
+        ptl_process_id_t  process_id;
+        int               pkmem = atomic_read(&portal_kmemory);
+        int               rc;
+        int               i;
+        int               j;
 
-        /* packet descriptor must fit in a router descriptor's scratchpad */
-        LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t));
-        /* the following must be sizeof(int) for proc_dointvec() */
-        LASSERT(sizeof (ksocknal_data.ksnd_io_timeout) == sizeof (int));
-        LASSERT(sizeof (ksocknal_data.ksnd_eager_ack) == sizeof (int));
-        /* check ksnr_connected/connecting field large enough */
-        LASSERT(SOCKNAL_CONN_NTYPES <= 4);
-        
-        LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
+        LASSERT (nal == &ksocknal_api);
 
-        ksocknal_api.forward  = ksocknal_api_forward;
-        ksocknal_api.shutdown = ksocknal_api_shutdown;
-        ksocknal_api.yield    = ksocknal_api_yield;
-        ksocknal_api.validate = NULL;           /* our api validate is a NOOP */
-        ksocknal_api.lock     = ksocknal_api_lock;
-        ksocknal_api.unlock   = ksocknal_api_unlock;
-        ksocknal_api.nal_data = &ksocknal_data;
+        if (nal->nal_refct != 0) {
+                if (actual_limits != NULL)
+                        *actual_limits = ksocknal_lib.libnal_ni.ni_actual_limits;
+                /* This module got the first ref */
+                PORTAL_MODULE_USE;
+                return (PTL_OK);
+        }
 
-        ksocknal_lib.nal_data = &ksocknal_data;
+        LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
 
         memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */
 
-        ksocknal_data.ksnd_io_timeout = SOCKNAL_IO_TIMEOUT;
-        ksocknal_data.ksnd_eager_ack  = SOCKNAL_EAGER_ACK;
-        ksocknal_data.ksnd_typed_conns = SOCKNAL_TYPED_CONNS;
-        ksocknal_data.ksnd_min_bulk   = SOCKNAL_MIN_BULK;
-#if SOCKNAL_ZC
-        ksocknal_data.ksnd_zc_min_frag = SOCKNAL_ZC_MIN_FRAG;
-#endif
         ksocknal_init_incarnation();
         
         ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE;
@@ -1603,9 +2295,6 @@ ksocknal_module_init (void)
 
         rwlock_init(&ksocknal_data.ksnd_global_lock);
 
-        ksocknal_data.ksnd_nal_cb = &ksocknal_lib;
-        spin_lock_init (&ksocknal_data.ksnd_nal_cb_lock);
-
         spin_lock_init(&ksocknal_data.ksnd_small_fmp.fmp_lock);
         INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_idle_fmbs);
         INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns);
@@ -1632,14 +2321,15 @@ ksocknal_module_init (void)
         /* flag lists/ptrs/locks initialised */
         ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
 
+        ksocknal_data.ksnd_nschedulers = ksocknal_nsched();
         PORTAL_ALLOC(ksocknal_data.ksnd_schedulers,
-                     sizeof(ksock_sched_t) * SOCKNAL_N_SCHED);
+                     sizeof(ksock_sched_t) * ksocknal_data.ksnd_nschedulers);
         if (ksocknal_data.ksnd_schedulers == NULL) {
-                ksocknal_module_fini ();
+                ksocknal_api_shutdown (nal);
                 return (-ENOMEM);
         }
 
-        for (i = 0; i < SOCKNAL_N_SCHED; i++) {
+        for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
                 ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i];
 
                 spin_lock_init (&kss->kss_lock);
@@ -1651,23 +2341,27 @@ ksocknal_module_init (void)
                 init_waitqueue_head (&kss->kss_waitq);
         }
 
-        rc = PtlNIInit(ksocknal_init, 32, 4, 0, &ksocknal_ni);
-        if (rc != 0) {
-                CERROR("ksocknal: PtlNIInit failed: error %d\n", rc);
-                ksocknal_module_fini ();
+        /* NB we have to wait to be told our true NID... */
+        process_id.pid = requested_pid; 
+        process_id.nid = 0;
+        
+        rc = lib_init(&ksocknal_lib, nal, process_id,
+                      requested_limits, actual_limits);
+        if (rc != PTL_OK) {
+                CERROR("lib_init failed: error %d\n", rc);
+                ksocknal_api_shutdown (nal);
                 return (rc);
         }
-        PtlNIDebug(ksocknal_ni, ~0);
 
-        ksocknal_data.ksnd_init = SOCKNAL_INIT_PTL; // flag PtlNIInit() called
+        ksocknal_data.ksnd_init = SOCKNAL_INIT_LIB; // flag lib_init() called
 
-        for (i = 0; i < SOCKNAL_N_SCHED; i++) {
+        for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
                 rc = ksocknal_thread_start (ksocknal_scheduler,
                                             &ksocknal_data.ksnd_schedulers[i]);
                 if (rc != 0) {
                         CERROR("Can't spawn socknal scheduler[%d]: %d\n",
                                i, rc);
-                        ksocknal_module_fini ();
+                        ksocknal_api_shutdown (nal);
                         return (rc);
                 }
         }
@@ -1676,7 +2370,7 @@ ksocknal_module_init (void)
                 rc = ksocknal_thread_start (ksocknal_autoconnectd, (void *)((long)i));
                 if (rc != 0) {
                         CERROR("Can't spawn socknal autoconnectd: %d\n", rc);
-                        ksocknal_module_fini ();
+                        ksocknal_api_shutdown (nal);
                         return (rc);
                 }
         }
@@ -1684,7 +2378,7 @@ ksocknal_module_init (void)
         rc = ksocknal_thread_start (ksocknal_reaper, NULL);
         if (rc != 0) {
                 CERROR ("Can't spawn socknal reaper: %d\n", rc);
-                ksocknal_module_fini ();
+                ksocknal_api_shutdown (nal);
                 return (rc);
         }
 
@@ -1694,7 +2388,7 @@ ksocknal_module_init (void)
                 CDEBUG(D_NET, "Can't initialise routing interface "
                        "(rc = %d): not routing\n", rc);
         } else {
-                /* Only allocate forwarding buffers if I'm on a gateway */
+                /* Only allocate forwarding buffers if there's a router */
 
                 for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS +
                                  SOCKNAL_LARGE_FWD_NMSGS); i++) {
@@ -1710,7 +2404,7 @@ ksocknal_module_init (void)
                         PORTAL_ALLOC(fmb, offsetof(ksock_fmb_t, 
                                                    fmb_kiov[pool->fmp_buff_pages]));
                         if (fmb == NULL) {
-                                ksocknal_module_fini();
+                                ksocknal_api_shutdown(nal);
                                 return (-ENOMEM);
                         }
 
@@ -1720,7 +2414,7 @@ ksocknal_module_init (void)
                                 fmb->fmb_kiov[j].kiov_page = alloc_page(GFP_KERNEL);
 
                                 if (fmb->fmb_kiov[j].kiov_page == NULL) {
-                                        ksocknal_module_fini ();
+                                        ksocknal_api_shutdown (nal);
                                         return (-ENOMEM);
                                 }
 
@@ -1731,27 +2425,100 @@ ksocknal_module_init (void)
                 }
         }
 
-        rc = kportal_nal_register(SOCKNAL, &ksocknal_cmd, NULL);
+        rc = libcfs_nal_cmd_register(SOCKNAL, &ksocknal_cmd, NULL);
         if (rc != 0) {
                 CERROR ("Can't initialise command interface (rc = %d)\n", rc);
-                ksocknal_module_fini ();
+                ksocknal_api_shutdown (nal);
                 return (rc);
         }
 
-        PORTAL_SYMBOL_REGISTER(ksocknal_ni);
-
-#ifdef CONFIG_SYSCTL
-        /* Press on regardless even if registering sysctl doesn't work */
-        ksocknal_data.ksnd_sysctl = register_sysctl_table (ksocknal_top_ctl_table, 0);
-#endif
         /* flag everything initialised */
         ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
 
         printk(KERN_INFO "Lustre: Routing socket NAL loaded "
-               "(Routing %s, initial mem %d)\n",
+               "(Routing %s, initial mem %d, incarnation "LPX64")\n",
                kpr_routing (&ksocknal_data.ksnd_router) ?
-               "enabled" : "disabled", pkmem);
+               "enabled" : "disabled", pkmem, ksocknal_data.ksnd_incarnation);
+
+        return (0);
+}
+
+void __exit
+ksocknal_module_fini (void)
+{
+#ifdef CONFIG_SYSCTL
+        if (ksocknal_tunables.ksnd_sysctl != NULL)
+                unregister_sysctl_table (ksocknal_tunables.ksnd_sysctl);
+#endif
+        PtlNIFini(ksocknal_ni);
+
+        ptl_unregister_nal(SOCKNAL);
+}
+
+int __init
+ksocknal_module_init (void)
+{
+        int    rc;
+
+        /* packet descriptor must fit in a router descriptor's scratchpad */
+        LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t));
+        /* the following must be sizeof(int) for proc_dointvec() */
+        LASSERT(sizeof (ksocknal_tunables.ksnd_io_timeout) == sizeof (int));
+        LASSERT(sizeof (ksocknal_tunables.ksnd_eager_ack) == sizeof (int));
+        LASSERT(sizeof (ksocknal_tunables.ksnd_typed_conns) == sizeof (int));
+        LASSERT(sizeof (ksocknal_tunables.ksnd_min_bulk) == sizeof (int));
+        LASSERT(sizeof (ksocknal_tunables.ksnd_buffer_size) == sizeof (int));
+        LASSERT(sizeof (ksocknal_tunables.ksnd_nagle) == sizeof (int));
+        LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_idle) == sizeof (int));
+        LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_count) == sizeof (int));
+        LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_intvl) == sizeof (int));
+#if CPU_AFFINITY
+        LASSERT(sizeof (ksocknal_tunables.ksnd_irq_affinity) == sizeof (int));
+#endif
+#if SOCKNAL_ZC
+        LASSERT(sizeof (ksocknal_tunables.ksnd_zc_min_frag) == sizeof (int));
+#endif
+        /* check ksnr_connected/connecting field large enough */
+        LASSERT(SOCKNAL_CONN_NTYPES <= 4);
+        
+        ksocknal_api.nal_ni_init = ksocknal_api_startup;
+        ksocknal_api.nal_ni_fini = ksocknal_api_shutdown;
+
+        /* Initialise dynamic tunables to defaults once only */
+        ksocknal_tunables.ksnd_io_timeout      = SOCKNAL_IO_TIMEOUT;
+        ksocknal_tunables.ksnd_eager_ack       = SOCKNAL_EAGER_ACK;
+        ksocknal_tunables.ksnd_typed_conns     = SOCKNAL_TYPED_CONNS;
+        ksocknal_tunables.ksnd_min_bulk        = SOCKNAL_MIN_BULK;
+        ksocknal_tunables.ksnd_buffer_size     = SOCKNAL_BUFFER_SIZE;
+        ksocknal_tunables.ksnd_nagle           = SOCKNAL_NAGLE;
+        ksocknal_tunables.ksnd_keepalive_idle  = SOCKNAL_KEEPALIVE_IDLE;
+        ksocknal_tunables.ksnd_keepalive_count = SOCKNAL_KEEPALIVE_COUNT;
+        ksocknal_tunables.ksnd_keepalive_intvl = SOCKNAL_KEEPALIVE_INTVL;
+#if CPU_AFFINITY
+        ksocknal_tunables.ksnd_irq_affinity = SOCKNAL_IRQ_AFFINITY;
+#endif
+#if SOCKNAL_ZC
+        ksocknal_tunables.ksnd_zc_min_frag  = SOCKNAL_ZC_MIN_FRAG;
+#endif
+
+        rc = ptl_register_nal(SOCKNAL, &ksocknal_api);
+        if (rc != PTL_OK) {
+                CERROR("Can't register SOCKNAL: %d\n", rc);
+                return (-ENOMEM);               /* or something... */
+        }
 
+        /* Pure gateways want the NAL started up at module load time... */
+        rc = PtlNIInit(SOCKNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &ksocknal_ni);
+        if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
+                ptl_unregister_nal(SOCKNAL);
+                return (-ENODEV);
+        }
+        
+#ifdef CONFIG_SYSCTL
+        /* Press on regardless even if registering sysctl doesn't work */
+        ksocknal_tunables.ksnd_sysctl = 
+                register_sysctl_table (ksocknal_top_ctl_table, 0);
+#endif
         return (0);
 }
 
@@ -1762,4 +2529,3 @@ MODULE_LICENSE("GPL");
 module_init(ksocknal_module_init);
 module_exit(ksocknal_module_fini);
 
-EXPORT_SYMBOL (ksocknal_ni);
index 2bef800..b8bbefd 100644 (file)
@@ -67,7 +67,6 @@
 #include <portals/nal.h>
 #include <portals/socknal.h>
 
-#define SOCKNAL_N_SCHED         ksocknal_nsched() /* # socknal schedulers */
 #define SOCKNAL_N_AUTOCONNECTD  4               /* # socknal autoconnect daemons */
 
 #define SOCKNAL_MIN_RECONNECT_INTERVAL HZ      /* first failed connection retry... */
 #define SOCKNAL_TYPED_CONNS      1              /* unidirectional large, bidirectional small? */
 #define SOCKNAL_ZC_MIN_FRAG     (2<<10)         /* default smallest zerocopy fragment */
 #define SOCKNAL_MIN_BULK        (1<<10)         /* smallest "large" message */
-#define SOCKNAL_USE_KEEPALIVES   0              /* use tcp/ip keepalive? */
+#define SOCKNAL_BUFFER_SIZE     (8<<20)         /* default socket buffer size */
+#define SOCKNAL_NAGLE            0              /* enable/disable NAGLE? */
+#define SOCKNAL_IRQ_AFFINITY     1              /* enable/disable IRQ affinity? */
+#define SOCKNAL_KEEPALIVE_IDLE   0              /* # seconds idle before 1st probe */
+#define SOCKNAL_KEEPALIVE_COUNT  10             /* # unanswered probes to determine peer death */
+#define SOCKNAL_KEEPALIVE_INTVL  1              /* seconds between probes */
 
 #define SOCKNAL_PEER_HASH_SIZE   101            /* # peer lists */
 
 #define SOCKNAL_RESCHED         100             /* # scheduler loops before reschedule */
 #define SOCKNAL_ENOMEM_RETRY    1               /* jiffies between retries */
 
+#define SOCKNAL_MAX_INTERFACES  16              /* Largest number of interfaces we bind */
+
+#define SOCKNAL_ROUND_ROBIN     0               /* round robin / load balance */
+
 #define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sk_sndbuf*8)/10)
 
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,72))
+# define sk_allocation  allocation
 # define sk_data_ready data_ready
 # define sk_write_space write_space
 # define sk_user_data   user_data
 
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
 # define sk_wmem_queued wmem_queued
+# define sk_err         err
 #endif
 
 typedef struct                                  /* pool of forwarding buffers */
@@ -131,34 +141,52 @@ typedef struct                                  /* per scheduler state */
         int               kss_nconns;           /* # connections assigned to this scheduler */
 } ksock_sched_t;
 
-typedef struct {
+typedef struct
+{
         int               ksni_valid:1;         /* been set yet? */
         int               ksni_bound:1;         /* bound to a cpu yet? */
         int               ksni_sched:6;         /* which scheduler (assumes < 64) */
 } ksock_irqinfo_t;
 
-typedef struct {
-        int               ksnd_init;            /* initialisation state */
+typedef struct
+{
+        __u32             ksni_ipaddr;          /* interface's IP address */
+        __u32             ksni_netmask;         /* interface's network mask */
+        int               ksni_nroutes;         /* # routes using (active) */
+        int               ksni_npeers;          /* # peers using (passive) */
+} ksock_interface_t;
+
+typedef struct
+{
         int               ksnd_io_timeout;      /* "stuck" socket timeout (seconds) */
         int               ksnd_eager_ack;       /* make TCP ack eagerly? */
         int               ksnd_typed_conns;     /* drive sockets by type? */
         int               ksnd_min_bulk;        /* smallest "large" message */
+        int               ksnd_buffer_size;     /* socket buffer size */
+        int               ksnd_nagle;           /* enable NAGLE? */
+        int               ksnd_irq_affinity;    /* enable IRQ affinity? */
+        int               ksnd_keepalive_idle;  /* # idle secs before 1st probe */
+        int               ksnd_keepalive_count; /* # probes */
+        int               ksnd_keepalive_intvl; /* time between probes */
 #if SOCKNAL_ZC
         unsigned int      ksnd_zc_min_frag;     /* minimum zero copy frag size */
 #endif
         struct ctl_table_header *ksnd_sysctl;   /* sysctl interface */
+} ksock_tunables_t;
+
+typedef struct
+{
+        int               ksnd_init;            /* initialisation state */
         __u64             ksnd_incarnation;     /* my epoch */
         
         rwlock_t          ksnd_global_lock;     /* stabilize peer/conn ops */
         struct list_head *ksnd_peers;           /* hash table of all my known peers */
         int               ksnd_peer_hash_size;  /* size of ksnd_peers */
 
-        nal_cb_t         *ksnd_nal_cb;
-        spinlock_t        ksnd_nal_cb_lock;     /* lib cli/sti lock */
-
-        atomic_t          ksnd_nthreads;        /* # live threads */
+        int               ksnd_nthreads;        /* # live threads */
         int               ksnd_shuttingdown;    /* tell threads to exit */
-        ksock_sched_t    *ksnd_schedulers;      /* scheduler state */
+        int               ksnd_nschedulers;     /* # schedulers */
+        ksock_sched_t    *ksnd_schedulers;      /* their state */
 
         atomic_t          ksnd_npeers;          /* total # peers extant */
         atomic_t          ksnd_nclosing_conns;  /* # closed conns extant */
@@ -186,11 +214,14 @@ typedef struct {
         spinlock_t        ksnd_autoconnectd_lock; /* serialise */
 
         ksock_irqinfo_t   ksnd_irqinfo[NR_IRQS];/* irq->scheduler lookup */
+
+        int               ksnd_ninterfaces;
+        ksock_interface_t ksnd_interfaces[SOCKNAL_MAX_INTERFACES]; /* published interfaces */
 } ksock_nal_data_t;
 
 #define SOCKNAL_INIT_NOTHING    0
 #define SOCKNAL_INIT_DATA       1
-#define SOCKNAL_INIT_PTL        2
+#define SOCKNAL_INIT_LIB        2
 #define SOCKNAL_INIT_ALL        3
 
 /* A packet just assembled for transmission is represented by 1 or more
@@ -286,6 +317,7 @@ typedef struct ksock_conn
         void               *ksnc_saved_write_space; /* socket's original write_space() callback */
         atomic_t            ksnc_refcount;      /* # users */
         ksock_sched_t     *ksnc_scheduler;     /* who schedules this connection */
+        __u32               ksnc_myipaddr;      /* my IP */
         __u32               ksnc_ipaddr;        /* peer's IP */
         int                 ksnc_port;          /* peer's port */
         int                 ksnc_closing;       /* being shut down */
@@ -313,6 +345,7 @@ typedef struct ksock_conn
         struct list_head    ksnc_tx_list;       /* where I enq waiting for output space */
         struct list_head    ksnc_tx_queue;      /* packets waiting to be sent */
         unsigned long       ksnc_tx_deadline;   /* when (in jiffies) tx times out */
+        int                 ksnc_tx_bufnob;     /* send buffer marker */
         atomic_t            ksnc_tx_nob;        /* # bytes queued */
         int                 ksnc_tx_ready;      /* write space */
         int                 ksnc_tx_scheduled;  /* being progressed */
@@ -328,17 +361,15 @@ typedef struct ksock_route
         struct list_head    ksnr_connect_list;  /* chain on autoconnect list */
         struct ksock_peer  *ksnr_peer;          /* owning peer */
         atomic_t            ksnr_refcount;      /* # users */
-        int                 ksnr_sharecount;    /* lconf usage counter */
         unsigned long       ksnr_timeout;       /* when (in jiffies) reconnection can happen next */
         unsigned int        ksnr_retry_interval; /* how long between retries */
-        __u32               ksnr_ipaddr;        /* an IP address for this peer */
+        __u32               ksnr_myipaddr;      /* my IP */
+        __u32               ksnr_ipaddr;        /* IP address to connect to */
         int                 ksnr_port;          /* port to connect to */
-        int                 ksnr_buffer_size;   /* size of socket buffers */
-        unsigned int        ksnr_irq_affinity:1; /* set affinity? */
-        unsigned int        ksnr_eager:1;       /* connect eagery? */
         unsigned int        ksnr_connecting:4;  /* autoconnects in progress by type */
         unsigned int        ksnr_connected:4;   /* connections established by type */
         unsigned int        ksnr_deleted:1;     /* been removed from peer? */
+        unsigned int        ksnr_share_count;   /* created explicitly? */
         int                 ksnr_conn_count;    /* # conns established by this route */
 } ksock_route_t;
 
@@ -347,31 +378,35 @@ typedef struct ksock_peer
         struct list_head    ksnp_list;          /* stash on global peer list */
         ptl_nid_t           ksnp_nid;           /* who's on the other end(s) */
         atomic_t            ksnp_refcount;      /* # users */
+        int                 ksnp_sharecount;    /* lconf usage counter */
         int                 ksnp_closing;       /* being closed */
         int                 ksnp_error;         /* errno on closing last conn */
         struct list_head    ksnp_conns;         /* all active connections */
         struct list_head    ksnp_routes;        /* routes */
         struct list_head    ksnp_tx_queue;      /* waiting packets */
         unsigned long       ksnp_last_alive;    /* when (in jiffies) I was last alive */
+        int                 ksnp_n_passive_ips; /* # of... */
+        __u32               ksnp_passive_ips[SOCKNAL_MAX_INTERFACES]; /* preferred local interfaces */
 } ksock_peer_t;
 
 
-extern nal_cb_t         ksocknal_lib;
+extern lib_nal_t        ksocknal_lib;
 extern ksock_nal_data_t ksocknal_data;
+extern ksock_tunables_t ksocknal_tunables;
 
 static inline struct list_head *
-ksocknal_nid2peerlist (ptl_nid_t nid) 
+ksocknal_nid2peerlist (ptl_nid_t nid)
 {
         unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size;
-        
+
         return (&ksocknal_data.ksnd_peers [hash]);
 }
 
 static inline int
-ksocknal_getconnsock (ksock_conn_t *conn) 
+ksocknal_getconnsock (ksock_conn_t *conn)
 {
         int   rc = -ESHUTDOWN;
-        
+
         read_lock (&ksocknal_data.ksnd_global_lock);
         if (!conn->ksnc_closing) {
                 rc = 0;
@@ -389,7 +424,7 @@ ksocknal_putconnsock (ksock_conn_t *conn)
 }
 
 #ifndef CONFIG_SMP
-static inline 
+static inline
 int ksocknal_nsched(void)
 {
         return 1;
@@ -414,7 +449,7 @@ ksocknal_irqsched2cpu(int i)
 {
         return i;
 }
-# else 
+# else
 static inline int
 ksocknal_nsched(void)
 {
@@ -431,16 +466,13 @@ ksocknal_sched2cpu(int i)
 {
         if (smp_num_siblings == 1)
                 return i;
-        
+
         return (i * 2);
 }
 
 static inline int
 ksocknal_irqsched2cpu(int i)
 {
-        if (smp_num_siblings == 1)
-                return ksocknal_sched2cpu(i);
-
         return (ksocknal_sched2cpu(i) + 1);
 }
 # endif
@@ -453,7 +485,7 @@ extern ksock_peer_t *ksocknal_get_peer (ptl_nid_t nid);
 extern int ksocknal_del_route (ptl_nid_t nid, __u32 ipaddr,
                                int single, int keep_conn);
 extern int ksocknal_create_conn (ksock_route_t *route,
-                                 struct socket *sock, int bind_irq, int type);
+                                 struct socket *sock, int type);
 extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why);
 extern void ksocknal_terminate_conn (ksock_conn_t *conn);
 extern void ksocknal_destroy_conn (ksock_conn_t *conn);
@@ -474,6 +506,9 @@ extern void ksocknal_data_ready(struct sock *sk, int n);
 extern void ksocknal_write_space(struct sock *sk);
 extern int ksocknal_autoconnectd (void *arg);
 extern int ksocknal_reaper (void *arg);
+extern int ksocknal_get_conn_tunables (ksock_conn_t *conn, int *txmem, 
+                                       int *rxmem, int *nagle);
 extern int ksocknal_setup_sock (struct socket *sock);
-extern int ksocknal_hello (struct socket *sock, 
-                           ptl_nid_t *nid, int *type, __u64 *incarnation);
+extern int ksocknal_send_hello (ksock_conn_t *conn, __u32 *ipaddrs, int nipaddrs);
+extern int ksocknal_recv_hello (ksock_conn_t *conn,
+                                ptl_nid_t *nid, __u64 *incarnation, __u32 *ipaddrs);
index f6ac855..762133e 100644 (file)
  *  LIB functions follow
  *
  */
-ptl_err_t
-ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr,
-              user_ptr src_addr, size_t len)
-{
-        CDEBUG(D_NET, LPX64": reading %ld bytes from %p -> %p\n",
-               nal->ni.nid, (long)len, src_addr, dst_addr);
-
-        memcpy( dst_addr, src_addr, len );
-        return PTL_OK;
-}
-
-ptl_err_t
-ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr,
-               void *src_addr, size_t len)
-{
-        CDEBUG(D_NET, LPX64": writing %ld bytes from %p -> %p\n",
-               nal->ni.nid, (long)len, src_addr, dst_addr);
-
-        memcpy( dst_addr, src_addr, len );
-        return PTL_OK;
-}
-
-void *
-ksocknal_malloc(nal_cb_t *nal, size_t len)
-{
-        void *buf;
-
-        PORTAL_ALLOC(buf, len);
-
-        if (buf != NULL)
-                memset(buf, 0, len);
-
-        return (buf);
-}
-
-void
-ksocknal_free(nal_cb_t *nal, void *buf, size_t len)
-{
-        PORTAL_FREE(buf, len);
-}
-
-void
-ksocknal_printf(nal_cb_t *nal, const char *fmt, ...)
-{
-        va_list ap;
-        char msg[256];
-
-        va_start (ap, fmt);
-        vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */
-        va_end (ap);
-
-        msg[sizeof (msg) - 1] = 0;              /* ensure terminated */
-
-        CDEBUG (D_NET, "%s", msg);
-}
-
-void
-ksocknal_cli(nal_cb_t *nal, unsigned long *flags)
-{
-        ksock_nal_data_t *data = nal->nal_data;
-
-        /* OK to ignore 'flags'; we're only ever serialise threads and
-         * never need to lock out interrupts */
-        spin_lock(&data->ksnd_nal_cb_lock);
-}
-
-void
-ksocknal_sti(nal_cb_t *nal, unsigned long *flags)
-{
-        ksock_nal_data_t *data;
-        data = nal->nal_data;
-
-        spin_unlock(&data->ksnd_nal_cb_lock);
-}
-
 int
-ksocknal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+ksocknal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
 {
         /* I would guess that if ksocknal_get_peer (nid) == NULL,
            and we're not routing, then 'nid' is very distant :) */
-        if ( nal->ni.nid == nid ) {
+        if (nal->libnal_ni.ni_pid.nid == nid) {
                 *dist = 0;
         } else {
                 *dist = 1;
@@ -251,7 +176,7 @@ ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
         LASSERT (tx->tx_nkiov > 0);
 
 #if SOCKNAL_ZC
-        if (fragsize >= ksocknal_data.ksnd_zc_min_frag &&
+        if (fragsize >= ksocknal_tunables.ksnd_zc_min_frag &&
             (sock->sk->route_caps & NETIF_F_SG) &&
             (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM))) {
 
@@ -304,6 +229,7 @@ int
 ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
 {
         int      rc;
+        int      bufnob;
         
         if (ksocknal_data.ksnd_stall_tx != 0) {
                 set_current_state (TASK_UNINTERRUPTIBLE);
@@ -329,6 +255,20 @@ ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
                         rc = ksocknal_send_kiov (conn, tx);
                 }
 
+                bufnob = conn->ksnc_sock->sk->sk_wmem_queued;
+                if (rc > 0)                     /* sent something? */
+                        conn->ksnc_tx_bufnob += rc; /* account it */
+                
+                if (bufnob < conn->ksnc_tx_bufnob) {
+                        /* allocated send buffer bytes < computed; infer
+                         * something got ACKed */
+                        conn->ksnc_tx_deadline = jiffies + 
+                                                 ksocknal_tunables.ksnd_io_timeout * HZ;
+                        conn->ksnc_peer->ksnp_last_alive = jiffies;
+                        conn->ksnc_tx_bufnob = bufnob;
+                        mb();
+                }
+
                 if (rc <= 0) {
                         /* Didn't write anything.
                          *
@@ -361,18 +301,10 @@ ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
                         break;
                 }
 
+                /* socket's wmem_queued now includes 'rc' bytes */
+                atomic_sub (rc, &conn->ksnc_tx_nob);
                 rc = 0;
 
-                /* Consider the connection alive since we managed to chuck
-                 * more data into it.  Really, we'd like to consider it
-                 * alive only when the peer ACKs something, but
-                 * write_space() only gets called back while SOCK_NOSPACE
-                 * is set.  Instead, we presume peer death has occurred if
-                 * the socket doesn't drain within a timout */
-                conn->ksnc_tx_deadline = jiffies + 
-                                         ksocknal_data.ksnd_io_timeout * HZ;
-                conn->ksnc_peer->ksnp_last_alive = jiffies;
-
         } while (tx->tx_resid != 0);
 
         ksocknal_putconnsock (conn);
@@ -433,7 +365,7 @@ ksocknal_recv_iov (ksock_conn_t *conn)
         /* received something... */
         conn->ksnc_peer->ksnp_last_alive = jiffies;
         conn->ksnc_rx_deadline = jiffies + 
-                                 ksocknal_data.ksnd_io_timeout * HZ;
+                                 ksocknal_tunables.ksnd_io_timeout * HZ;
         mb();                           /* order with setting rx_started */
         conn->ksnc_rx_started = 1;
 
@@ -492,7 +424,7 @@ ksocknal_recv_kiov (ksock_conn_t *conn)
         /* received something... */
         conn->ksnc_peer->ksnp_last_alive = jiffies;
         conn->ksnc_rx_deadline = jiffies + 
-                                 ksocknal_data.ksnd_io_timeout * HZ;
+                                 ksocknal_tunables.ksnd_io_timeout * HZ;
         mb();                           /* order with setting rx_started */
         conn->ksnc_rx_started = 1;
 
@@ -551,7 +483,7 @@ ksocknal_receive (ksock_conn_t *conn)
 
                 if (conn->ksnc_rx_nob_wanted == 0) {
                         /* Completed a message segment (header or payload) */
-                        if ((ksocknal_data.ksnd_eager_ack & conn->ksnc_type) != 0 &&
+                        if ((ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0 &&
                             (conn->ksnc_rx_state ==  SOCKNAL_RX_BODY ||
                              conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD)) {
                                 /* Remind the socket to ack eagerly... */
@@ -594,8 +526,6 @@ ksocknal_tx_done (ksock_tx_t *tx, int asynch)
         ENTRY;
 
         if (tx->tx_conn != NULL) {
-                /* This tx got queued on a conn; do the accounting... */
-                atomic_sub (tx->tx_nob, &tx->tx_conn->ksnc_tx_nob);
 #if SOCKNAL_ZC
                 /* zero copy completion isn't always from
                  * process_transmit() so it needs to keep a ref on
@@ -710,9 +640,9 @@ ksocknal_launch_autoconnect_locked (ksock_route_t *route)
         LASSERT (!route->ksnr_deleted);
         LASSERT ((route->ksnr_connected & (1 << SOCKNAL_CONN_ANY)) == 0);
         LASSERT ((route->ksnr_connected & KSNR_TYPED_ROUTES) != KSNR_TYPED_ROUTES);
-        LASSERT (!route->ksnr_connecting);
+        LASSERT (route->ksnr_connecting == 0);
         
-        if (ksocknal_data.ksnd_typed_conns)
+        if (ksocknal_tunables.ksnd_typed_conns)
                 route->ksnr_connecting = 
                         KSNR_TYPED_ROUTES & ~route->ksnr_connected;
         else
@@ -772,13 +702,16 @@ ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer)
         int               tnob  = 0;
         ksock_conn_t     *fallback = NULL;
         int               fnob     = 0;
+        ksock_conn_t     *conn;
 
-        /* Find the conn with the shortest tx queue */
         list_for_each (tmp, &peer->ksnp_conns) {
                 ksock_conn_t *c = list_entry(tmp, ksock_conn_t, ksnc_list);
+#if SOCKNAL_ROUND_ROBIN
+                const int     nob = 0;
+#else
                 int           nob = atomic_read(&c->ksnc_tx_nob) +
                                         c->ksnc_sock->sk->sk_wmem_queued;
-
+#endif
                 LASSERT (!c->ksnc_closing);
 
                 if (fallback == NULL || nob < fnob) {
@@ -786,7 +719,7 @@ ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer)
                         fnob     = nob;
                 }
 
-                if (!ksocknal_data.ksnd_typed_conns)
+                if (!ksocknal_tunables.ksnd_typed_conns)
                         continue;
 
                 switch (c->ksnc_type) {
@@ -797,11 +730,11 @@ ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer)
                 case SOCKNAL_CONN_BULK_IN:
                         continue;
                 case SOCKNAL_CONN_BULK_OUT:
-                        if (tx->tx_nob < ksocknal_data.ksnd_min_bulk)
+                        if (tx->tx_nob < ksocknal_tunables.ksnd_min_bulk)
                                 continue;
                         break;
                 case SOCKNAL_CONN_CONTROL:
-                        if (tx->tx_nob >= ksocknal_data.ksnd_min_bulk)
+                        if (tx->tx_nob >= ksocknal_tunables.ksnd_min_bulk)
                                 continue;
                         break;
                 }
@@ -813,7 +746,16 @@ ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer)
         }
 
         /* prefer the typed selection */
-        return ((typed != NULL) ? typed : fallback);
+        conn = (typed != NULL) ? typed : fallback;
+
+#if SOCKNAL_ROUND_ROBIN
+        if (conn != NULL) {
+                /* round-robin all else being equal */
+                list_del (&conn->ksnc_list);
+                list_add_tail (&conn->ksnc_list, &peer->ksnp_conns);
+        }
+#endif
+        return conn;
 }
 
 void
@@ -844,9 +786,14 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn)
 #endif
         spin_lock_irqsave (&sched->kss_lock, flags);
 
-        conn->ksnc_tx_deadline = jiffies + 
-                                 ksocknal_data.ksnd_io_timeout * HZ;
-        mb();                                   /* order with list_add_tail */
+        if (list_empty(&conn->ksnc_tx_queue) &&
+            conn->ksnc_sock->sk->sk_wmem_queued == 0) {
+                /* First packet starts the timeout */
+                conn->ksnc_tx_deadline = jiffies +
+                                         ksocknal_tunables.ksnd_io_timeout * HZ;
+                conn->ksnc_tx_bufnob = 0;
+                mb();    /* order with adding to tx_queue */
+        }
 
         list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue);
                 
@@ -868,42 +815,32 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
 {
         struct list_head  *tmp;
         ksock_route_t     *route;
-        ksock_route_t     *candidate = NULL;
-        int                found = 0;
         int                bits;
         
         list_for_each (tmp, &peer->ksnp_routes) {
                 route = list_entry (tmp, ksock_route_t, ksnr_list);
                 bits  = route->ksnr_connected;
-                
-                if ((bits & KSNR_TYPED_ROUTES) == KSNR_TYPED_ROUTES ||
-                    (bits & (1 << SOCKNAL_CONN_ANY)) != 0 ||
-                    route->ksnr_connecting != 0) {
-                        /* All typed connections have been established, or
-                         * an untyped connection has been established, or
-                         * connections are currently being established */
-                        found = 1;
+
+                /* All typed connections established? */
+                if ((bits & KSNR_TYPED_ROUTES) == KSNR_TYPED_ROUTES)
+                        continue;
+
+                /* Untyped connection established? */
+                if ((bits & (1 << SOCKNAL_CONN_ANY)) != 0)
+                        continue;
+
+                /* connection being established? */
+                if (route->ksnr_connecting != 0)
                         continue;
-                }
 
                 /* too soon to retry this guy? */
                 if (!time_after_eq (jiffies, route->ksnr_timeout))
                         continue;
                 
-                /* always do eager routes */
-                if (route->ksnr_eager)
-                        return (route);
-
-                if (candidate == NULL) {
-                        /* If we don't find any other route that is fully
-                         * connected or connecting, the first connectable
-                         * route is returned.  If it fails to connect, it
-                         * will get placed at the end of the list */
-                        candidate = route;
-                }
+                return (route);
         }
-        return (found ? NULL : candidate);
+        
+        return (NULL);
 }
 
 ksock_route_t *
@@ -951,8 +888,9 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid)
         tx->tx_hdr = (ptl_hdr_t *)tx->tx_iov[0].iov_base;
 
         g_lock = &ksocknal_data.ksnd_global_lock;
+#if !SOCKNAL_ROUND_ROBIN
         read_lock (g_lock);
-        
+
         peer = ksocknal_find_target_peer_locked (tx, nid);
         if (peer == NULL) {
                 read_unlock (g_lock);
@@ -969,19 +907,17 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid)
                         return (0);
                 }
         }
-        
-        /* Making one or more connections; I'll need a write lock... */
-
-        atomic_inc (&peer->ksnp_refcount);      /* +1 ref for me while I unlock */
+        /* I'll need a write lock... */
         read_unlock (g_lock);
-        write_lock_irqsave (g_lock, flags);
-        
-        if (peer->ksnp_closing) {               /* peer deleted as I blocked! */
-                write_unlock_irqrestore (g_lock, flags);
-                ksocknal_put_peer (peer);
+#endif
+        write_lock_irqsave(g_lock, flags);
+
+        peer = ksocknal_find_target_peer_locked (tx, nid);
+        if (peer == NULL) {
+                write_unlock_irqrestore(g_lock, flags);
                 return (-EHOSTUNREACH);
         }
-        ksocknal_put_peer (peer);               /* drop ref I got above */
 
         for (;;) {
                 /* launch any/all autoconnections that need it */
@@ -1014,7 +950,7 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid)
 }
 
 ptl_err_t
-ksocknal_sendmsg(nal_cb_t     *nal, 
+ksocknal_sendmsg(lib_nal_t     *nal, 
                  void         *private, 
                  lib_msg_t    *cookie,
                  ptl_hdr_t    *hdr, 
@@ -1063,7 +999,7 @@ ksocknal_sendmsg(nal_cb_t     *nal,
         if (ltx == NULL) {
                 CERROR("Can't allocate tx desc type %d size %d %s\n",
                        type, desc_size, in_interrupt() ? "(intr)" : "");
-                return (PTL_NOSPACE);
+                return (PTL_NO_SPACE);
         }
 
         atomic_inc(&ksocknal_data.ksnd_nactive_ltxs);
@@ -1111,7 +1047,7 @@ ksocknal_sendmsg(nal_cb_t     *nal,
 }
 
 ptl_err_t
-ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie,
+ksocknal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
                ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
                unsigned int payload_niov, struct iovec *payload_iov,
                size_t payload_offset, size_t payload_len)
@@ -1123,7 +1059,7 @@ ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie,
 }
 
 ptl_err_t
-ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie, 
+ksocknal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
                      ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
                      unsigned int payload_niov, ptl_kiov_t *payload_kiov, 
                      size_t payload_offset, size_t payload_len)
@@ -1145,7 +1081,7 @@ ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
                 fwd->kprfd_gateway_nid, fwd->kprfd_target_nid);
 
         /* I'm the gateway; must be the last hop */
-        if (nid == ksocknal_lib.ni.nid)
+        if (nid == ksocknal_lib.libnal_ni.ni_pid.nid)
                 nid = fwd->kprfd_target_nid;
 
         /* setup iov for hdr */
@@ -1167,19 +1103,26 @@ ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
 int
 ksocknal_thread_start (int (*fn)(void *arg), void *arg)
 {
-        long    pid = kernel_thread (fn, arg, 0);
+        long          pid = kernel_thread (fn, arg, 0);
+        unsigned long flags;
 
         if (pid < 0)
                 return ((int)pid);
 
-        atomic_inc (&ksocknal_data.ksnd_nthreads);
+        write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags);
+        ksocknal_data.ksnd_nthreads++;
+        write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags);
         return (0);
 }
 
 void
 ksocknal_thread_fini (void)
 {
-        atomic_dec (&ksocknal_data.ksnd_nthreads);
+        unsigned long flags;
+
+        write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags);
+        ksocknal_data.ksnd_nthreads--;
+        write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags);
 }
 
 void
@@ -1197,14 +1140,14 @@ ksocknal_fmb_callback (void *arg, int error)
         if (error != 0)
                 CERROR("Failed to route packet from "
                        LPX64" %s to "LPX64" %s: %d\n",
-                       NTOH__u64(hdr->src_nid),
-                       portals_nid2str(SOCKNAL, NTOH__u64(hdr->src_nid), ipbuf),
-                       NTOH__u64(hdr->dest_nid),
-                       portals_nid2str(SOCKNAL, NTOH__u64(hdr->dest_nid), ipbuf2),
+                       le64_to_cpu(hdr->src_nid),
+                       portals_nid2str(SOCKNAL, le64_to_cpu(hdr->src_nid), ipbuf),
+                       le64_to_cpu(hdr->dest_nid),
+                       portals_nid2str(SOCKNAL, le64_to_cpu(hdr->dest_nid), ipbuf2),
                        error);
         else
                 CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": OK\n",
-                        NTOH__u64 (hdr->src_nid), NTOH__u64 (hdr->dest_nid));
+                        le64_to_cpu(hdr->src_nid), le64_to_cpu(hdr->dest_nid));
 
         /* drop peer ref taken on init */
         ksocknal_put_peer (fmb->fmb_peer);
@@ -1284,7 +1227,7 @@ int
 ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb)
 {
         int       payload_nob = conn->ksnc_rx_nob_left;
-        ptl_nid_t dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid);
+        ptl_nid_t dest_nid = le64_to_cpu(conn->ksnc_hdr.dest_nid);
         int       niov = 0;
         int       nob = payload_nob;
 
@@ -1321,7 +1264,7 @@ ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb)
 
         if (payload_nob == 0) {         /* got complete packet already */
                 CDEBUG (D_NET, "%p "LPX64"->"LPX64" fwd_start (immediate)\n",
-                        conn, NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid);
+                        conn, le64_to_cpu(conn->ksnc_hdr.src_nid), dest_nid);
 
                 kpr_fwd_start (&ksocknal_data.ksnd_router, &fmb->fmb_fwd);
 
@@ -1342,7 +1285,7 @@ ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb)
         memcpy(conn->ksnc_rx_kiov, fmb->fmb_kiov, niov * sizeof(ptl_kiov_t));
         
         CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d reading body\n", conn,
-                NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid, payload_nob);
+                le64_to_cpu(conn->ksnc_hdr.src_nid), dest_nid, payload_nob);
         return (0);
 }
 
@@ -1350,9 +1293,9 @@ void
 ksocknal_fwd_parse (ksock_conn_t *conn)
 {
         ksock_peer_t *peer;
-        ptl_nid_t     dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid);
-        ptl_nid_t     src_nid = NTOH__u64 (conn->ksnc_hdr.src_nid);
-        int           body_len = NTOH__u32 (conn->ksnc_hdr.payload_length);
+        ptl_nid_t     dest_nid = le64_to_cpu(conn->ksnc_hdr.dest_nid);
+        ptl_nid_t     src_nid = le64_to_cpu(conn->ksnc_hdr.src_nid);
+        int           body_len = le32_to_cpu(conn->ksnc_hdr.payload_length);
         char str[PTL_NALFMT_SIZE];
         char str2[PTL_NALFMT_SIZE];
 
@@ -1529,8 +1472,9 @@ ksocknal_process_receive (ksock_conn_t *conn)
         
         switch (conn->ksnc_rx_state) {
         case SOCKNAL_RX_HEADER:
-                if (conn->ksnc_hdr.type != HTON__u32(PTL_MSG_HELLO) &&
-                    NTOH__u64(conn->ksnc_hdr.dest_nid) != ksocknal_lib.ni.nid) {
+                if (conn->ksnc_hdr.type != cpu_to_le32(PTL_MSG_HELLO) &&
+                    le64_to_cpu(conn->ksnc_hdr.dest_nid) != 
+                    ksocknal_lib.libnal_ni.ni_pid.nid) {
                         /* This packet isn't for me */
                         ksocknal_fwd_parse (conn);
                         switch (conn->ksnc_rx_state) {
@@ -1547,7 +1491,13 @@ ksocknal_process_receive (ksock_conn_t *conn)
                 }
 
                 /* sets wanted_len, iovs etc */
-                lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn);
+                rc = lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn);
+
+                if (rc != PTL_OK) {
+                        /* I just received garbage: give up on this conn */
+                        ksocknal_close_conn_and_siblings (conn, rc);
+                        return (-EPROTO);
+                }
 
                 if (conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */
                         conn->ksnc_rx_state = SOCKNAL_RX_BODY;
@@ -1569,8 +1519,8 @@ ksocknal_process_receive (ksock_conn_t *conn)
         case SOCKNAL_RX_BODY_FWD:
                 /* payload all received */
                 CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n",
-                        conn, NTOH__u64 (conn->ksnc_hdr.src_nid),
-                        NTOH__u64 (conn->ksnc_hdr.dest_nid),
+                        conn, le64_to_cpu(conn->ksnc_hdr.src_nid),
+                        le64_to_cpu(conn->ksnc_hdr.dest_nid),
                         conn->ksnc_rx_nob_left);
 
                 /* forward the packet. NB ksocknal_init_fmb() put fmb into
@@ -1594,7 +1544,7 @@ ksocknal_process_receive (ksock_conn_t *conn)
 }
 
 ptl_err_t
-ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg,
+ksocknal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
                unsigned int niov, struct iovec *iov, 
                size_t offset, size_t mlen, size_t rlen)
 {
@@ -1622,7 +1572,7 @@ ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg,
 }
 
 ptl_err_t
-ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg,
+ksocknal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
                      unsigned int niov, ptl_kiov_t *kiov, 
                      size_t offset, size_t mlen, size_t rlen)
 {
@@ -1649,6 +1599,25 @@ ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg,
         return (PTL_OK);
 }
 
+static inline int
+ksocknal_sched_cansleep(ksock_sched_t *sched)
+{
+        unsigned long flags;
+        int           rc;
+
+        spin_lock_irqsave(&sched->kss_lock, flags);
+
+        rc = (!ksocknal_data.ksnd_shuttingdown &&
+#if SOCKNAL_ZC
+              list_empty(&sched->kss_zctxdone_list) &&
+#endif
+              list_empty(&sched->kss_rx_conns) &&
+              list_empty(&sched->kss_tx_conns));
+        
+        spin_unlock_irqrestore(&sched->kss_lock, flags);
+        return (rc);
+}
+
 int ksocknal_scheduler (void *arg)
 {
         ksock_sched_t     *sched = (ksock_sched_t *)arg;
@@ -1665,12 +1634,13 @@ int ksocknal_scheduler (void *arg)
         kportal_blockallsigs ();
 
 #if (CONFIG_SMP && CPU_AFFINITY)
+        id = ksocknal_sched2cpu(id);
         if (cpu_online(id)) {
                 cpumask_t m;
                 cpu_set(id, m);
                 set_cpus_allowed(current, m);
         } else {
-                CERROR ("Can't set CPU affinity for %s\n", name);
+                CERROR ("Can't set CPU affinity for %s to %d\n", name, id);
         }
 #endif /* CONFIG_SMP && CPU_AFFINITY */
         
@@ -1798,18 +1768,8 @@ int ksocknal_scheduler (void *arg)
                         nloops = 0;
 
                         if (!did_something) {   /* wait for something to do */
-#if SOCKNAL_ZC
                                 rc = wait_event_interruptible (sched->kss_waitq,
-                                                               ksocknal_data.ksnd_shuttingdown ||
-                                                               !list_empty(&sched->kss_rx_conns) ||
-                                                               !list_empty(&sched->kss_tx_conns) ||
-                                                               !list_empty(&sched->kss_zctxdone_list));
-#else
-                                rc = wait_event_interruptible (sched->kss_waitq,
-                                                               ksocknal_data.ksnd_shuttingdown ||
-                                                               !list_empty(&sched->kss_rx_conns) ||
-                                                               !list_empty(&sched->kss_tx_conns));
-#endif
+                                                               !ksocknal_sched_cansleep(sched));
                                 LASSERT (rc == 0);
                         } else
                                our_cond_resched();
@@ -1997,133 +1957,245 @@ ksocknal_sock_read (struct socket *sock, void *buffer, int nob)
 }
 
 int
-ksocknal_hello (struct socket *sock, ptl_nid_t *nid, int *type,
-                __u64 *incarnation)
+ksocknal_send_hello (ksock_conn_t *conn, __u32 *ipaddrs, int nipaddrs)
 {
-        int                 rc;
+        /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */
+        struct socket      *sock = conn->ksnc_sock;
         ptl_hdr_t           hdr;
         ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid;
-        char                ipbuf[PTL_NALFMT_SIZE];
-        char                ipbuf2[PTL_NALFMT_SIZE];
+        int                 i;
+        int                 rc;
 
-        LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
+        LASSERT (conn->ksnc_type != SOCKNAL_CONN_NONE);
+        LASSERT (nipaddrs <= SOCKNAL_MAX_INTERFACES);
 
-        memset (&hdr, 0, sizeof (hdr));
-        hmv->magic         = __cpu_to_le32 (PORTALS_PROTO_MAGIC);
-        hmv->version_major = __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR);
-        hmv->version_minor = __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR);
+        /* No need for getconnsock/putconnsock */
+        LASSERT (!conn->ksnc_closing);
+
+        LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
+        hmv->magic         = cpu_to_le32 (PORTALS_PROTO_MAGIC);
+        hmv->version_major = cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR);
+        hmv->version_minor = cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR);
 
-        hdr.src_nid = __cpu_to_le64 (ksocknal_lib.ni.nid);
-        hdr.type    = __cpu_to_le32 (PTL_MSG_HELLO);
+        hdr.src_nid        = cpu_to_le64 (ksocknal_lib.libnal_ni.ni_pid.nid);
+        hdr.type           = cpu_to_le32 (PTL_MSG_HELLO);
+        hdr.payload_length = cpu_to_le32 (nipaddrs * sizeof(*ipaddrs));
 
-        hdr.msg.hello.type = __cpu_to_le32 (*type);
+        hdr.msg.hello.type = cpu_to_le32 (conn->ksnc_type);
         hdr.msg.hello.incarnation =
-                __cpu_to_le64 (ksocknal_data.ksnd_incarnation);
+                cpu_to_le64 (ksocknal_data.ksnd_incarnation);
 
-        /* Assume sufficient socket buffering for this message */
-        rc = ksocknal_sock_write (sock, &hdr, sizeof (hdr));
+        /* Receiver is eager */
+        rc = ksocknal_sock_write (sock, &hdr, sizeof(hdr));
         if (rc != 0) {
-                CERROR ("Error %d sending HELLO to "LPX64" %s\n",
-                        rc, *nid, portals_nid2str(SOCKNAL, *nid, ipbuf));
+                CERROR ("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n",
+                        rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
                 return (rc);
         }
+        
+        if (nipaddrs == 0)
+                return (0);
+        
+        for (i = 0; i < nipaddrs; i++) {
+                ipaddrs[i] = __cpu_to_le32 (ipaddrs[i]);
+        }
+
+        rc = ksocknal_sock_write (sock, ipaddrs, nipaddrs * sizeof(*ipaddrs));
+        if (rc != 0)
+                CERROR ("Error %d sending HELLO payload (%d)"
+                        " to %u.%u.%u.%u/%d\n", rc, nipaddrs, 
+                        HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+        return (rc);
+}
+
+int
+ksocknal_invert_type(int type)
+{
+        switch (type)
+        {
+        case SOCKNAL_CONN_ANY:
+        case SOCKNAL_CONN_CONTROL:
+                return (type);
+        case SOCKNAL_CONN_BULK_IN:
+                return SOCKNAL_CONN_BULK_OUT;
+        case SOCKNAL_CONN_BULK_OUT:
+                return SOCKNAL_CONN_BULK_IN;
+        default:
+                return (SOCKNAL_CONN_NONE);
+        }
+}
+
+int
+ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid,
+                     __u64 *incarnation, __u32 *ipaddrs)
+{
+        struct socket      *sock = conn->ksnc_sock;
+        int                 rc;
+        int                 nips;
+        int                 i;
+        int                 type;
+        ptl_hdr_t           hdr;
+        ptl_magicversion_t *hmv;
+
+        hmv = (ptl_magicversion_t *)&hdr.dest_nid;
+        LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
 
         rc = ksocknal_sock_read (sock, hmv, sizeof (*hmv));
         if (rc != 0) {
-                CERROR ("Error %d reading HELLO from "LPX64" %s\n",
-                        rc, *nid, portals_nid2str(SOCKNAL, *nid, ipbuf));
+                CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+                        rc, HIPQUAD(conn->ksnc_ipaddr));
                 return (rc);
         }
 
-        if (hmv->magic != __le32_to_cpu (PORTALS_PROTO_MAGIC)) {
-                CERROR ("Bad magic %#08x (%#08x expected) from "LPX64" %s\n",
-                        __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC, *nid,
-                        portals_nid2str(SOCKNAL, *nid, ipbuf));
+        if (hmv->magic != le32_to_cpu (PORTALS_PROTO_MAGIC)) {
+                CERROR ("Bad magic %#08x (%#08x expected) from %u.%u.%u.%u\n",
+                        __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC,
+                        HIPQUAD(conn->ksnc_ipaddr));
                 return (-EPROTO);
         }
 
-        if (hmv->version_major != __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR) ||
-            hmv->version_minor != __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR)) {
+        if (hmv->version_major != cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR) ||
+            hmv->version_minor != cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR)) {
                 CERROR ("Incompatible protocol version %d.%d (%d.%d expected)"
-                        " from "LPX64" %s\n",
-                        __le16_to_cpu (hmv->version_major),
-                        __le16_to_cpu (hmv->version_minor),
+                        " from %u.%u.%u.%u\n",
+                        le16_to_cpu (hmv->version_major),
+                        le16_to_cpu (hmv->version_minor),
                         PORTALS_PROTO_VERSION_MAJOR,
                         PORTALS_PROTO_VERSION_MINOR,
-                        *nid, portals_nid2str(SOCKNAL, *nid, ipbuf));
+                        HIPQUAD(conn->ksnc_ipaddr));
                 return (-EPROTO);
         }
 
-#if (PORTALS_PROTO_VERSION_MAJOR != 0)
-# error "This code only understands protocol version 0.x"
+#if (PORTALS_PROTO_VERSION_MAJOR != 1)
+# error "This code only understands protocol version 1.x"
 #endif
-        /* version 0 sends magic/version as the dest_nid of a 'hello' header,
-         * so read the rest of it in now... */
+        /* version 1 sends magic/version as the dest_nid of a 'hello'
+         * header, followed by payload full of interface IP addresses.
+         * Read the rest of it in now... */
 
         rc = ksocknal_sock_read (sock, hmv + 1, sizeof (hdr) - sizeof (*hmv));
         if (rc != 0) {
-                CERROR ("Error %d reading rest of HELLO hdr from "LPX64" %s\n",
-                        rc, *nid, portals_nid2str(SOCKNAL, *nid, ipbuf));
+                CERROR ("Error %d reading rest of HELLO hdr from %u.%u.%u.%u\n",
+                        rc, HIPQUAD(conn->ksnc_ipaddr));
                 return (rc);
         }
 
         /* ...and check we got what we expected */
-        if (hdr.type != __cpu_to_le32 (PTL_MSG_HELLO) ||
-            hdr.payload_length != __cpu_to_le32 (0)) {
-                CERROR ("Expecting a HELLO hdr with 0 payload,"
-                        " but got type %d with %d payload from "LPX64" %s\n",
-                        __le32_to_cpu (hdr.type),
-                        __le32_to_cpu (hdr.payload_length), *nid,
-                        portals_nid2str(SOCKNAL, *nid, ipbuf));
+        if (hdr.type != cpu_to_le32 (PTL_MSG_HELLO)) {
+                CERROR ("Expecting a HELLO hdr,"
+                        " but got type %d from %u.%u.%u.%u\n",
+                        le32_to_cpu (hdr.type),
+                        HIPQUAD(conn->ksnc_ipaddr));
                 return (-EPROTO);
         }
 
-        if (__le64_to_cpu(hdr.src_nid) == PTL_NID_ANY) {
-                CERROR("Expecting a HELLO hdr with a NID, but got PTL_NID_ANY\n");
+        if (le64_to_cpu(hdr.src_nid) == PTL_NID_ANY) {
+                CERROR("Expecting a HELLO hdr with a NID, but got PTL_NID_ANY"
+                       "from %u.%u.%u.%u\n", HIPQUAD(conn->ksnc_ipaddr));
                 return (-EPROTO);
         }
 
         if (*nid == PTL_NID_ANY) {              /* don't know peer's nid yet */
-                *nid = __le64_to_cpu(hdr.src_nid);
-        } else if (*nid != __le64_to_cpu (hdr.src_nid)) {
-                CERROR ("Connected to nid "LPX64" %s, but expecting "LPX64" %s\n",
-                        __le64_to_cpu (hdr.src_nid),
-                        portals_nid2str(SOCKNAL,
-                                        __le64_to_cpu(hdr.src_nid),
-                                        ipbuf),
-                        *nid, portals_nid2str(SOCKNAL, *nid, ipbuf2));
+                *nid = le64_to_cpu(hdr.src_nid);
+        } else if (*nid != le64_to_cpu (hdr.src_nid)) {
+                CERROR ("Connected to nid "LPX64"@%u.%u.%u.%u "
+                        "but expecting "LPX64"\n",
+                        le64_to_cpu (hdr.src_nid),
+                        HIPQUAD(conn->ksnc_ipaddr), *nid);
                 return (-EPROTO);
         }
 
-        if (*type == SOCKNAL_CONN_NONE) {
+        type = __le32_to_cpu(hdr.msg.hello.type);
+
+        if (conn->ksnc_type == SOCKNAL_CONN_NONE) {
                 /* I've accepted this connection; peer determines type */
-                *type = __le32_to_cpu(hdr.msg.hello.type);
-                switch (*type) {
-                case SOCKNAL_CONN_ANY:
-                case SOCKNAL_CONN_CONTROL:
-                        break;
-                case SOCKNAL_CONN_BULK_IN:
-                        *type = SOCKNAL_CONN_BULK_OUT;
-                        break;
-                case SOCKNAL_CONN_BULK_OUT:
-                        *type = SOCKNAL_CONN_BULK_IN;
-                        break;
-                default:
-                        CERROR ("Unexpected type %d from "LPX64" %s\n",
-                                *type, *nid,
-                                portals_nid2str(SOCKNAL, *nid, ipbuf));
+                conn->ksnc_type = ksocknal_invert_type(type);
+                if (conn->ksnc_type == SOCKNAL_CONN_NONE) {
+                        CERROR ("Unexpected type %d from "LPX64"@%u.%u.%u.%u\n",
+                                type, *nid, HIPQUAD(conn->ksnc_ipaddr));
                         return (-EPROTO);
                 }
-        } else if (__le32_to_cpu(hdr.msg.hello.type) != SOCKNAL_CONN_NONE) {
-                CERROR ("Mismatched types: me %d "LPX64" %s %d\n",
-                        *type, *nid, portals_nid2str(SOCKNAL, *nid, ipbuf),
-                        __le32_to_cpu(hdr.msg.hello.type));
+        } else if (ksocknal_invert_type(type) != conn->ksnc_type) {
+                CERROR ("Mismatched types: me %d, "LPX64"@%u.%u.%u.%u %d\n",
+                        conn->ksnc_type, *nid, HIPQUAD(conn->ksnc_ipaddr),
+                        le32_to_cpu(hdr.msg.hello.type));
                 return (-EPROTO);
         }
 
-        *incarnation = __le64_to_cpu(hdr.msg.hello.incarnation);
+        *incarnation = le64_to_cpu(hdr.msg.hello.incarnation);
 
-        return (0);
+        nips = __le32_to_cpu (hdr.payload_length) / sizeof (__u32);
+
+        if (nips > SOCKNAL_MAX_INTERFACES ||
+            nips * sizeof(__u32) != __le32_to_cpu (hdr.payload_length)) {
+                CERROR("Bad payload length %d from "LPX64"@%u.%u.%u.%u\n",
+                       __le32_to_cpu (hdr.payload_length),
+                       *nid, HIPQUAD(conn->ksnc_ipaddr));
+        }
+
+        if (nips == 0)
+                return (0);
+        
+        rc = ksocknal_sock_read (sock, ipaddrs, nips * sizeof(*ipaddrs));
+        if (rc != 0) {
+                CERROR ("Error %d reading IPs from "LPX64"@%u.%u.%u.%u\n",
+                        rc, *nid, HIPQUAD(conn->ksnc_ipaddr));
+                return (rc);
+        }
+
+        for (i = 0; i < nips; i++) {
+                ipaddrs[i] = __le32_to_cpu(ipaddrs[i]);
+                
+                if (ipaddrs[i] == 0) {
+                        CERROR("Zero IP[%d] from "LPX64"@%u.%u.%u.%u\n",
+                               i, *nid, HIPQUAD(conn->ksnc_ipaddr));
+                        return (-EPROTO);
+                }
+        }
+
+        return (nips);
+}
+
+int
+ksocknal_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
+{
+        mm_segment_t   oldmm = get_fs ();
+        struct socket *sock = conn->ksnc_sock;
+        int            len;
+        int            rc;
+
+        rc = ksocknal_getconnsock (conn);
+        if (rc != 0) {
+                LASSERT (conn->ksnc_closing);
+                *txmem = *rxmem = *nagle = 0;
+                return (-ESHUTDOWN);
+        }
+        
+        set_fs (KERNEL_DS);
+
+        len = sizeof(*txmem);
+        rc = sock_getsockopt(sock, SOL_SOCKET, SO_SNDBUF,
+                             (char *)txmem, &len);
+        if (rc == 0) {
+                len = sizeof(*rxmem);
+                rc = sock_getsockopt(sock, SOL_SOCKET, SO_RCVBUF,
+                                     (char *)rxmem, &len);
+        }
+        if (rc == 0) {
+                len = sizeof(*nagle);
+                rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY,
+                                           (char *)nagle, &len);
+        }
+
+        set_fs (oldmm);
+        ksocknal_putconnsock (conn);
+
+        if (rc == 0)
+                *nagle = !*nagle;
+        else
+                *txmem = *rxmem = *nagle = 0;
+                
+        return (rc);
 }
 
 int
@@ -2132,13 +2204,13 @@ ksocknal_setup_sock (struct socket *sock)
         mm_segment_t    oldmm = get_fs ();
         int             rc;
         int             option;
+        int             keep_idle;
+        int             keep_intvl;
+        int             keep_count;
+        int             do_keepalive;
         struct linger   linger;
 
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
         sock->sk->sk_allocation = GFP_NOFS;
-#else
-        sock->sk->allocation = GFP_NOFS;
-#endif
 
         /* Ensure this socket aborts active sends immediately when we close
          * it. */
@@ -2165,55 +2237,95 @@ ksocknal_setup_sock (struct socket *sock)
                 return (rc);
         }
 
-#if SOCKNAL_USE_KEEPALIVES
-        /* Keepalives: If 3/4 of the timeout elapses, start probing every
-         * second until the timeout elapses. */
+        if (!ksocknal_tunables.ksnd_nagle) {
+                option = 1;
+                
+                set_fs (KERNEL_DS);
+                rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_NODELAY,
+                                            (char *)&option, sizeof (option));
+                set_fs (oldmm);
+                if (rc != 0) {
+                        CERROR ("Can't disable nagle: %d\n", rc);
+                        return (rc);
+                }
+        }
+        
+        if (ksocknal_tunables.ksnd_buffer_size > 0) {
+                option = ksocknal_tunables.ksnd_buffer_size;
+                
+                set_fs (KERNEL_DS);
+                rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDBUF,
+                                      (char *)&option, sizeof (option));
+                set_fs (oldmm);
+                if (rc != 0) {
+                        CERROR ("Can't set send buffer %d: %d\n",
+                                option, rc);
+                        return (rc);
+                }
+
+                set_fs (KERNEL_DS);
+                rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF,
+                                      (char *)&option, sizeof (option));
+                set_fs (oldmm);
+                if (rc != 0) {
+                        CERROR ("Can't set receive buffer %d: %d\n",
+                                option, rc);
+                        return (rc);
+                }
+        }
+
+        /* snapshot tunables */
+        keep_idle  = ksocknal_tunables.ksnd_keepalive_idle;
+        keep_count = ksocknal_tunables.ksnd_keepalive_count;
+        keep_intvl = ksocknal_tunables.ksnd_keepalive_intvl;
+        
+        do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
 
-        option = (ksocknal_data.ksnd_io_timeout * 3) / 4;
+        option = (do_keepalive ? 1 : 0);
         set_fs (KERNEL_DS);
-        rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE,
-                                    (char *)&option, sizeof (option));
+        rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE, 
+                              (char *)&option, sizeof (option));
         set_fs (oldmm);
         if (rc != 0) {
-                CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc);
+                CERROR ("Can't set SO_KEEPALIVE: %d\n", rc);
                 return (rc);
         }
-        
-        option = 1;
+
+        if (!do_keepalive)
+                return (0);
+
         set_fs (KERNEL_DS);
-        rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL,
-                                    (char *)&option, sizeof (option));
+        rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE,
+                                    (char *)&keep_idle, sizeof (keep_idle));
         set_fs (oldmm);
         if (rc != 0) {
-                CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc);
+                CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc);
                 return (rc);
         }
-        
-        option = ksocknal_data.ksnd_io_timeout / 4;
+
         set_fs (KERNEL_DS);
-        rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT,
-                                    (char *)&option, sizeof (option));
+        rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL,
+                                    (char *)&keep_intvl, sizeof (keep_intvl));
         set_fs (oldmm);
         if (rc != 0) {
                 CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc);
                 return (rc);
         }
 
-        option = 1;
         set_fs (KERNEL_DS);
-        rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE, 
-                              (char *)&option, sizeof (option));
+        rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT,
+                                    (char *)&keep_count, sizeof (keep_count));
         set_fs (oldmm);
         if (rc != 0) {
-                CERROR ("Can't set SO_KEEPALIVE: %d\n", rc);
+                CERROR ("Can't set TCP_KEEPCNT: %d\n", rc);
                 return (rc);
         }
-#endif
+
         return (0);
 }
 
 static int
-ksocknal_connect_sock(struct socket **sockp, int *may_retry,
+ksocknal_connect_sock(struct socket **sockp, int *may_retry, 
                       ksock_route_t *route, int local_port)
 {
         struct sockaddr_in  locaddr;
@@ -2227,7 +2339,9 @@ ksocknal_connect_sock(struct socket **sockp, int *may_retry,
         memset(&locaddr, 0, sizeof(locaddr)); 
         locaddr.sin_family = AF_INET; 
         locaddr.sin_port = htons(local_port);
-        locaddr.sin_addr.s_addr = INADDR_ANY;
+        locaddr.sin_addr.s_addr = 
+                (route->ksnr_myipaddr != 0) ? htonl(route->ksnr_myipaddr) 
+                                            : INADDR_ANY;
  
         memset (&srvaddr, 0, sizeof (srvaddr));
         srvaddr.sin_family = AF_INET;
@@ -2266,7 +2380,7 @@ ksocknal_connect_sock(struct socket **sockp, int *may_retry,
 
         /* Set the socket timeouts, so our connection attempt completes in
          * finite time */
-        tv.tv_sec = ksocknal_data.ksnd_io_timeout;
+        tv.tv_sec = ksocknal_tunables.ksnd_io_timeout;
         tv.tv_usec = 0;
 
         set_fs (KERNEL_DS);
@@ -2274,8 +2388,8 @@ ksocknal_connect_sock(struct socket **sockp, int *may_retry,
                               (char *)&tv, sizeof (tv));
         set_fs (oldmm);
         if (rc != 0) {
-                CERROR ("Can't set send timeout %d: %d\n",
-                        ksocknal_data.ksnd_io_timeout, rc);
+                CERROR ("Can't set send timeout %d: %d\n", 
+                        ksocknal_tunables.ksnd_io_timeout, rc);
                 goto failed;
         }
         
@@ -2285,12 +2399,12 @@ ksocknal_connect_sock(struct socket **sockp, int *may_retry,
         set_fs (oldmm);
         if (rc != 0) {
                 CERROR ("Can't set receive timeout %d: %d\n",
-                        ksocknal_data.ksnd_io_timeout, rc);
+                        ksocknal_tunables.ksnd_io_timeout, rc);
                 goto failed;
         }
 
-        option = 1;
         set_fs (KERNEL_DS);
+        option = 1;
         rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, 
                              (char *)&option, sizeof (option)); 
         set_fs (oldmm);
@@ -2298,29 +2412,6 @@ ksocknal_connect_sock(struct socket **sockp, int *may_retry,
                 CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc);
                 goto failed;
         }
-        
-        if (route->ksnr_buffer_size != 0) {
-                option = route->ksnr_buffer_size;
-                set_fs (KERNEL_DS);
-                rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDBUF,
-                                      (char *)&option, sizeof (option));
-                set_fs (oldmm);
-                if (rc != 0) {
-                        CERROR ("Can't set send buffer %d: %d\n",
-                                route->ksnr_buffer_size, rc);
-                        goto failed;
-                }
-
-                set_fs (KERNEL_DS);
-                rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF,
-                                      (char *)&option, sizeof (option));
-                set_fs (oldmm);
-                if (rc != 0) {
-                        CERROR ("Can't set receive buffer %d: %d\n",
-                                route->ksnr_buffer_size, rc);
-                        goto failed;
-                }
-        }
 
         rc = sock->ops->bind(sock, 
                              (struct sockaddr *)&locaddr, sizeof(locaddr));
@@ -2348,7 +2439,8 @@ ksocknal_connect_sock(struct socket **sockp, int *may_retry,
         *may_retry = (rc == -EADDRNOTAVAIL);
 
         CDEBUG(*may_retry ? D_NET : D_ERROR,
-               "Error %d connecting to %u.%u.%u.%u/%d\n", rc,
+               "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc,
+               HIPQUAD(route->ksnr_myipaddr), local_port,
                HIPQUAD(route->ksnr_ipaddr), route->ksnr_port);
 
  failed:
@@ -2374,12 +2466,11 @@ ksocknal_connect_peer (ksock_route_t *route, int type)
                 rc = ksocknal_connect_sock(&sock, &may_retry, route, port);
 
                 if (rc == 0) {
-                        rc = ksocknal_create_conn(route, sock,
-                                                  route->ksnr_irq_affinity, type);
+                        rc = ksocknal_create_conn(route, sock, type);
                         fput(sock->file);
                         return rc;
                 }
-                
+
                 if (!may_retry)
                         return rc;
         }
@@ -2405,7 +2496,6 @@ ksocknal_autoconnect (ksock_route_t *route)
                 LASSERT (type < SOCKNAL_CONN_NTYPES);
 
                 rc = ksocknal_connect_peer (route, type);
-
                 if (rc != 0)
                         break;
                 
@@ -2445,12 +2535,13 @@ ksocknal_autoconnect (ksock_route_t *route)
                 } while (!list_empty (&peer->ksnp_tx_queue));
         }
 
-        /* make this route least-favourite for re-selection */
+#if 0           /* irrelevent with only eager routes */
         if (!route->ksnr_deleted) {
+                /* make this route least-favourite for re-selection */
                 list_del(&route->ksnr_list);
                 list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
         }
-        
+#endif        
         write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
 
         while (!list_empty (&zombies)) {
@@ -2459,15 +2550,15 @@ ksocknal_autoconnect (ksock_route_t *route)
                 tx = list_entry (zombies.next, ksock_tx_t, tx_list);
 
                 CERROR ("Deleting packet type %d len %d ("LPX64" %s->"LPX64" %s)\n",
-                        NTOH__u32 (tx->tx_hdr->type),
-                        NTOH__u32 (tx->tx_hdr->payload_length),
-                        NTOH__u64 (tx->tx_hdr->src_nid),
+                        le32_to_cpu (tx->tx_hdr->type),
+                        le32_to_cpu (tx->tx_hdr->payload_length),
+                        le64_to_cpu (tx->tx_hdr->src_nid),
                         portals_nid2str(SOCKNAL,
-                                        NTOH__u64(tx->tx_hdr->src_nid),
+                                        le64_to_cpu(tx->tx_hdr->src_nid),
                                         ipbuf),
-                        NTOH__u64 (tx->tx_hdr->dest_nid),
+                        le64_to_cpu (tx->tx_hdr->dest_nid),
                         portals_nid2str(SOCKNAL,
-                                        NTOH__u64(tx->tx_hdr->src_nid),
+                                        le64_to_cpu(tx->tx_hdr->src_nid),
                                         ipbuf2));
 
                 list_del (&tx->tx_list);
@@ -2496,24 +2587,26 @@ ksocknal_autoconnectd (void *arg)
                 if (!list_empty (&ksocknal_data.ksnd_autoconnectd_routes)) {
                         route = list_entry (ksocknal_data.ksnd_autoconnectd_routes.next,
                                             ksock_route_t, ksnr_connect_list);
-                        
+
                         list_del (&route->ksnr_connect_list);
                         spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags);
 
                         ksocknal_autoconnect (route);
                         ksocknal_put_route (route);
 
-                        spin_lock_irqsave (&ksocknal_data.ksnd_autoconnectd_lock, flags);
+                        spin_lock_irqsave(&ksocknal_data.ksnd_autoconnectd_lock,
+                                          flags);
                         continue;
                 }
-                
-                spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags);
 
-                rc = wait_event_interruptible (ksocknal_data.ksnd_autoconnectd_waitq,
-                                               ksocknal_data.ksnd_shuttingdown ||
-                                               !list_empty (&ksocknal_data.ksnd_autoconnectd_routes));
+                spin_unlock_irqrestore(&ksocknal_data.ksnd_autoconnectd_lock,
+                                       flags);
 
-                spin_lock_irqsave (&ksocknal_data.ksnd_autoconnectd_lock, flags);
+                rc = wait_event_interruptible(ksocknal_data.ksnd_autoconnectd_waitq,
+                                              ksocknal_data.ksnd_shuttingdown ||
+                                              !list_empty(&ksocknal_data.ksnd_autoconnectd_routes));
+
+                spin_lock_irqsave(&ksocknal_data.ksnd_autoconnectd_lock, flags);
         }
 
         spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags);
@@ -2528,32 +2621,39 @@ ksocknal_find_timed_out_conn (ksock_peer_t *peer)
         /* We're called with a shared lock on ksnd_global_lock */
         ksock_conn_t      *conn;
         struct list_head  *ctmp;
-        ksock_sched_t     *sched;
 
         list_for_each (ctmp, &peer->ksnp_conns) {
                 conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
-                sched = conn->ksnc_scheduler;
 
                 /* Don't need the {get,put}connsock dance to deref ksnc_sock... */
                 LASSERT (!conn->ksnc_closing);
-                
+
+                if (conn->ksnc_sock->sk->sk_err != 0) {
+                        /* Something (e.g. failed keepalive) set the socket error */
+                        atomic_inc (&conn->ksnc_refcount);
+                        CERROR ("Socket error %d: "LPX64" %p %d.%d.%d.%d\n",
+                                conn->ksnc_sock->sk->sk_err, peer->ksnp_nid,
+                                conn, HIPQUAD(conn->ksnc_ipaddr));
+                        return (conn);
+                }
+
                 if (conn->ksnc_rx_started &&
                     time_after_eq (jiffies, conn->ksnc_rx_deadline)) {
                         /* Timed out incomplete incoming message */
                         atomic_inc (&conn->ksnc_refcount);
                         CERROR ("Timed out RX from "LPX64" %p %d.%d.%d.%d\n",
-                                peer->ksnp_nid, conn, HIPQUAD(conn->ksnc_ipaddr));
+                                peer->ksnp_nid,conn,HIPQUAD(conn->ksnc_ipaddr));
                         return (conn);
                 }
-                
+
                 if ((!list_empty (&conn->ksnc_tx_queue) ||
                      conn->ksnc_sock->sk->sk_wmem_queued != 0) &&
                     time_after_eq (jiffies, conn->ksnc_tx_deadline)) {
-                        /* Timed out messages queued for sending, or
-                         * messages buffered in the socket's send buffer */
+                        /* Timed out messages queued for sending or
+                         * buffered in the socket's send buffer */
                         atomic_inc (&conn->ksnc_refcount);
-                        CERROR ("Timed out TX to "LPX64" %s%d %p %d.%d.%d.%d\n", 
-                                peer->ksnp_nid, 
+                        CERROR ("Timed out TX to "LPX64" %s%d %p %d.%d.%d.%d\n",
+                                peer->ksnp_nid,
                                 list_empty (&conn->ksnc_tx_queue) ? "" : "Q ",
                                 conn->ksnc_sock->sk->sk_wmem_queued, conn,
                                 HIPQUAD(conn->ksnc_ipaddr));
@@ -2693,9 +2793,9 @@ ksocknal_reaper (void *arg)
                          * timeout on any connection within (n+1)/n times the
                          * timeout interval. */
 
-                        if (ksocknal_data.ksnd_io_timeout > n * p)
+                        if (ksocknal_tunables.ksnd_io_timeout > n * p)
                                 chunk = (chunk * n * p) / 
-                                        ksocknal_data.ksnd_io_timeout;
+                                        ksocknal_tunables.ksnd_io_timeout;
                         if (chunk == 0)
                                 chunk = 1;
 
@@ -2716,8 +2816,8 @@ ksocknal_reaper (void *arg)
                 }
                 ksocknal_data.ksnd_reaper_waketime = jiffies + timeout;
 
-                add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait);
                 set_current_state (TASK_INTERRUPTIBLE);
+                add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait);
 
                 if (!ksocknal_data.ksnd_shuttingdown &&
                     list_empty (&ksocknal_data.ksnd_deathrow_conns) &&
@@ -2736,18 +2836,11 @@ ksocknal_reaper (void *arg)
         return (0);
 }
 
-nal_cb_t ksocknal_lib = {
-        nal_data:       &ksocknal_data,                /* NAL private data */
-        cb_send:         ksocknal_send,
-        cb_send_pages:   ksocknal_send_pages,
-        cb_recv:         ksocknal_recv,
-        cb_recv_pages:   ksocknal_recv_pages,
-        cb_read:         ksocknal_read,
-        cb_write:        ksocknal_write,
-        cb_malloc:       ksocknal_malloc,
-        cb_free:         ksocknal_free,
-        cb_printf:       ksocknal_printf,
-        cb_cli:          ksocknal_cli,
-        cb_sti:          ksocknal_sti,
-        cb_dist:         ksocknal_dist
+lib_nal_t ksocknal_lib = {
+        libnal_data:       &ksocknal_data,      /* NAL private data */
+        libnal_send:        ksocknal_send,
+        libnal_send_pages:  ksocknal_send_pages,
+        libnal_recv:        ksocknal_recv,
+        libnal_recv_pages:  ksocknal_recv_pages,
+        libnal_dist:        ksocknal_dist
 };
index df12db6..c6f0aa4 100644 (file)
@@ -4,7 +4,6 @@ link-stamp
 .*.cmd
 autoMakefile.in
 autoMakefile
-sources
 *.ko
 *.mod.c
 .*.flags
index 598adc1..0967123 100644 (file)
@@ -1,9 +1,4 @@
-MODULES = portals
+MODULES = libcfs
 libcfs-objs := debug.o lwt.o module.o proc.o tracefile.o
 
-api-sources := $(wildcard @LUSTRE@/portals/portals/api-*.c)
-lib-sources := $(wildcard @LUSTRE@/portals/portals/lib-*.c)
-
-portals-objs += $(libcfs-objs) $(patsubst %.c,%.o,$(notdir $(api-sources) $(lib-sources)))
-
 @INCLUDE_RULES@
index cacd769..9c27693 100644 (file)
@@ -4,17 +4,8 @@
 # See the file COPYING in this distribution
 
 if MODULES
-modulenet_DATA := portals$(KMODEXT)
+modulenet_DATA := libcfs$(KMODEXT)
 endif
 
-sources:
-       rm -f sources
-       @for i in $(api-sources) $(lib-sources) ; do \
-               echo ln -sf $$i . ; \
-               ln -sf $$i . || exit 1 ; \
-       done
-       touch sources
-
 MOSTLYCLEANFILES = *.o *.ko *.mod.c
-CLEANFILES = sources lib-*.c api-*.c
-DIST_SOURCES = $(libcfs-objs:%.o=%.c) *.h
+DIST_SOURCES = $(libcfs-objs:%.o=%.c) tracefile.h
index 3e5531a..f571958 100644 (file)
@@ -191,7 +191,8 @@ int portals_debug_mark_buffer(char *text)
 
 void portals_debug_set_level(unsigned int debug_level)
 {
-        printk("Lustre: Setting portals debug level to %08x\n", debug_level);
+        printk(KERN_WARNING "Lustre: Setting portals debug level to %08x\n",
+               debug_level);
         portal_debug = debug_level;
 }
 
@@ -250,31 +251,47 @@ void portals_run_lbug_upcall(char *file, const char *fn, const int line)
 
 char *portals_nid2str(int nal, ptl_nid_t nid, char *str)
 {
+        if (nid == PTL_NID_ANY) {
+                snprintf(str, PTL_NALFMT_SIZE, "%s", "PTL_NID_ANY");
+                return str;
+        }
+
         switch(nal){
 /* XXX this could be a nal method of some sort, 'cept it's config
  * dependent whether (say) socknal NIDs are actually IP addresses... */
-#ifndef CRAY_PORTALS 
+#if !CRAY_PORTALS 
         case TCPNAL:
                 /* userspace NAL */
+        case IIBNAL:
+        case OPENIBNAL:
         case SOCKNAL:
-                snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u.%u.%u.%u",
+                snprintf(str, PTL_NALFMT_SIZE, "%u:%u.%u.%u.%u",
                          (__u32)(nid >> 32), HIPQUAD(nid));
                 break;
         case QSWNAL:
         case GMNAL:
-        case IBNAL:
-        case SCIMACNAL:
-                snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u",
+                snprintf(str, PTL_NALFMT_SIZE, "%u:%u",
                          (__u32)(nid >> 32), (__u32)nid);
                 break;
 #endif
         default:
-                snprintf(str, PTL_NALFMT_SIZE - 1, "?%d? %llx",
+                snprintf(str, PTL_NALFMT_SIZE, "?%x? %llx",
                          nal, (long long)nid);
+                break;
         }
         return str;
 }
 
+char *portals_id2str(int nal, ptl_process_id_t id, char *str)
+{
+        int   len;
+        
+        portals_nid2str(nal, id.nid, str);
+        len = strlen(str);
+        snprintf(str + len, PTL_NALFMT_SIZE, "-%u", id.pid);
+        return str;
+}
+
 #ifdef __KERNEL__
 char stack_backtrace[LUSTRE_TRACE_SIZE];
 spinlock_t stack_backtrace_lock = SPIN_LOCK_UNLOCKED;
@@ -350,7 +367,9 @@ out:
 
 char *portals_debug_dumpstack(void)
 {
-        return "dump_stack\n";
+        char *buf = stack_backtrace;
+        buf[0] = '\0';
+        return buf;
 }
 
 #endif /* __arch_um__ */
@@ -370,3 +389,4 @@ EXPORT_SYMBOL(portals_debug_set_level);
 EXPORT_SYMBOL(portals_run_upcall);
 EXPORT_SYMBOL(portals_run_lbug_upcall);
 EXPORT_SYMBOL(portals_nid2str);
+EXPORT_SYMBOL(portals_id2str);
index a24423e..3f6a9c2 100644 (file)
 
 #if LWT_SUPPORT
 
+#if !KLWT_SUPPORT
 int         lwt_enabled;
+lwt_cpu_t   lwt_cpus[NR_CPUS];
+#endif
+
 int         lwt_pages_per_cpu;
-lwt_cpu_t   lwt_cpus[LWT_MAX_CPUS];
 
 /* NB only root is allowed to retrieve LWT info; it's an open door into the
  * kernel... */
@@ -97,23 +100,35 @@ lwt_control (int enable, int clear)
         if (!capable(CAP_SYS_ADMIN))
                 return (-EPERM);
 
-        if (clear)
-                for (i = 0; i < num_online_cpus(); i++) {
-                        p = lwt_cpus[i].lwtc_current_page;
+        if (!enable) {
+                LWT_EVENT(0,0,0,0);
+                lwt_enabled = 0;
+                mb();
+                /* give people some time to stop adding traces */
+                schedule_timeout(10);
+        }
 
-                        for (j = 0; j < lwt_pages_per_cpu; j++) {
-                                memset (p->lwtp_events, 0, PAGE_SIZE);
+        for (i = 0; i < num_online_cpus(); i++) {
+                p = lwt_cpus[i].lwtc_current_page;
 
-                                p = list_entry (p->lwtp_list.next,
-                                                lwt_page_t, lwtp_list);
-                        }
+                if (p == NULL)
+                        return (-ENODATA);
+
+                if (!clear)
+                        continue;
+
+                for (j = 0; j < lwt_pages_per_cpu; j++) {
+                        memset (p->lwtp_events, 0, PAGE_SIZE);
+
+                        p = list_entry (p->lwtp_list.next,
+                                        lwt_page_t, lwtp_list);
+                }
         }
 
-        lwt_enabled = enable;
-        mb();
-        if (!enable) {
-                /* give people some time to stop adding traces */
-                schedule_timeout(10);
+        if (enable) {
+                lwt_enabled = 1;
+                mb();
+                LWT_EVENT(0,0,0,0);
         }
 
         return (0);
@@ -141,6 +156,9 @@ lwt_snapshot (cycles_t *now, int *ncpu, int *total_size,
 
         for (i = 0; i < num_online_cpus(); i++) {
                 p = lwt_cpus[i].lwtc_current_page;
+
+                if (p == NULL)
+                        return (-ENODATA);
                 
                 for (j = 0; j < lwt_pages_per_cpu; j++) {
                         if (copy_to_user(user_ptr, p->lwtp_events,
@@ -162,11 +180,12 @@ lwt_init ()
 {
        int     i;
         int     j;
+
+        for (i = 0; i < num_online_cpus(); i++)
+                if (lwt_cpus[i].lwtc_current_page != NULL)
+                        return (-EALREADY);
         
-        if (num_online_cpus() > LWT_MAX_CPUS) {
-                CERROR ("Too many CPUs\n");
-                return (-EINVAL);
-        }
+        LASSERT (!lwt_enabled);
 
        /* NULL pointers, zero scalars */
        memset (lwt_cpus, 0, sizeof (lwt_cpus));
@@ -207,6 +226,8 @@ lwt_init ()
         lwt_enabled = 1;
         mb();
 
+        LWT_EVENT(0,0,0,0);
+
         return (0);
 }
 
@@ -214,10 +235,9 @@ void
 lwt_fini () 
 {
         int    i;
-        
-        if (num_online_cpus() > LWT_MAX_CPUS)
-                return;
 
+        lwt_control(0, 0);
+        
         for (i = 0; i < num_online_cpus(); i++)
                 while (lwt_cpus[i].lwtc_current_page != NULL) {
                         lwt_page_t *lwtp = lwt_cpus[i].lwtc_current_page;
index f1d086b..a2422e3 100644 (file)
 #include <portals/lib-p30.h>
 #include <portals/p30.h>
 #include <linux/kp30.h>
-#include <linux/kpr.h>
 #include <linux/portals_compat25.h>
 
 #define PORTAL_MINOR 240
 
-extern void (kping_client)(struct portal_ioctl_data *);
-
 struct nal_cmd_handler {
-        nal_cmd_handler_t nch_handler;
-        void * nch_private;
+        int                  nch_number;
+        nal_cmd_handler_fn  *nch_handler;
+        void                *nch_private;
 };
 
-static struct nal_cmd_handler nal_cmd[NAL_MAX_NR + 1];
+static struct nal_cmd_handler nal_cmd[16];
 static DECLARE_MUTEX(nal_cmd_sem);
 
 #ifdef PORTAL_DEBUG
@@ -204,7 +202,7 @@ kportal_blockallsigs ()
 }
 
 /* called when opening /dev/device */
-static int kportal_psdev_open(struct inode * inode, struct file * file)
+static int libcfs_psdev_open(struct inode * inode, struct file * file)
 {
         struct portals_device_userstate *pdu;
         ENTRY;
@@ -225,7 +223,7 @@ static int kportal_psdev_open(struct inode * inode, struct file * file)
 }
 
 /* called when closing /dev/device */
-static int kportal_psdev_release(struct inode * inode, struct file * file)
+static int libcfs_psdev_release(struct inode * inode, struct file * file)
 {
         struct portals_device_userstate *pdu;
         ENTRY;
@@ -248,265 +246,139 @@ static inline void freedata(void *data, int len)
         PORTAL_FREE(data, len);
 }
 
-static int
-kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, 
-                  ptl_nid_t lo_nid, ptl_nid_t hi_nid)
+struct nal_cmd_handler *
+libcfs_find_nal_cmd_handler(int nal)
 {
-        int rc;
-        kpr_control_interface_t *ci;
-
-        ci = (kpr_control_interface_t *) PORTAL_SYMBOL_GET (kpr_control_interface);
-        if (ci == NULL)
-                return (-ENODEV);
+        int    i;
 
-        rc = ci->kprci_add_route (gateway_nalid, gateway_nid, lo_nid, hi_nid);
+        for (i = 0; i < sizeof(nal_cmd)/sizeof(nal_cmd[0]); i++)
+                if (nal_cmd[i].nch_handler != NULL &&
+                    nal_cmd[i].nch_number == nal)
+                        return (&nal_cmd[i]);
 
-        PORTAL_SYMBOL_PUT(kpr_control_interface);
-        return (rc);
+        return (NULL);
 }
 
-static int
-kportal_del_route(int gw_nalid, ptl_nid_t gw_nid, 
-                  ptl_nid_t lo, ptl_nid_t hi)
-{
-        int rc;
-        kpr_control_interface_t *ci;
-
-        ci = (kpr_control_interface_t *)PORTAL_SYMBOL_GET(kpr_control_interface);
-        if (ci == NULL)
-                return (-ENODEV);
-
-        rc = ci->kprci_del_route (gw_nalid, gw_nid, lo, hi);
-
-        PORTAL_SYMBOL_PUT(kpr_control_interface);
-        return (rc);
-}
-
-static int
-kportal_notify_router (int gw_nalid, ptl_nid_t gw_nid,
-                       int alive, time_t when)
+int
+libcfs_nal_cmd_register(int nal, nal_cmd_handler_fn *handler, void *private)
 {
-        int rc;
-        kpr_control_interface_t *ci;
-
-        /* No error if router not preset.  Sysadmin is allowed to notify
-         * _everywhere_ when a NID boots or crashes, even if they know
-         * nothing of the peer. */
-        ci = (kpr_control_interface_t *)PORTAL_SYMBOL_GET(kpr_control_interface);
-        if (ci == NULL)
-                return (0);
+        struct nal_cmd_handler *cmd;
+        int                     i;
+        int                     rc;
 
-        rc = ci->kprci_notify (gw_nalid, gw_nid, alive, when);
-
-        PORTAL_SYMBOL_PUT(kpr_control_interface);
-        return (rc);
-}
-
-static int
-kportal_get_route(int index, __u32 *gateway_nalidp, ptl_nid_t *gateway_nidp,
-                  ptl_nid_t *lo_nidp, ptl_nid_t *hi_nidp, int *alivep)
-{
-        int       gateway_nalid;
-        ptl_nid_t gateway_nid;
-        ptl_nid_t lo_nid;
-        ptl_nid_t hi_nid;
-        int       alive;
-        int       rc;
-        kpr_control_interface_t *ci;
+        CDEBUG(D_IOCTL, "Register NAL %d, handler: %p\n", nal, handler);
 
-        ci = (kpr_control_interface_t *) PORTAL_SYMBOL_GET(kpr_control_interface);
-        if (ci == NULL)
-                return (-ENODEV);
+        down(&nal_cmd_sem);
 
-        rc = ci->kprci_get_route(index, &gateway_nalid, &gateway_nid,
-                                 &lo_nid, &hi_nid, &alive);
+        if (libcfs_find_nal_cmd_handler(nal) != NULL) {
+                up (&nal_cmd_sem);
+                return (-EBUSY);
+        }
 
-        if (rc == 0) {
-                CDEBUG(D_IOCTL, "got route [%d] %d "LPX64":"LPX64" - "LPX64", %s\n",
-                       index, gateway_nalid, gateway_nid, lo_nid, hi_nid,
-                       alive ? "up" : "down");
-
-                *gateway_nalidp = (__u32)gateway_nalid;
-                *gateway_nidp   = gateway_nid;
-                *lo_nidp        = lo_nid;
-                *hi_nidp        = hi_nid;
-                *alivep         = alive;
+        cmd = NULL;
+        for (i = 0; i < sizeof(nal_cmd)/sizeof(nal_cmd[0]); i++)
+                if (nal_cmd[i].nch_handler == NULL) {
+                        cmd = &nal_cmd[i];
+                        break;
+                }
+        
+        if (cmd == NULL) {
+                rc = -EBUSY;
+        } else {
+                rc = 0;
+                cmd->nch_number = nal;
+                cmd->nch_handler = handler;
+                cmd->nch_private = private;
         }
 
-        PORTAL_SYMBOL_PUT (kpr_control_interface);
-        return (rc);
+        up(&nal_cmd_sem);
+
+        return rc;
 }
+EXPORT_SYMBOL(libcfs_nal_cmd_register);
 
-static int 
-kportal_router_cmd(struct portals_cfg *pcfg, void * private)
+void
+libcfs_nal_cmd_unregister(int nal)
 {
-        int err = -EINVAL;
-        ENTRY;
-
-        switch(pcfg->pcfg_command) {
-        default:
-                CDEBUG(D_IOCTL, "Inappropriate cmd: %d\n", pcfg->pcfg_command);
-                break;
-                
-        case NAL_CMD_ADD_ROUTE:
-                CDEBUG(D_IOCTL, "Adding route: [%d] "LPU64" : "LPU64" - "LPU64"\n",
-                       pcfg->pcfg_nal, pcfg->pcfg_nid, 
-                       pcfg->pcfg_nid2, pcfg->pcfg_nid3);
-                err = kportal_add_route(pcfg->pcfg_gw_nal, pcfg->pcfg_nid,
-                                        pcfg->pcfg_nid2, pcfg->pcfg_nid3);
-                break;
+        struct nal_cmd_handler *cmd;
 
-        case NAL_CMD_DEL_ROUTE:
-                CDEBUG (D_IOCTL, "Removing routes via [%d] "LPU64" : "LPU64" - "LPU64"\n",
-                        pcfg->pcfg_gw_nal, pcfg->pcfg_nid, 
-                        pcfg->pcfg_nid2, pcfg->pcfg_nid3);
-                err = kportal_del_route (pcfg->pcfg_gw_nal, pcfg->pcfg_nid,
-                                         pcfg->pcfg_nid2, pcfg->pcfg_nid3);
-                break;
+        CDEBUG(D_IOCTL, "Unregister NAL %d\n", nal);
 
-        case NAL_CMD_NOTIFY_ROUTER: {
-                CDEBUG (D_IOCTL, "Notifying peer [%d] "LPU64" %s @ %ld\n",
-                        pcfg->pcfg_gw_nal, pcfg->pcfg_nid,
-                        pcfg->pcfg_flags ? "Enabling" : "Disabling",
-                        (time_t)pcfg->pcfg_nid3);
-                
-                err = kportal_notify_router (pcfg->pcfg_gw_nal, pcfg->pcfg_nid,
-                                             pcfg->pcfg_flags, 
-                                             (time_t)pcfg->pcfg_nid3);
-                break;
-        }
-                
-        case NAL_CMD_GET_ROUTE:
-                CDEBUG (D_IOCTL, "Getting route [%d]\n", pcfg->pcfg_count);
-                err = kportal_get_route(pcfg->pcfg_count, &pcfg->pcfg_gw_nal,
-                                        &pcfg->pcfg_nid, 
-                                        &pcfg->pcfg_nid2, &pcfg->pcfg_nid3,
-                                        &pcfg->pcfg_flags);
-                break;
-        }
-        RETURN(err);
+        down(&nal_cmd_sem);
+        cmd = libcfs_find_nal_cmd_handler(nal);
+        LASSERT (cmd != NULL);
+        cmd->nch_handler = NULL;
+        cmd->nch_private = NULL;
+        up(&nal_cmd_sem);
 }
+EXPORT_SYMBOL(libcfs_nal_cmd_unregister);
 
 int
-kportal_nal_cmd(struct portals_cfg *pcfg)
+libcfs_nal_cmd(struct portals_cfg *pcfg)
 {
+        struct nal_cmd_handler *cmd;
         __u32 nal = pcfg->pcfg_nal;
-        int rc = -EINVAL;
-
+        int   rc = -EINVAL;
         ENTRY;
 
         down(&nal_cmd_sem);
-        if (nal > 0 && nal <= NAL_MAX_NR && nal_cmd[nal].nch_handler) {
+        cmd = libcfs_find_nal_cmd_handler(nal);
+        if (cmd != NULL) {
                 CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal, 
                        pcfg->pcfg_command);
-                rc = nal_cmd[nal].nch_handler(pcfg, nal_cmd[nal].nch_private);
+                rc = cmd->nch_handler(pcfg, cmd->nch_private);
         } else {
                 CERROR("invalid nal: %d, cmd: %d\n", nal, pcfg->pcfg_command);
         }
         up(&nal_cmd_sem);
-        RETURN(rc);
-}
-
-ptl_handle_ni_t *
-kportal_get_ni (int nal)
-{
 
-        switch (nal)
-        {
-        case QSWNAL:
-                return (PORTAL_SYMBOL_GET(kqswnal_ni));
-        case SOCKNAL:
-                return (PORTAL_SYMBOL_GET(ksocknal_ni));
-        case GMNAL:
-                return  (PORTAL_SYMBOL_GET(kgmnal_ni));
-        case IBNAL:
-                return  (PORTAL_SYMBOL_GET(kibnal_ni));
-        case TCPNAL:
-                /* userspace NAL */
-                return (NULL);
-        case SCIMACNAL:
-                return  (PORTAL_SYMBOL_GET(kscimacnal_ni));
-        default:
-                /* A warning to a naive caller */
-                CERROR ("unknown nal: %d\n", nal);
-                return (NULL);
-        }
+        RETURN(rc);
 }
+EXPORT_SYMBOL(libcfs_nal_cmd);
 
-void
-kportal_put_ni (int nal)
-{
+static DECLARE_RWSEM(ioctl_list_sem);
+static LIST_HEAD(ioctl_list);
 
-        switch (nal)
-        {
-        case QSWNAL:
-                PORTAL_SYMBOL_PUT(kqswnal_ni);
-                break;
-        case SOCKNAL:
-                PORTAL_SYMBOL_PUT(ksocknal_ni);
-                break;
-        case GMNAL:
-                PORTAL_SYMBOL_PUT(kgmnal_ni);
-                break;
-        case IBNAL:
-                PORTAL_SYMBOL_PUT(kibnal_ni);
-                break;
-        case TCPNAL:
-                /* A lesson to a malicious caller */
-                LBUG ();
-        case SCIMACNAL:
-                PORTAL_SYMBOL_PUT(kscimacnal_ni);
-                break;
-        default:
-                CERROR ("unknown nal: %d\n", nal);
-        }
-}
-
-int
-kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private)
+int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand)
 {
         int rc = 0;
+        down_read(&ioctl_list_sem);
+        if (!list_empty(&hand->item))
+                rc = -EBUSY;
+        up_read(&ioctl_list_sem);
 
-        CDEBUG(D_IOCTL, "Register NAL %d, handler: %p\n", nal, handler);
-
-        if (nal > 0  && nal <= NAL_MAX_NR) {
-                down(&nal_cmd_sem);
-                if (nal_cmd[nal].nch_handler != NULL)
-                        rc = -EBUSY;
-                else {
-                        nal_cmd[nal].nch_handler = handler;
-                        nal_cmd[nal].nch_private = private;
-                }
-                up(&nal_cmd_sem);
+        if (rc == 0) {
+                down_write(&ioctl_list_sem);
+                list_add_tail(&hand->item, &ioctl_list);
+                up_write(&ioctl_list_sem);
         }
-        return rc;
+        RETURN(0);
 }
+EXPORT_SYMBOL(libcfs_register_ioctl);
 
-int
-kportal_nal_unregister(int nal)
+int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand)
 {
         int rc = 0;
+        down_read(&ioctl_list_sem);
+        if (list_empty(&hand->item))
+                rc = -ENOENT;
+        up_read(&ioctl_list_sem);
 
-        CDEBUG(D_IOCTL, "Unregister NAL %d\n", nal);
-
-        if (nal > 0  && nal <= NAL_MAX_NR) {
-                down(&nal_cmd_sem);
-                nal_cmd[nal].nch_handler = NULL;
-                nal_cmd[nal].nch_private = NULL;
-                up(&nal_cmd_sem);
+        if (rc == 0) {
+                down_write(&ioctl_list_sem);
+                list_del_init(&hand->item);
+                up_write(&ioctl_list_sem);
         }
-        return rc;
+        RETURN(0);
 }
+EXPORT_SYMBOL(libcfs_deregister_ioctl);
 
-
-static int kportal_ioctl(struct inode *inode, struct file *file,
-                         unsigned int cmd, unsigned long arg)
+static int libcfs_ioctl(struct inode *inode, struct file *file,
+                        unsigned int cmd, unsigned long arg)
 {
-        int err = 0;
+        int err = -EINVAL;
         char buf[1024];
         struct portal_ioctl_data *data;
-        char str[PTL_NALFMT_SIZE];
-
         ENTRY;
 
         if (current->fsuid != 0)
@@ -542,101 +414,67 @@ static int kportal_ioctl(struct inode *inode, struct file *file,
                         RETURN(-EINVAL);
                 portals_debug_mark_buffer(data->ioc_inlbuf1);
                 RETURN(0);
-        case IOC_PORTAL_PING: {
-                void (*ping)(struct portal_ioctl_data *);
-
-                CDEBUG(D_IOCTL, "doing %d pings to nid "LPX64" (%s)\n",
-                       data->ioc_count, data->ioc_nid,
-                       portals_nid2str(data->ioc_nal, data->ioc_nid, str));
-                ping = PORTAL_SYMBOL_GET(kping_client);
-                if (!ping)
-                        CERROR("PORTAL_SYMBOL_GET failed\n");
-                else {
-                        ping(data);
-                        PORTAL_SYMBOL_PUT(kping_client);
-                }
-                RETURN(0);
-        }
-
-        case IOC_PORTAL_GET_NID: {
-                const ptl_handle_ni_t *nip;
-                ptl_process_id_t       pid;
+#if LWT_SUPPORT
+        case IOC_PORTAL_LWT_CONTROL:
+                err = lwt_control (data->ioc_flags, data->ioc_misc);
+                break;
 
-                CDEBUG (D_IOCTL, "Getting nid for nal [%d]\n", data->ioc_nal);
+        case IOC_PORTAL_LWT_SNAPSHOT: {
+                cycles_t   now;
+                int        ncpu;
+                int        total_size;
 
-                nip = kportal_get_ni (data->ioc_nal);
-                if (nip == NULL)
-                        RETURN (-EINVAL);
+                err = lwt_snapshot (&now, &ncpu, &total_size,
+                                    data->ioc_pbuf1, data->ioc_plen1);
+                data->ioc_nid = now;
+                data->ioc_count = ncpu;
+                data->ioc_misc = total_size;
 
-                err = PtlGetId (*nip, &pid);
-                LASSERT (err == PTL_OK);
-                kportal_put_ni (data->ioc_nal);
+                /* Hedge against broken user/kernel typedefs (e.g. cycles_t) */
+                data->ioc_nid2 = sizeof(lwt_event_t);
+                data->ioc_nid3 = offsetof(lwt_event_t, lwte_where);
 
-                data->ioc_nid = pid.nid;
-                if (copy_to_user ((char *)arg, data, sizeof (*data)))
+                if (err == 0 &&
+                    copy_to_user((char *)arg, data, sizeof (*data)))
                         err = -EFAULT;
                 break;
         }
 
+        case IOC_PORTAL_LWT_LOOKUP_STRING:
+                err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1,
+                                         data->ioc_pbuf2, data->ioc_plen2);
+                if (err == 0 &&
+                    copy_to_user((char *)arg, data, sizeof (*data)))
+                        err = -EFAULT;
+                break;
+#endif
         case IOC_PORTAL_NAL_CMD: {
                 struct portals_cfg pcfg;
 
-                LASSERT (data->ioc_plen1 == sizeof(pcfg));
-                err = copy_from_user(&pcfg, (void *)data->ioc_pbuf1, 
-                                     sizeof(pcfg));
-                if ( err ) {
-                        EXIT;
-                        return err;
+                if (data->ioc_plen1 != sizeof(pcfg)) {
+                        CERROR("Bad ioc_plen1 %d (wanted %d)\n",
+                               data->ioc_plen1, sizeof(pcfg));
+                        err = -EINVAL;
+                        break;
                 }
 
-                CDEBUG (D_IOCTL, "nal command nal %d cmd %d\n", pcfg.pcfg_nal,
-                        pcfg.pcfg_command);
-                err = kportal_nal_cmd(&pcfg);
-                if (err == 0) {
-                        if (copy_to_user((char *)data->ioc_pbuf1, &pcfg, 
-                                         sizeof (pcfg)))
-                                err = -EFAULT;
-                        if (copy_to_user((char *)arg, data, sizeof (*data)))
-                                err = -EFAULT;
+                if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1,
+                                   sizeof(pcfg))) {
+                        err = -EFAULT;
+                        break;
                 }
-                break;
-        }
-        case IOC_PORTAL_FAIL_NID: {
-                const ptl_handle_ni_t *nip;
 
-                CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n",
-                        data->ioc_nal, data->ioc_nid, data->ioc_count);
-
-                nip = kportal_get_ni (data->ioc_nal);
-                if (nip == NULL)
-                        return (-EINVAL);
+                CDEBUG (D_IOCTL, "nal command nal %d cmd %d\n", pcfg.pcfg_nal,
+                        pcfg.pcfg_command);
+                err = libcfs_nal_cmd(&pcfg);
 
-                err = PtlFailNid (*nip, data->ioc_nid, data->ioc_count);
-                kportal_put_ni (data->ioc_nal);
-                break;
-        }
-#if LWT_SUPPORT
-        case IOC_PORTAL_LWT_CONTROL: 
-                err = lwt_control (data->ioc_flags, data->ioc_misc);
-                break;
-                
-        case IOC_PORTAL_LWT_SNAPSHOT:
-                err = lwt_snapshot (&data->ioc_nid,
-                                    &data->ioc_count, &data->ioc_misc,
-                                    data->ioc_pbuf1, data->ioc_plen1);
-                if (err == 0 &&
-                    copy_to_user((char *)arg, data, sizeof (*data)))
-                        err = -EFAULT;
-                break;
-                
-        case IOC_PORTAL_LWT_LOOKUP_STRING:
-                err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1,
-                                         data->ioc_pbuf2, data->ioc_plen2);
                 if (err == 0 &&
-                    copy_to_user((char *)arg, data, sizeof (*data)))
+                    copy_to_user((char *)data->ioc_pbuf1, &pcfg,
+                                 sizeof (pcfg)))
                         err = -EFAULT;
                 break;
-#endif
+        }
+
         case IOC_PORTAL_MEMHOG:
                 if (!capable (CAP_SYS_ADMIN))
                         err = -EPERM;
@@ -652,26 +490,34 @@ static int kportal_ioctl(struct inode *inode, struct file *file,
                 }
                 break;
 
-        default:
+        default: {
+                struct libcfs_ioctl_handler *hand;
                 err = -EINVAL;
-                break;
+                down_read(&ioctl_list_sem);
+                list_for_each_entry(hand, &ioctl_list, item) {
+                        err = hand->handle_ioctl(data, cmd, arg);
+                        if (err != -EINVAL)
+                                break;
+                }
+                up_read(&ioctl_list_sem);
+                } break;
         }
 
         RETURN(err);
 }
 
 
-static struct file_operations portalsdev_fops = {
-        ioctl:   kportal_ioctl,
-        open:    kportal_psdev_open,
-        release: kportal_psdev_release
+static struct file_operations libcfs_fops = {
+        ioctl:   libcfs_ioctl,
+        open:    libcfs_psdev_open,
+        release: libcfs_psdev_release
 };
 
 
-static struct miscdevice portal_dev = {
+static struct miscdevice libcfs_dev = {
         PORTAL_MINOR,
         "portals",
-        &portalsdev_fops
+        &libcfs_fops
 };
 
 extern int insert_proc(void);
@@ -680,7 +526,7 @@ MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
 MODULE_DESCRIPTION("Portals v3.1");
 MODULE_LICENSE("GPL");
 
-static int init_kportals_module(void)
+static int init_libcfs_module(void)
 {
         int rc;
 
@@ -697,41 +543,23 @@ static int init_kportals_module(void)
                 goto cleanup_debug;
         }
 #endif
-        sema_init(&nal_cmd_sem, 1);
-
-        rc = misc_register(&portal_dev);
+        rc = misc_register(&libcfs_dev);
         if (rc) {
                 CERROR("misc_register: error %d\n", rc);
                 goto cleanup_lwt;
         }
 
-        rc = PtlInit();
-        if (rc) {
-                CERROR("PtlInit: error %d\n", rc);
-                goto cleanup_deregister;
-        }
-
         rc = insert_proc();
         if (rc) {
                 CERROR("insert_proc: error %d\n", rc);
-                goto cleanup_fini;
-        }
-
-        rc = kportal_nal_register(ROUTER, kportal_router_cmd, NULL);
-        if (rc) {
-                CERROR("kportal_nal_registre: ROUTER error %d\n", rc);
-                goto cleanup_proc;
+                goto cleanup_deregister;
         }
 
         CDEBUG (D_OTHER, "portals setup OK\n");
         return (0);
 
- cleanup_proc:
-        remove_proc();
- cleanup_fini:
-        PtlFini();
  cleanup_deregister:
-        misc_deregister(&portal_dev);
+        misc_deregister(&libcfs_dev);
  cleanup_lwt:
 #if LWT_SUPPORT
         lwt_fini();
@@ -741,18 +569,16 @@ static int init_kportals_module(void)
         return rc;
 }
 
-static void exit_kportals_module(void)
+static void exit_libcfs_module(void)
 {
         int rc;
 
-        kportal_nal_unregister(ROUTER);
         remove_proc();
-        PtlFini();
 
         CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n",
                atomic_read(&portal_kmemory));
 
-        rc = misc_deregister(&portal_dev);
+        rc = misc_deregister(&libcfs_dev);
         if (rc)
                 CERROR("misc_deregister error %d\n", rc);
 
@@ -769,48 +595,9 @@ static void exit_kportals_module(void)
                 printk(KERN_ERR "LustreError: portals_debug_cleanup: %d\n", rc);
 }
 
-EXPORT_SYMBOL(lib_dispatch);
-EXPORT_SYMBOL(PtlMEAttach);
-EXPORT_SYMBOL(PtlMEInsert);
-EXPORT_SYMBOL(PtlMEUnlink);
-EXPORT_SYMBOL(PtlEQAlloc);
-EXPORT_SYMBOL(PtlMDAttach);
-EXPORT_SYMBOL(PtlMDUnlink);
-EXPORT_SYMBOL(PtlNIInit);
-EXPORT_SYMBOL(PtlNIFini);
-EXPORT_SYMBOL(PtlNIDebug);
-EXPORT_SYMBOL(PtlInit);
-EXPORT_SYMBOL(PtlFini);
-EXPORT_SYMBOL(PtlPut);
-EXPORT_SYMBOL(PtlGet);
-EXPORT_SYMBOL(ptl_err_str);
-EXPORT_SYMBOL(PtlEQWait);
-EXPORT_SYMBOL(PtlEQFree);
-EXPORT_SYMBOL(PtlEQGet);
-EXPORT_SYMBOL(PtlGetId);
-EXPORT_SYMBOL(PtlMDBind);
-EXPORT_SYMBOL(lib_iov_nob);
-EXPORT_SYMBOL(lib_copy_iov2buf);
-EXPORT_SYMBOL(lib_copy_buf2iov);
-EXPORT_SYMBOL(lib_extract_iov);
-EXPORT_SYMBOL(lib_kiov_nob);
-EXPORT_SYMBOL(lib_copy_kiov2buf);
-EXPORT_SYMBOL(lib_copy_buf2kiov);
-EXPORT_SYMBOL(lib_extract_kiov);
-EXPORT_SYMBOL(lib_finalize);
-EXPORT_SYMBOL(lib_parse);
-EXPORT_SYMBOL(lib_fake_reply_msg);
-EXPORT_SYMBOL(lib_init);
-EXPORT_SYMBOL(lib_fini);
-EXPORT_SYMBOL(dispatch_name);
 EXPORT_SYMBOL(kportal_daemonize);
 EXPORT_SYMBOL(kportal_blockallsigs);
-EXPORT_SYMBOL(kportal_nal_register);
-EXPORT_SYMBOL(kportal_nal_unregister);
 EXPORT_SYMBOL(kportal_assertion_failed);
-EXPORT_SYMBOL(kportal_get_ni);
-EXPORT_SYMBOL(kportal_put_ni);
-EXPORT_SYMBOL(kportal_nal_cmd);
 
-module_init(init_kportals_module);
-module_exit (exit_kportals_module);
+module_init(init_libcfs_module);
+module_exit(exit_libcfs_module);
index 4b39902..08446a0 100644 (file)
@@ -62,16 +62,18 @@ extern char debug_file_path[1024];
 extern char portals_upcall[1024];
 
 #define PSDEV_PORTALS  (0x100)
-#define PSDEV_DEBUG           1   /* control debugging */
-#define PSDEV_SUBSYSTEM_DEBUG 2   /* control debugging */
-#define PSDEV_PRINTK          3   /* force all errors to console */
-#define PSDEV_CONSOLE         4   /* allow _any_ messages to console */
-#define PSDEV_DEBUG_PATH      5   /* crashdump log location */
-#define PSDEV_DEBUG_DUMP_PATH 6   /* crashdump tracelog location */
-#define PSDEV_PORTALS_UPCALL  7   /* User mode upcall script  */
-
-#define PORTALS_PRIMARY_CTLCNT 7
-static struct ctl_table portals_table[PORTALS_PRIMARY_CTLCNT + 1] = {
+enum {
+        PSDEV_DEBUG = 1,          /* control debugging */
+        PSDEV_SUBSYSTEM_DEBUG,    /* control debugging */
+        PSDEV_PRINTK,             /* force all errors to console */
+        PSDEV_CONSOLE,            /* allow _any_ messages to console */
+        PSDEV_DEBUG_PATH,         /* crashdump log location */
+        PSDEV_DEBUG_DUMP_PATH,    /* crashdump tracelog location */
+        PSDEV_PORTALS_UPCALL,     /* User mode upcall script  */
+        PSDEV_PORTALS_MEMUSED,    /* bytes currently PORTAL_ALLOCated */
+};
+
+static struct ctl_table portals_table[] = {
         {PSDEV_DEBUG, "debug", &portal_debug, sizeof(int), 0644, NULL,
          &proc_dointvec},
         {PSDEV_SUBSYSTEM_DEBUG, "subsystem_debug", &portal_subsystem_debug,
@@ -83,6 +85,8 @@ static struct ctl_table portals_table[PORTALS_PRIMARY_CTLCNT + 1] = {
         {PSDEV_PORTALS_UPCALL, "upcall", portals_upcall,
          sizeof(portals_upcall), 0644, NULL, &proc_dostring,
          &sysctl_string},
+        {PSDEV_PORTALS_MEMUSED, "memused", (int *)&portal_kmemory.counter,
+         sizeof(int), 0644, NULL, &proc_dointvec},
         {0}
 };
 
index 71067ac..c0f2e71 100644 (file)
@@ -1,6 +1,6 @@
-#MODULES := portals
-#portals-objs := api-eq.o api-init.o api-me.o api-errno.o api-ni.o api-wrap.o
-#portals-objs += lib-dispatch.o lib-init.o lib-me.o lib-msg.o lib-eq.o lib-md.o
-#portals-objs += lib-move.o lib-ni.o lib-pid.o
+MODULES := portals
+portals-objs := api-errno.o api-ni.o api-wrap.o
+portals-objs += lib-init.o lib-me.o lib-msg.o lib-eq.o lib-md.o
+portals-objs += lib-move.o lib-ni.o lib-pid.o module.o
 
 @INCLUDE_RULES@
index de01765..088902a 100644 (file)
@@ -6,7 +6,7 @@
 include $(src)/../Kernelenv
 
 obj-y += portals.o
-portals-objs    :=     lib-dispatch.o lib-eq.o lib-init.o lib-md.o lib-me.o \
+portals-objs    :=     lib-eq.o lib-init.o lib-md.o lib-me.o \
                        lib-move.o lib-msg.o lib-ni.o lib-pid.o \
-                       api-eq.o api-errno.o api-init.o api-me.o api-ni.o \
-                       api-wrap.o module.o
+                       api-errno.o api-ni.o api-wrap.o \
+                       module.o
diff --git a/lustre/portals/portals/api-eq.c b/lustre/portals/portals/api-eq.c
deleted file mode 100644 (file)
index 964b9d8..0000000
+++ /dev/null
@@ -1,202 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * api/api-eq.c
- * User-level event queue management routines
- *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
- *  Copyright (c) 2001-2002 Sandia National Laboratories
- *
- *   This file is part of Lustre, http://www.sf.net/projects/lustre/
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <portals/api-support.h>
-
-int ptl_eq_init(void)
-{
-        /* Nothing to do anymore... */
-        return PTL_OK;
-}
-
-void ptl_eq_fini(void)
-{
-        /* Nothing to do anymore... */
-}
-
-int ptl_eq_ni_init(nal_t * nal)
-{
-        /* Nothing to do anymore... */
-        return PTL_OK;
-}
-
-void ptl_eq_ni_fini(nal_t * nal)
-{
-        /* Nothing to do anymore... */
-}
-
-int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t * ev)
-{
-        ptl_eq_t *eq;
-        int rc, new_index;
-        unsigned long flags;
-        ptl_event_t *new_event;
-        nal_t *nal;
-        ENTRY;
-
-        if (!ptl_init)
-                RETURN(PTL_NOINIT);
-
-        nal = ptl_hndl2nal(&eventq);
-        if (!nal)
-                RETURN(PTL_INV_EQ);
-
-        eq = ptl_handle2usereq(&eventq);
-        nal->lock(nal, &flags);
-
-        /* size must be a power of 2 to handle a wrapped sequence # */
-        LASSERT (eq->size != 0 &&
-                 eq->size == LOWEST_BIT_SET (eq->size));
-
-        new_index = eq->sequence & (eq->size - 1);
-        new_event = &eq->base[new_index];
-        CDEBUG(D_INFO, "new_event: %p, sequence: %lu, eq->size: %u\n",
-               new_event, eq->sequence, eq->size);
-        if (PTL_SEQ_GT (eq->sequence, new_event->sequence)) {
-                nal->unlock(nal, &flags);
-                RETURN(PTL_EQ_EMPTY);
-        }
-
-        *ev = *new_event;
-
-        /* ensure event is delivered correctly despite possible 
-           races with lib_finalize */
-        if (eq->sequence != new_event->sequence) {
-                CERROR("DROPPING EVENT: eq seq %lu ev seq %lu\n",
-                       eq->sequence, new_event->sequence);
-                rc = PTL_EQ_DROPPED;
-        } else {
-                rc = PTL_OK;
-        }
-
-        eq->sequence = new_event->sequence + 1;
-        nal->unlock(nal, &flags);
-        RETURN(rc);
-}
-
-
-int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out)
-{
-        int rc;
-        
-        /* PtlEQGet does the handle checking */
-        while ((rc = PtlEQGet(eventq_in, event_out)) == PTL_EQ_EMPTY) {
-                nal_t *nal = ptl_hndl2nal(&eventq_in);
-                
-                if (nal->yield)
-                        nal->yield(nal);
-        }
-
-        return rc;
-}
-
-#ifndef __KERNEL__
-#if 0
-static jmp_buf eq_jumpbuf;
-
-static void eq_timeout(int signal)
-{
-        sigset_t set;
-
-        /* signal will be automatically disabled in sig handler,
-         * must enable it before long jump
-         */
-        sigemptyset(&set);
-        sigaddset(&set, SIGALRM);
-        sigprocmask(SIG_UNBLOCK, &set, NULL);
-
-        longjmp(eq_jumpbuf, -1);
-}
-
-int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out,
-                      int timeout)
-{
-        static void (*prev) (int) = NULL;
-        static int left_over;
-        time_t time_at_start;
-        int rc;
-
-        if (setjmp(eq_jumpbuf)) {
-                signal(SIGALRM, prev);
-                alarm(left_over - timeout);
-                return PTL_EQ_EMPTY;
-        }
-
-        left_over = alarm(timeout);
-        prev = signal(SIGALRM, eq_timeout);
-        time_at_start = time(NULL);
-        if (left_over && left_over < timeout)
-                alarm(left_over);
-
-        rc = PtlEQWait(eventq_in, event_out);
-
-        signal(SIGALRM, prev);
-        alarm(left_over);       /* Should compute how long we waited */
-
-        return rc;
-}
-#else
-#include <errno.h>
-
-/* FIXME
- * Here timeout need a trick with tcpnal, definitely unclean but OK for
- * this moment.
- */
-
-/* global variables defined by tcpnal */
-extern int __tcpnal_eqwait_timeout_value;
-extern int __tcpnal_eqwait_timedout;
-
-int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out,
-                      int timeout)
-{
-        int rc;
-
-        if (!timeout)
-                return PtlEQWait(eventq_in, event_out);
-
-        __tcpnal_eqwait_timeout_value = timeout;
-
-        while ((rc = PtlEQGet(eventq_in, event_out)) == PTL_EQ_EMPTY) {
-                nal_t *nal = ptl_hndl2nal(&eventq_in);
-                
-                if (nal->yield)
-                        nal->yield(nal);
-
-                if (__tcpnal_eqwait_timedout) {
-                        if (__tcpnal_eqwait_timedout != ETIMEDOUT)
-                                printf("Warning: yield return error %d\n",
-                                        __tcpnal_eqwait_timedout);
-                        rc = PTL_EQ_EMPTY;
-                        break;
-                }
-        }
-
-        __tcpnal_eqwait_timeout_value = 0;
-
-        return rc;
-}
-#endif
-#endif /* __KERNEL__ */
index b5e7aa1..9a4e5ac 100644 (file)
@@ -12,43 +12,37 @@ const char *ptl_err_str[] = {
         "PTL_OK",
         "PTL_SEGV",
 
-        "PTL_NOSPACE",
-        "PTL_INUSE",
+        "PTL_NO_SPACE",
+        "PTL_ME_IN_USE",
         "PTL_VAL_FAILED",
 
         "PTL_NAL_FAILED",
-        "PTL_NOINIT",
-        "PTL_INIT_DUP",
-        "PTL_INIT_INV",
-        "PTL_AC_INV_INDEX",
-
-        "PTL_INV_ASIZE",
-        "PTL_INV_HANDLE",
-        "PTL_INV_MD",
-        "PTL_INV_ME",
-        "PTL_INV_NI",
+        "PTL_NO_INIT",
+        "PTL_IFACE_DUP",
+        "PTL_IFACE_INVALID",
+
+        "PTL_HANDLE_INVALID",
+        "PTL_MD_INVALID",
+        "PTL_ME_INVALID",
 /* If you change these, you must update the number table in portals/errno.h */
-        "PTL_ILL_MD",
-        "PTL_INV_PROC",
-        "PTL_INV_PSIZE",
-        "PTL_INV_PTINDEX",
-        "PTL_INV_REG",
-
-        "PTL_INV_SR_INDX",
-        "PTL_ML_TOOLONG",
-        "PTL_ADDR_UNKNOWN",
-        "PTL_INV_EQ",
+        "PTL_PROCESS_INVALID",
+        "PTL_PT_INDEX_INVALID",
+
+        "PTL_SR_INDEX_INVALID",
+        "PTL_EQ_INVALID",
         "PTL_EQ_DROPPED",
 
         "PTL_EQ_EMPTY",
-        "PTL_NOUPDATE",
+        "PTL_MD_NO_UPDATE",
         "PTL_FAIL",
-        "PTL_NOT_IMPLEMENTED",
-        "PTL_NO_ACK",
 
-        "PTL_IOV_TOO_MANY",
-        "PTL_IOV_TOO_SMALL",
+        "PTL_IOV_INVALID",
+
+        "PTL_EQ_IN_USE",
+
+        "PTL_NI_INVALID",
+        "PTL_MD_ILLEGAL",
 
-        "PTL_EQ_INUSE",
+        "PTL_MAX_ERRNO"
 };
 /* If you change these, you must update the number table in portals/errno.h */
diff --git a/lustre/portals/portals/api-init.c b/lustre/portals/portals/api-init.c
deleted file mode 100644 (file)
index 0a64864..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * api/api-init.c
- * Initialization and global data for the p30 user side library
- *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
- *  Copyright (c) 2001-2002 Sandia National Laboratories
- *
- *   This file is part of Lustre, http://www.sf.net/projects/lustre/
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <portals/api-support.h>
-
-int ptl_init;
-
-int __p30_initialized;
-int __p30_myr_initialized;
-int __p30_ip_initialized;
-ptl_handle_ni_t __myr_ni_handle;
-ptl_handle_ni_t __ip_ni_handle;
-
-int __p30_myr_timeout = 10;
-int __p30_ip_timeout;
-
-int PtlInit(void)
-{
-
-        if (ptl_init)
-                return PTL_OK;
-
-        ptl_ni_init();
-        ptl_me_init();
-        ptl_eq_init();
-        ptl_init = 1;
-        __p30_initialized = 1;
-
-        return PTL_OK;
-}
-
-
-void PtlFini(void)
-{
-
-        /* Reverse order of initialization */
-        ptl_eq_fini();
-        ptl_me_fini();
-        ptl_ni_fini();
-        ptl_init = 0;
-}
diff --git a/lustre/portals/portals/api-me.c b/lustre/portals/portals/api-me.c
deleted file mode 100644 (file)
index e724e58..0000000
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * api/api-me.c
- * Match Entry local operations.
- *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
- *  Copyright (c) 2001-2002 Sandia National Laboratories
- *
- *   This file is part of Lustre, http://www.sf.net/projects/lustre/
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <portals/api-support.h>
-
-int ptl_me_init(void)
-{
-        return PTL_OK;
-}
-void ptl_me_fini(void)
-{                                /* Nothing to do */
-}
-int ptl_me_ni_init(nal_t * nal)
-{
-        return PTL_OK;
-}
-
-void ptl_me_ni_fini(nal_t * nal)
-{                                /* Nothing to do... */
-}
index 18eea91..72d3b41 100644 (file)
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#define DEBUG_SUBSYSTEM S_PORTALS
 #include <portals/api-support.h>
 
+int ptl_init;
+
 /* Put some magic in the NI handle so uninitialised/zeroed handles are easy
  * to spot */
 #define NI_HANDLE_MAGIC  0xebc0de00
 #define NI_HANDLE_MASK   0x000000ff
-#define MAX_NIS          8         
-static nal_t *ptl_interfaces[MAX_NIS];
-int ptl_num_interfaces = 0;
+
+static struct nal_t *ptl_nal_table[NAL_MAX_NR + 1];
+
+#ifdef __KERNEL__
+DECLARE_MUTEX(ptl_mutex);
+
+static void ptl_mutex_enter (void) 
+{
+        down (&ptl_mutex);
+}
+
+static void ptl_mutex_exit (void)
+{
+        up (&ptl_mutex);
+}
+#else
+static void ptl_mutex_enter (void)
+{
+}
+
+static void ptl_mutex_exit (void) 
+{
+}
+#endif
 
 nal_t *ptl_hndl2nal(ptl_handle_any_t *handle)
 {
@@ -42,156 +66,200 @@ nal_t *ptl_hndl2nal(ptl_handle_any_t *handle)
          * invalidated out from under her (or worse, swapped for a
          * completely different interface!) */
 
+        LASSERT (ptl_init);
+
         if (((idx ^ NI_HANDLE_MAGIC) & ~NI_HANDLE_MASK) != 0)
                 return NULL;
 
         idx &= NI_HANDLE_MASK;
-        if (idx < MAX_NIS)
-                return ptl_interfaces[idx];
+        
+        if (idx > NAL_MAX_NR ||
+            ptl_nal_table[idx] == NULL ||
+            ptl_nal_table[idx]->nal_refct == 0)
+                return NULL;
 
-        return NULL;
+        return ptl_nal_table[idx];
 }
 
-int ptl_ni_init(void)
+int ptl_register_nal (ptl_interface_t interface, nal_t *nal)
 {
-        int i;
-
-        LASSERT (MAX_NIS <= (NI_HANDLE_MASK + 1));
+        int    rc;
         
-        for (i = 0; i < MAX_NIS; i++)
-                ptl_interfaces[i] = NULL;
+        ptl_mutex_enter();
+        
+        if (interface < 0 || interface > NAL_MAX_NR)
+                rc = PTL_IFACE_INVALID;
+        else if (ptl_nal_table[interface] != NULL)
+                rc = PTL_IFACE_DUP;
+        else {
+                rc = PTL_OK;
+                ptl_nal_table[interface] = nal;
+                LASSERT(nal->nal_refct == 0);
+        }
 
-        return PTL_OK;
+        ptl_mutex_exit();
+        return (rc);
 }
 
-void ptl_ni_fini(void)
+void ptl_unregister_nal (ptl_interface_t interface)
 {
-        int i;
-
-        for (i = 0; i < MAX_NIS; i++) {
-                nal_t *nal = ptl_interfaces[i];
-                if (!nal)
-                        continue;
+        LASSERT(interface >= 0 && interface <= NAL_MAX_NR);
+        LASSERT(ptl_nal_table[interface] != NULL);
+        LASSERT(ptl_nal_table[interface]->nal_refct == 0);
+        
+        ptl_mutex_enter();
+        
+        ptl_nal_table[interface] = NULL;
 
-                if (nal->shutdown)
-                        nal->shutdown(nal, i);
-        }
+        ptl_mutex_exit();
 }
 
-#ifdef __KERNEL__
-DECLARE_MUTEX(ptl_ni_init_mutex);
-
-static void ptl_ni_init_mutex_enter (void) 
+int PtlInit(int *max_interfaces)
 {
-        down (&ptl_ni_init_mutex);
-}
+        LASSERT(!strcmp(ptl_err_str[PTL_MAX_ERRNO], "PTL_MAX_ERRNO"));
 
-static void ptl_ni_init_mutex_exit (void)
-{
-        up (&ptl_ni_init_mutex);
-}
+        /* If this assertion fails, we need more bits in NI_HANDLE_MASK and
+         * to shift NI_HANDLE_MAGIC left appropriately */
+        LASSERT (NAL_MAX_NR < (NI_HANDLE_MASK + 1));
+        
+        if (max_interfaces != NULL)
+                *max_interfaces = NAL_MAX_NR + 1;
+
+        ptl_mutex_enter();
+
+        if (!ptl_init) {
+                /* NULL pointers, clear flags */
+                memset(ptl_nal_table, 0, sizeof(ptl_nal_table));
+#ifndef __KERNEL__
+                /* Kernel NALs register themselves when their module loads,
+                 * and unregister themselves when their module is unloaded.
+                 * Userspace NALs, are plugged in explicitly here... */
+                {
+                        extern nal_t procapi_nal;
+
+                        /* XXX pretend it's socknal to keep liblustre happy... */
+                        ptl_nal_table[SOCKNAL] = &procapi_nal;
+                        LASSERT (procapi_nal.nal_refct == 0);
+                }
+#endif
+                ptl_init = 1;
+        }
 
-#else
-static void ptl_ni_init_mutex_enter (void)
-{
+        ptl_mutex_exit();
+        
+        return PTL_OK;
 }
 
-static void ptl_ni_init_mutex_exit (void) 
+void PtlFini(void)
 {
-}
+        nal_t  *nal;
+        int     i;
+
+        ptl_mutex_enter();
+
+        if (ptl_init) {
+                for (i = 0; i <= NAL_MAX_NR; i++) {
+
+                        nal = ptl_nal_table[i];
+                        if (nal == NULL)
+                                continue;
+                        
+                        if (nal->nal_refct != 0) {
+                                CWARN("NAL %d has outstanding refcount %d\n",
+                                      i, nal->nal_refct);
+                                nal->nal_ni_fini(nal);
+                        }
+                        
+                        ptl_nal_table[i] = NULL;
+                }
 
-#endif
+                ptl_init = 0;
+        }
+        
+        ptl_mutex_exit();
+}
 
-int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size,
-              ptl_ac_index_t acl_size, ptl_pid_t requested_pid,
-              ptl_handle_ni_t * handle)
+int PtlNIInit(ptl_interface_t interface, ptl_pid_t requested_pid,
+              ptl_ni_limits_t *desired_limits, ptl_ni_limits_t *actual_limits,
+              ptl_handle_ni_t *handle)
 {
         nal_t *nal;
-        int i;
+        int    i;
+        int    rc;
 
         if (!ptl_init)
-                return PTL_NOINIT;
-
-        ptl_ni_init_mutex_enter ();
-
-        nal = interface(ptl_num_interfaces, ptl_size, acl_size, requested_pid);
-
-        if (!nal) {
-                ptl_ni_init_mutex_exit ();
-                return PTL_NAL_FAILED;
+                return PTL_NO_INIT;
+
+        ptl_mutex_enter ();
+
+        if (interface == PTL_IFACE_DEFAULT) {
+                for (i = 0; i <= NAL_MAX_NR; i++)
+                        if (ptl_nal_table[i] != NULL) {
+                                interface = i;
+                                break;
+                        }
+                /* NB if no interfaces are registered, 'interface' will
+                 * fail the valid test below */
         }
-
-        for (i = 0; i < ptl_num_interfaces; i++) {
-                if (ptl_interfaces[i] == nal) {
-                        nal->refct++;
-                        handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | i;
-                        CDEBUG(D_OTHER, "Returning existing NAL (%d)\n", i);
-                        ptl_ni_init_mutex_exit ();
-                        return PTL_OK;
-                }
+        
+        if (interface < 0 || 
+            interface > NAL_MAX_NR ||
+            ptl_nal_table[interface] == NULL) {
+                GOTO(out, rc = PTL_IFACE_INVALID);
         }
-        nal->refct = 1;
 
-        if (ptl_num_interfaces >= MAX_NIS) {
-                if (nal->shutdown)
-                        nal->shutdown (nal, ptl_num_interfaces);
-                ptl_ni_init_mutex_exit ();
-                return PTL_NOSPACE;
-        }
+        nal = ptl_nal_table[interface];
+        nal->nal_handle.nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | interface;
+        nal->nal_handle.cookie = 0;
+        
+        CDEBUG(D_OTHER, "Starting up NAL (%d) refs %d\n", interface, nal->nal_refct);
+        rc = nal->nal_ni_init(nal, requested_pid, desired_limits, actual_limits);
 
-        handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | ptl_num_interfaces;
-        ptl_interfaces[ptl_num_interfaces++] = nal;
+        if (rc != PTL_OK) {
+                CERROR("Error %d starting up NAL %d, refs %d\n", rc,
+                       interface, nal->nal_refct);
+                GOTO(out, rc);
+        }
+        
+        if (nal->nal_refct != 0) {
+                /* Caller gets to know if this was the first ref or not */
+                rc = PTL_IFACE_DUP;
+        }
+        
+        nal->nal_refct++;
+        *handle = nal->nal_handle;
 
-        ptl_eq_ni_init(nal);
-        ptl_me_ni_init(nal);
+ out:
+        ptl_mutex_exit ();
 
-        ptl_ni_init_mutex_exit ();
-        return PTL_OK;
+        return rc;
 }
 
-
 int PtlNIFini(ptl_handle_ni_t ni)
 {
         nal_t *nal;
-        int idx;
-        int rc;
+        int    idx;
 
         if (!ptl_init)
-                return PTL_NOINIT;
+                return PTL_NO_INIT;
 
-        ptl_ni_init_mutex_enter ();
+        ptl_mutex_enter ();
 
         nal = ptl_hndl2nal (&ni);
         if (nal == NULL) {
-                ptl_ni_init_mutex_exit ();
-                return PTL_INV_HANDLE;
+                ptl_mutex_exit ();
+                return PTL_HANDLE_INVALID;
         }
 
         idx = ni.nal_idx & NI_HANDLE_MASK;
 
-        nal->refct--;
-        if (nal->refct > 0) {
-                ptl_ni_init_mutex_exit ();
-                return PTL_OK;
-        }
-
-        ptl_me_ni_fini(nal);
-        ptl_eq_ni_fini(nal);
-
-        rc = PTL_OK;
-        if (nal->shutdown)
-                rc = nal->shutdown(nal, idx);
+        LASSERT(nal->nal_refct > 0);
 
-        ptl_interfaces[idx] = NULL;
-        ptl_num_interfaces--;
+        nal->nal_refct--;
 
-        ptl_ni_init_mutex_exit ();
-        return rc;
-}
-
-int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * ni_out)
-{
-        *ni_out = handle_in;
+        /* nal_refct == 0 tells nal->shutdown to really shut down */
+        nal->nal_ni_fini(nal);
 
+        ptl_mutex_exit ();
         return PTL_OK;
 }
index d23a6aa..37f6c0b 100644 (file)
 # define DEBUG_SUBSYSTEM S_PORTALS
 #include <portals/api-support.h>
 
-static int do_forward(ptl_handle_any_t any_h, int cmd, void *argbuf,
-                      int argsize, void *retbuf, int retsize)
+void PtlSnprintHandle(char *str, int len, ptl_handle_any_t h)
 {
-        nal_t *nal;
-
-        if (!ptl_init) {
-                CERROR("Not initialized\n");
-                return PTL_NOINIT;
-        }
-
-        nal = ptl_hndl2nal(&any_h);
-        if (!nal)
-                return PTL_INV_HANDLE;
-
-        nal->forward(nal, cmd, argbuf, argsize, retbuf, retsize);
+        snprintf(str, len, "0x%lx."LPX64, h.nal_idx, h.cookie);
+}
 
+int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t *ni_out)
+{
+        if (!ptl_init)
+                return PTL_NO_INIT;
+        
+        if (ptl_hndl2nal(&handle_in) == NULL)
+                return PTL_HANDLE_INVALID;
+        
+        *ni_out = handle_in;
         return PTL_OK;
 }
 
 int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id)
 {
-        PtlGetId_in args;
-        PtlGetId_out ret;
-        int rc;
-
-        args.handle_in = ni_handle;
+        nal_t     *nal;
 
-        rc = do_forward(ni_handle, PTL_GETID, &args, sizeof(args), &ret,
-                        sizeof(ret));
-        if (rc != PTL_OK)
-                return rc;
+        if (!ptl_init)
+                return PTL_NO_INIT;
         
-        if (id)
-                *id = ret.id_out;
+        nal = ptl_hndl2nal(&ni_handle);
+        if (nal == NULL)
+                return PTL_NI_INVALID;
 
-        return ret.rc;
+        return nal->nal_get_id(nal, id);
 }
 
-int PtlFailNid (ptl_handle_ni_t interface, ptl_nid_t nid, unsigned int threshold) 
+int PtlGetUid(ptl_handle_ni_t ni_handle, ptl_uid_t *uid)
 {
-        PtlFailNid_in  args;
-        PtlFailNid_out ret;
-        int            rc;
-        
-        args.interface = interface;
-        args.nid       = nid;
-        args.threshold = threshold;
+        nal_t     *nal;
+
+        if (!ptl_init)
+                return PTL_NO_INIT;
         
-        rc = do_forward (interface, PTL_FAILNID, 
-                         &args, sizeof(args), &ret, sizeof (ret));
+        nal = ptl_hndl2nal(&ni_handle);
+        if (nal == NULL)
+                return PTL_NI_INVALID;
 
-        return ((rc != PTL_OK) ? rc : ret.rc);
+        /* We don't support different uids yet */
+        *uid = 0;
+        return PTL_OK;
 }
 
-int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in,
-                ptl_sr_value_t * status_out)
+int PtlFailNid (ptl_handle_ni_t interface, ptl_nid_t nid, unsigned int threshold) 
 {
-        PtlNIStatus_in args;
-        PtlNIStatus_out ret;
-        int rc;
-
-        args.interface_in = interface_in;
-        args.register_in = register_in;
-
-        rc = do_forward(interface_in, PTL_NISTATUS, &args, sizeof(args), &ret,
-                        sizeof(ret));
-
-        if (rc != PTL_OK)
-                return rc;
+        nal_t     *nal;
 
-        if (status_out)
-                *status_out = ret.status_out;
+        if (!ptl_init)
+                return PTL_NO_INIT;
+        
+        nal = ptl_hndl2nal(&interface);
+        if (nal == NULL)
+                return PTL_NI_INVALID;
 
-        return ret.rc;
+        return nal->nal_fail_nid(nal, nid, threshold);
 }
 
-int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in,
-              unsigned long *distance_out)
+int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in,
+                ptl_sr_value_t *status_out)
 {
-        PtlNIDist_in args;
-        PtlNIDist_out ret;
-        int rc;
-
-        args.interface_in = interface_in;
-        args.process_in = process_in;
-
-        rc = do_forward(interface_in, PTL_NIDIST, &args, sizeof(args), &ret,
-                        sizeof(ret));
+        nal_t     *nal;
 
-        if (rc != PTL_OK)
-                return rc;
-
-        if (distance_out)
-                *distance_out = ret.distance_out;
+        if (!ptl_init)
+                return PTL_NO_INIT;
+        
+        nal = ptl_hndl2nal(&interface_in);
+        if (nal == NULL)
+                return PTL_NI_INVALID;
 
-        return ret.rc;
+        return nal->nal_ni_status(nal, register_in, status_out);
 }
 
-
-
-unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in)
+int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in,
+              unsigned long *distance_out)
 {
-        PtlNIDebug_in args;
-        PtlNIDebug_out ret;
-        int rc;
-
-        args.mask_in = mask_in;
+        nal_t     *nal;
 
-        rc = do_forward(ni, PTL_NIDEBUG, &args, sizeof(args), &ret,
-                        sizeof(ret));
-
-        if (rc != PTL_OK)
-                return rc;
+        if (!ptl_init)
+                return PTL_NO_INIT;
+        
+        nal = ptl_hndl2nal(&interface_in);
+        if (nal == NULL)
+                return PTL_NI_INVALID;
 
-        return ret.rc;
+        return nal->nal_ni_dist(nal, &process_in, distance_out);
 }
 
 int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in,
                 ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in,
                 ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in,
-                ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out)
+                ptl_ins_pos_t pos_in, ptl_handle_me_t *handle_out)
 {
-        PtlMEAttach_in args;
-        PtlMEAttach_out ret;
-        int rc;
-
-        args.interface_in = interface_in;
-        args.index_in = index_in;
-        args.match_id_in = match_id_in;
-        args.match_bits_in = match_bits_in;
-        args.ignore_bits_in = ignore_bits_in;
-        args.unlink_in = unlink_in;
-        args.position_in = pos_in;
-
-        rc = do_forward(interface_in, PTL_MEATTACH, &args, sizeof(args), &ret,
-                        sizeof(ret));
-
-        if (rc != PTL_OK)
-                return rc;
-
-        if (handle_out) {
-                handle_out->nal_idx = interface_in.nal_idx;
-                handle_out->cookie = ret.handle_out.cookie;
-        }
-
-        return ret.rc;
+        nal_t     *nal;
+
+        if (!ptl_init)
+                return PTL_NO_INIT;
+        
+        nal = ptl_hndl2nal(&interface_in);
+        if (nal == NULL)
+                return PTL_NI_INVALID;
+
+        return nal->nal_me_attach(nal, index_in, match_id_in, 
+                                  match_bits_in, ignore_bits_in,
+                                  unlink_in, pos_in, handle_out);
 }
 
 int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in,
@@ -179,421 +141,226 @@ int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in,
                 ptl_unlink_t unlink_in, ptl_ins_pos_t position_in,
                 ptl_handle_me_t * handle_out)
 {
-        PtlMEInsert_in args;
-        PtlMEInsert_out ret;
-        int rc;
-
-        args.current_in = current_in;
-        args.match_id_in = match_id_in;
-        args.match_bits_in = match_bits_in;
-        args.ignore_bits_in = ignore_bits_in;
-        args.unlink_in = unlink_in;
-        args.position_in = position_in;
-
-        rc = do_forward(current_in, PTL_MEINSERT, &args, sizeof(args), &ret,
-                        sizeof(ret));
-
-        if (rc != PTL_OK)
-                return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc;
-
-        if (handle_out) {
-                handle_out->nal_idx = current_in.nal_idx;
-                handle_out->cookie = ret.handle_out.cookie;
-        }
-        return ret.rc;
+        nal_t     *nal;
+
+        if (!ptl_init)
+                return PTL_NO_INIT;
+        
+        nal = ptl_hndl2nal(&current_in);
+        if (nal == NULL)
+                return PTL_ME_INVALID;
+
+        return nal->nal_me_insert(nal, &current_in, match_id_in,
+                                  match_bits_in, ignore_bits_in,
+                                  unlink_in, position_in, handle_out);
 }
 
 int PtlMEUnlink(ptl_handle_me_t current_in)
 {
-        PtlMEUnlink_in args;
-        PtlMEUnlink_out ret;
-        int rc;
-
-        args.current_in = current_in;
-        args.unlink_in = PTL_RETAIN;
-
-        rc = do_forward(current_in, PTL_MEUNLINK, &args, sizeof(args), &ret,
-                        sizeof(ret));
+        nal_t     *nal;
 
-        if (rc != PTL_OK)
-                return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc;
+        if (!ptl_init)
+                return PTL_NO_INIT;
+        
+        nal = ptl_hndl2nal(&current_in);
+        if (nal == NULL)
+                return PTL_ME_INVALID;
 
-        return ret.rc;
+        return nal->nal_me_unlink(nal, &current_in);
 }
 
-int PtlTblDump(ptl_handle_ni_t ni, int index_in)
+int PtlMDAttach(ptl_handle_me_t me_in, ptl_md_t md_in,
+                ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out)
 {
-        PtlTblDump_in args;
-        PtlTblDump_out ret;
-        int rc;
+        nal_t     *nal;
 
-        args.index_in = index_in;
-
-        rc = do_forward(ni, PTL_TBLDUMP, &args, sizeof(args), &ret,
-                        sizeof(ret));
+        if (!ptl_init)
+                return PTL_NO_INIT;
+        
+        nal = ptl_hndl2nal(&me_in);
+        if (nal == NULL)
+                return PTL_ME_INVALID;
 
-        if (rc != PTL_OK)
-                return rc;
+        if (!PtlHandleIsEqual(md_in.eq_handle, PTL_EQ_NONE) &&
+            ptl_hndl2nal(&md_in.eq_handle) != nal)
+                return PTL_MD_ILLEGAL;
 
-        return ret.rc;
+        return (nal->nal_md_attach)(nal, &me_in, &md_in, 
+                                    unlink_in, handle_out);
 }
 
-int PtlMEDump(ptl_handle_me_t current_in)
+int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in,
+              ptl_unlink_t unlink_in, ptl_handle_md_t *handle_out)
 {
-        PtlMEDump_in args;
-        PtlMEDump_out ret;
-        int rc;
-
-        args.current_in = current_in;
+        nal_t     *nal;
 
-        rc = do_forward(current_in, PTL_MEDUMP, &args, sizeof(args), &ret,
-                        sizeof(ret));
+        if (!ptl_init)
+                return PTL_NO_INIT;
+        
+        nal = ptl_hndl2nal(&ni_in);
+        if (nal == NULL)
+                return PTL_NI_INVALID;
 
-        if (rc != PTL_OK)
-                return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc;
+        if (!PtlHandleIsEqual(md_in.eq_handle, PTL_EQ_NONE) &&
+            ptl_hndl2nal(&md_in.eq_handle) != nal)
+                return PTL_MD_ILLEGAL;
 
-        return ret.rc;
+        return (nal->nal_md_bind)(nal, &md_in, unlink_in, handle_out);
 }
 
-static int validate_md(ptl_handle_any_t current_in, ptl_md_t md_in)
+int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t *old_inout,
+                ptl_md_t *new_inout, ptl_handle_eq_t testq_in)
 {
-        nal_t *nal;
-        int rc;
-        int i;
+        nal_t    *nal;
+        
+        if (!ptl_init)
+                return PTL_NO_INIT;
+        
+        nal = ptl_hndl2nal(&md_in);
+        if (nal == NULL)
+                return PTL_MD_INVALID;
 
-        if (!ptl_init) {
-                CERROR("PtlMDAttach/Bind/Update: Not initialized\n");
-                return PTL_NOINIT;
-        }
+        if (!PtlHandleIsEqual(testq_in, PTL_EQ_NONE) &&
+            ptl_hndl2nal(&testq_in) != nal)
+                return PTL_EQ_INVALID;
 
-        nal = ptl_hndl2nal(&current_in);
-        if (!nal)
-                return PTL_INV_HANDLE;
-
-        if (nal->validate != NULL)                /* nal->validate not a NOOP */
-        {
-                if ((md_in.options & PTL_MD_IOV) == 0)        /* contiguous */
-                {
-                        rc = nal->validate (nal, md_in.start, md_in.length);
-                        if (rc)
-                                return (PTL_SEGV);
-                }
-                else
-                {
-                        struct iovec *iov = (struct iovec *)md_in.start;
-
-                        for (i = 0; i < md_in.niov; i++, iov++)
-                        {
-                                rc = nal->validate (nal, iov->iov_base, iov->iov_len);
-                                if (rc)
-                                        return (PTL_SEGV);
-                        }
-                }
-        }
-
-        return 0;
+        return (nal->nal_md_update)(nal, &md_in, 
+                                    old_inout, new_inout, &testq_in);
 }
 
-static ptl_handle_eq_t md2eq (ptl_md_t *md)
+int PtlMDUnlink(ptl_handle_md_t md_in)
 {
-        if (PtlHandleEqual (md->eventq, PTL_EQ_NONE))
-                return (PTL_EQ_NONE);
+        nal_t    *nal;
+        
+        if (!ptl_init)
+                return PTL_NO_INIT;
         
-        return (ptl_handle2usereq (&md->eventq)->cb_eq_handle);
+        nal = ptl_hndl2nal(&md_in);
+        if (nal == NULL)
+                return PTL_MD_INVALID;
+        
+        return (nal->nal_md_unlink)(nal, &md_in);
 }
 
-
-int PtlMDAttach(ptl_handle_me_t me_in, ptl_md_t md_in,
-                ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out)
+int PtlEQAlloc(ptl_handle_ni_t interface, ptl_size_t count,
+               ptl_eq_handler_t callback,
+               ptl_handle_eq_t *handle_out)
 {
-        PtlMDAttach_in args;
-        PtlMDAttach_out ret;
-        int rc;
-
-        rc = validate_md(me_in, md_in);
-        if (rc == PTL_OK) {
-                args.eq_in = md2eq(&md_in);
-                args.me_in = me_in;
-                args.md_in = md_in;
-                args.unlink_in = unlink_in;
-                
-                rc = do_forward(me_in, PTL_MDATTACH, 
-                                &args, sizeof(args), &ret, sizeof(ret));
-        }
-
-        if (rc != PTL_OK)
-                return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc;
-
-        if (handle_out) {
-                handle_out->nal_idx = me_in.nal_idx;
-                handle_out->cookie = ret.handle_out.cookie;
-        }
-        return ret.rc;
-}
-
+        nal_t    *nal;
+        
+        if (!ptl_init)
+                return PTL_NO_INIT;
+        
+        nal = ptl_hndl2nal(&interface);
+        if (nal == NULL)
+                return PTL_NI_INVALID;
 
+        return (nal->nal_eq_alloc)(nal, count, callback, handle_out);
+}
 
-int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in,
-                       ptl_handle_md_t * handle_out)
+int PtlEQFree(ptl_handle_eq_t eventq)
 {
-        PtlMDBind_in args;
-        PtlMDBind_out ret;
-        int rc;
-
-        rc = validate_md(ni_in, md_in);
-        if (rc != PTL_OK)
-                return rc;
-
-        args.eq_in = md2eq(&md_in);
-        args.ni_in = ni_in;
-        args.md_in = md_in;
+        nal_t       *nal;
 
-        rc = do_forward(ni_in, PTL_MDBIND, 
-                        &args, sizeof(args), &ret, sizeof(ret));
-
-        if (rc != PTL_OK)
-                return rc;
+        if (!ptl_init)
+                return PTL_NO_INIT;
+        
+        nal = ptl_hndl2nal(&eventq);
+        if (nal == NULL)
+                return PTL_EQ_INVALID;
 
-        if (handle_out) {
-                handle_out->nal_idx = ni_in.nal_idx;
-                handle_out->cookie = ret.handle_out.cookie;
-        }
-        return ret.rc;
+        return (nal->nal_eq_free)(nal, &eventq);
 }
 
-int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t *old_inout,
-                ptl_md_t *new_inout, ptl_handle_eq_t testq_in)
+int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t *ev)
 {
-        PtlMDUpdate_internal_in args;
-        PtlMDUpdate_internal_out ret;
-        int rc;
-
-        args.md_in = md_in;
-
-        if (old_inout) {
-                args.old_inout = *old_inout;
-                args.old_inout_valid = 1;
-        } else
-                args.old_inout_valid = 0;
-
-        if (new_inout) {
-                rc = validate_md (md_in, *new_inout);
-                if (rc != PTL_OK)
-                        return (rc == PTL_INV_HANDLE) ? PTL_INV_MD : rc;
-                args.new_inout = *new_inout;
-                args.new_inout_valid = 1;
-        } else
-                args.new_inout_valid = 0;
-
-        if (PtlHandleEqual (testq_in, PTL_EQ_NONE)) {
-                args.testq_in = PTL_EQ_NONE;
-                args.sequence_in = -1;
-        } else {
-                ptl_eq_t *eq = ptl_handle2usereq (&testq_in);
-                
-                args.testq_in = eq->cb_eq_handle;
-                args.sequence_in = eq->sequence;
-        }
-
-        rc = do_forward(md_in, PTL_MDUPDATE, &args, sizeof(args), &ret,
-                        sizeof(ret));
-        if (rc != PTL_OK)
-                return (rc == PTL_INV_HANDLE) ? PTL_INV_MD : rc;
-
-        if (old_inout)
-                *old_inout = ret.old_inout;
-
-        return ret.rc;
+        int which;
+        
+        return (PtlEQPoll (&eventq, 1, 0, ev, &which));
 }
 
-int PtlMDUnlink(ptl_handle_md_t md_in)
+int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out)
 {
-        PtlMDUnlink_in args;
-        PtlMDUnlink_out ret;
-        int rc;
-
-        args.md_in = md_in;
-        rc = do_forward(md_in, PTL_MDUNLINK, &args, sizeof(args), &ret,
-                        sizeof(ret));
-        if (rc != PTL_OK)
-                return (rc == PTL_INV_HANDLE) ? PTL_INV_MD : rc;
-
-        return ret.rc;
+        int which;
+        
+        return (PtlEQPoll (&eventq_in, 1, PTL_TIME_FOREVER, 
+                           event_out, &which));
 }
 
-int PtlEQAlloc(ptl_handle_ni_t interface, ptl_size_t count,
-               int (*callback) (ptl_event_t * event),
-               ptl_handle_eq_t * handle_out)
+int PtlEQPoll(ptl_handle_eq_t *eventqs_in, int neq_in, int timeout,
+              ptl_event_t *event_out, int *which_out)
 {
-        ptl_eq_t *eq = NULL;
-        ptl_event_t *ev = NULL;
-        PtlEQAlloc_in args;
-        PtlEQAlloc_out ret;
-        int rc, i;
-        nal_t *nal;
+        int           i;
+        nal_t        *nal;
 
         if (!ptl_init)
-                return PTL_NOINIT;
-        
-        nal = ptl_hndl2nal (&interface);
-        if (nal == NULL)
-                return PTL_INV_HANDLE;
-
-        if (count != LOWEST_BIT_SET(count)) {   /* not a power of 2 already */
-                do {                    /* knock off all but the top bit... */
-                        count &= ~LOWEST_BIT_SET (count);
-                } while (count != LOWEST_BIT_SET(count));
-
-                count <<= 1;                             /* ...and round up */
-        }
-
-        if (count == 0)        /* catch bad parameter / overflow on roundup */
-                return (PTL_VAL_FAILED);
-
-        PORTAL_ALLOC(ev, count * sizeof(ptl_event_t));
-        if (!ev)
-                return PTL_NOSPACE;
-
-        for (i = 0; i < count; i++)
-                ev[i].sequence = 0;
-
-        if (nal->validate != NULL) {
-                rc = nal->validate(nal, ev, count * sizeof(ptl_event_t));
-                if (rc != PTL_OK)
-                        goto fail;
-        }
-
-        args.ni_in = interface;
-        args.count_in = count;
-        args.base_in = ev;
-        args.len_in = count * sizeof(*ev);
-        args.callback_in = callback;
-
-        rc = do_forward(interface, PTL_EQALLOC, &args, sizeof(args), &ret,
-                        sizeof(ret));
-        if (rc != PTL_OK)
-                goto fail;
-        if (ret.rc)
-                GOTO(fail, rc = ret.rc);
-
-        PORTAL_ALLOC(eq, sizeof(*eq));
-        if (!eq) {
-                rc = PTL_NOSPACE;
-                goto fail;
-        }
-
-        eq->sequence = 1;
-        eq->size = count;
-        eq->base = ev;
-
-        /* EQ handles are a little wierd.  PtlEQGet() just looks at the
-         * queued events in shared memory.  It doesn't want to do_forward()
-         * at all, so the cookie in the EQ handle we pass out of here is
-         * simply a pointer to the event queue we just set up.  We stash
-         * the handle returned by do_forward(), so we can pass it back via
-         * do_forward() when we need to. */
-
-        eq->cb_eq_handle.nal_idx = interface.nal_idx;
-        eq->cb_eq_handle.cookie = ret.handle_out.cookie;
-
-        handle_out->nal_idx = interface.nal_idx;
-        handle_out->cookie = (__u64)((unsigned long)eq);
-        return PTL_OK;
-
-fail:
-        PORTAL_FREE(ev, count * sizeof(ptl_event_t));
-        return rc;
-}
+                return PTL_NO_INIT;
 
-int PtlEQFree(ptl_handle_eq_t eventq)
-{
-        PtlEQFree_in args;
-        PtlEQFree_out ret;
-        ptl_eq_t *eq;
-        int rc;
+        if (neq_in < 1)
+                return PTL_EQ_INVALID;
 
-        eq = ptl_handle2usereq (&eventq);
-        args.eventq_in = eq->cb_eq_handle;
-
-        rc = do_forward(eq->cb_eq_handle, PTL_EQFREE, &args,
-                        sizeof(args), &ret, sizeof(ret));
+        nal = ptl_hndl2nal(&eventqs_in[0]);
+        if (nal == NULL)
+                return PTL_EQ_INVALID;
 
-        /* XXX we're betting rc == PTL_OK here */
-        PORTAL_FREE(eq->base, eq->size * sizeof(ptl_event_t));
-        PORTAL_FREE(eq, sizeof(*eq));
+        for (i = 1; i < neq_in; i++)
+                if (ptl_hndl2nal(&eventqs_in[i]) != nal)
+                        return PTL_EQ_INVALID;
 
-        return rc;
+        return (nal->nal_eq_poll)(nal, eventqs_in, neq_in, timeout,
+                                  event_out, which_out);
 }
 
+
 int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in,
                ptl_process_id_t match_id_in, ptl_pt_index_t portal_in)
 {
-        PtlACEntry_in args;
-        PtlACEntry_out ret;
-        int rc;
-
-        /*
-         * Copy arguments into the argument block to
-         * hand to the forwarding object
-         */
-        args.ni_in = ni_in;
-        args.index_in = index_in;
-        args.match_id_in = match_id_in;
-        args.portal_in = portal_in;
-
-        rc = do_forward(ni_in, PTL_ACENTRY, &args, sizeof(args), &ret,
-                        sizeof(ret));
-
-        return (rc != PTL_OK) ? rc : ret.rc;
+        nal_t    *nal;
+
+        if (!ptl_init)
+                return PTL_NO_INIT;
+        
+        nal = ptl_hndl2nal(&ni_in);
+        if (nal == NULL)
+                return PTL_NI_INVALID;
+        
+        return (nal->nal_ace_entry)(nal, index_in, match_id_in, portal_in);
 }
 
 int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in,
            ptl_process_id_t target_in, ptl_pt_index_t portal_in,
-           ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in,
+           ptl_ac_index_t ac_in, ptl_match_bits_t match_bits_in,
            ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in)
 {
-        PtlPut_in args;
-        PtlPut_out ret;
-        int rc;
-
-        /*
-         * Copy arguments into the argument block to
-         * hand to the forwarding object
-         */
-        args.md_in = md_in;
-        args.ack_req_in = ack_req_in;
-        args.target_in = target_in;
-        args.portal_in = portal_in;
-        args.cookie_in = cookie_in;
-        args.match_bits_in = match_bits_in;
-        args.offset_in = offset_in;
-        args.hdr_data_in = hdr_data_in;
-
-        rc = do_forward(md_in, PTL_PUT, &args, sizeof(args), &ret, sizeof(ret));
-
-        return (rc != PTL_OK) ? rc : ret.rc;
+        nal_t    *nal;
+
+        if (!ptl_init)
+                return PTL_NO_INIT;
+        
+        nal = ptl_hndl2nal(&md_in);
+        if (nal == NULL)
+                return PTL_MD_INVALID;
+
+        return (nal->nal_put)(nal, &md_in, ack_req_in,
+                              &target_in, portal_in, ac_in,
+                              match_bits_in, offset_in, hdr_data_in);
 }
 
 int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in,
-           ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in,
+           ptl_pt_index_t portal_in, ptl_ac_index_t ac_in,
            ptl_match_bits_t match_bits_in, ptl_size_t offset_in)
 {
-        PtlGet_in args;
-        PtlGet_out ret;
-        int rc;
-
-        /*
-         * Copy arguments into the argument block to
-         * hand to the forwarding object
-         */
-        args.md_in = md_in;
-        args.target_in = target_in;
-        args.portal_in = portal_in;
-        args.cookie_in = cookie_in;
-        args.match_bits_in = match_bits_in;
-        args.offset_in = offset_in;
-
-        rc = do_forward(md_in, PTL_GET, &args, sizeof(args), &ret, sizeof(ret));
-
-        return (rc != PTL_OK) ? rc : ret.rc;
+        nal_t  *nal;
+
+        if (!ptl_init)
+                return PTL_NO_INIT;
+
+        nal = ptl_hndl2nal(&md_in);
+        if (nal == NULL)
+                return PTL_MD_INVALID;
+
+        return (nal->nal_get)(nal, &md_in, 
+                              &target_in, portal_in, ac_in,
+                              match_bits_in, offset_in);
 }
+
index 22565dd..285f8fe 100644 (file)
@@ -3,8 +3,8 @@
 # This code is issued under the GNU General Public License.
 # See the file COPYING in this distribution
 
-my_sources = api-eq.c api-init.c api-me.c api-errno.c api-ni.c api-wrap.c \
-               lib-dispatch.c lib-init.c lib-me.c lib-msg.c lib-eq.c \
+my_sources =    api-errno.c api-ni.c api-wrap.c \
+               lib-init.c lib-me.c lib-msg.c lib-eq.c \
                lib-md.c lib-move.c lib-ni.c lib-pid.c
 
 if !CRAY_PORTALS
@@ -16,11 +16,11 @@ libportals_a_CPPFLAGS = $(LLCPPFLAGS)
 libportals_a_CFLAGS = $(LLCFLAGS)
 endif
 
-#if MODULES
-#modulenet_DATA = portals$(KMODEXT)
-#endif # MODULES
+if MODULES
+modulenet_DATA = portals$(KMODEXT)
+endif # MODULES
 
 endif # CRAY_PORTALS
 
 MOSTLYCLEANFILES = *.o *.ko *.mod.c
-#DIST_SOURCES = $(portals-objs:%.o=%.c)
+DIST_SOURCES = $(portals-objs:%.o=%.c)
diff --git a/lustre/portals/portals/lib-dispatch.c b/lustre/portals/portals/lib-dispatch.c
deleted file mode 100644 (file)
index 13036c7..0000000
+++ /dev/null
@@ -1,80 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * lib/lib-dispatch.c
- *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
- *  Copyright (c) 2001-2002 Sandia National Laboratories
- *
- *   This file is part of Lustre, http://www.sf.net/projects/lustre/
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#define DEBUG_SUBSYSTEM S_PORTALS
-#include <portals/lib-p30.h>
-#include <portals/lib-dispatch.h>
-
-typedef struct {
-        int (*fun) (nal_cb_t * nal, void *private, void *in, void *out);
-        char *name;
-} dispatch_table_t;
-
-static dispatch_table_t dispatch_table[] = {
-        [PTL_GETID] {do_PtlGetId, "PtlGetId"},
-        [PTL_NISTATUS] {do_PtlNIStatus, "PtlNIStatus"},
-        [PTL_NIDIST] {do_PtlNIDist, "PtlNIDist"},
-        [PTL_NIDEBUG] {do_PtlNIDebug, "PtlNIDebug"},
-        [PTL_MEATTACH] {do_PtlMEAttach, "PtlMEAttach"},
-        [PTL_MEINSERT] {do_PtlMEInsert, "PtlMEInsert"},
-        [PTL_MEUNLINK] {do_PtlMEUnlink, "PtlMEUnlink"},
-        [PTL_TBLDUMP] {do_PtlTblDump, "PtlTblDump"},
-        [PTL_MEDUMP] {do_PtlMEDump, "PtlMEDump"},
-        [PTL_MDATTACH] {do_PtlMDAttach, "PtlMDAttach"},
-        [PTL_MDBIND] {do_PtlMDBind, "PtlMDBind"},
-        [PTL_MDUPDATE] {do_PtlMDUpdate_internal, "PtlMDUpdate_internal"},
-        [PTL_MDUNLINK] {do_PtlMDUnlink, "PtlMDUnlink"},
-        [PTL_EQALLOC] {do_PtlEQAlloc_internal, "PtlEQAlloc_internal"},
-        [PTL_EQFREE] {do_PtlEQFree_internal, "PtlEQFree_internal"},
-        [PTL_PUT] {do_PtlPut, "PtlPut"},
-        [PTL_GET] {do_PtlGet, "PtlGet"},
-        [PTL_FAILNID] {do_PtlFailNid, "PtlFailNid"},
-        /*    */ {0, ""}
-};
-
-/*
- * This really should be elsewhere, but lib-p30/dispatch.c is
- * an automatically generated file.
- */
-void lib_dispatch(nal_cb_t * nal, void *private, int index, void *arg_block,
-                  void *ret_block)
-{
-        lib_ni_t *ni = &nal->ni;
-
-        if (index < 0 || index > LIB_MAX_DISPATCH ||
-            !dispatch_table[index].fun) {
-                CDEBUG(D_NET, LPU64": Invalid API call %d\n", ni->nid, index);
-                return;
-        }
-
-        CDEBUG(D_NET, LPU64": API call %s (%d)\n", ni->nid,
-               dispatch_table[index].name, index);
-
-        dispatch_table[index].fun(nal, private, arg_block, ret_block);
-}
-
-char *dispatch_name(int index)
-{
-        return dispatch_table[index].name;
-}
index ce343c1..8ea6fdd 100644 (file)
 
 #define DEBUG_SUBSYSTEM S_PORTALS
 #include <portals/lib-p30.h>
-#include <portals/arg-blocks.h>
 
-int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *v_args,
-                           void *v_ret)
+int 
+lib_api_eq_alloc (nal_t *apinal, ptl_size_t count,
+                  ptl_eq_handler_t callback, 
+                  ptl_handle_eq_t *handle)
 {
-        /*
-         * Incoming:
-         *      ptl_handle_ni_t ni_in
-         *      ptl_size_t count_in
-         *      void                    * base_in
-         *
-         * Outgoing:
-         *      ptl_handle_eq_t         * handle_out
-         */
-
-        PtlEQAlloc_in *args = v_args;
-        PtlEQAlloc_out *ret = v_ret;
-
-        lib_eq_t *eq;
-        unsigned long flags;
-
-        /* api should have rounded up */
-        if (args->count_in != LOWEST_BIT_SET (args->count_in))
-                return ret->rc = PTL_VAL_FAILED;
+        lib_nal_t     *nal = apinal->nal_data;
+        lib_eq_t      *eq;
+        unsigned long  flags;
+        int            rc;
 
+        /* We need count to be a power of 2 so that when eq_{enq,deq}_seq
+         * overflow, they don't skip entries, so the queue has the same
+         * apparant capacity at all times */
+
+        if (count != LOWEST_BIT_SET(count)) {   /* not a power of 2 already */
+                do {                    /* knock off all but the top bit... */
+                        count &= ~LOWEST_BIT_SET (count);
+                } while (count != LOWEST_BIT_SET(count));
+
+                count <<= 1;                             /* ...and round up */
+        }
+
+        if (count == 0)        /* catch bad parameter / overflow on roundup */
+                return (PTL_VAL_FAILED);
+        
         eq = lib_eq_alloc (nal);
         if (eq == NULL)
-                return (ret->rc = PTL_NOSPACE);
+                return (PTL_NO_SPACE);
 
-        state_lock(nal, &flags);
+        PORTAL_ALLOC(eq->eq_events, count * sizeof(ptl_event_t));
+        if (eq->eq_events == NULL) {
+                LIB_LOCK(nal, flags);
+                lib_eq_free (nal, eq);
+                LIB_UNLOCK(nal, flags);
+        }
 
-        if (nal->cb_map != NULL) {
+        if (nal->libnal_map != NULL) {
                 struct iovec iov = {
-                        .iov_base = args->base_in,
-                        .iov_len = args->count_in * sizeof (ptl_event_t) };
+                        .iov_base = eq->eq_events,
+                        .iov_len = count * sizeof(ptl_event_t)};
 
-                ret->rc = nal->cb_map (nal, 1, &iov, &eq->eq_addrkey);
-                if (ret->rc != PTL_OK) {
+                rc = nal->libnal_map(nal, 1, &iov, &eq->eq_addrkey);
+                if (rc != PTL_OK) {
+                        LIB_LOCK(nal, flags);
                         lib_eq_free (nal, eq);
-                        
-                        state_unlock (nal, &flags);
-                        return (ret->rc);
+                        LIB_UNLOCK(nal, flags);
+                        return (rc);
                 }
         }
 
-        eq->sequence = 1;
-        eq->base = args->base_in;
-        eq->size = args->count_in;
+        /* NB this resets all event sequence numbers to 0, to be earlier
+         * than eq_deq_seq */
+        memset(eq->eq_events, 0, count * sizeof(ptl_event_t));
+
+        eq->eq_deq_seq = 1;
+        eq->eq_enq_seq = 1;
+        eq->eq_size = count;
         eq->eq_refcount = 0;
-        eq->event_callback = args->callback_in;
+        eq->eq_callback = callback;
+
+        LIB_LOCK(nal, flags);
 
         lib_initialise_handle (nal, &eq->eq_lh, PTL_COOKIE_TYPE_EQ);
-        list_add (&eq->eq_list, &nal->ni.ni_active_eqs);
+        list_add (&eq->eq_list, &nal->libnal_ni.ni_active_eqs);
 
-        state_unlock(nal, &flags);
+        LIB_UNLOCK(nal, flags);
 
-        ptl_eq2handle(&ret->handle_out, eq);
-        return (ret->rc = PTL_OK);
+        ptl_eq2handle(handle, nal, eq);
+        return (PTL_OK);
 }
 
-int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *v_args,
-                          void *v_ret)
+int 
+lib_api_eq_free(nal_t *apinal, ptl_handle_eq_t *eqh)
 {
-        /*
-         * Incoming:
-         *      ptl_handle_eq_t eventq_in
-         *
-         * Outgoing:
-         */
-
-        PtlEQFree_in *args = v_args;
-        PtlEQFree_out *ret = v_ret;
-        lib_eq_t *eq;
-        long flags;
+        lib_nal_t     *nal = apinal->nal_data;
+        lib_eq_t      *eq;
+        int            size;
+        ptl_event_t   *events;
+        void          *addrkey;
+        unsigned long  flags;
 
-        state_lock (nal, &flags);
+        LIB_LOCK(nal, flags);
 
-        eq = ptl_handle2eq(&args->eventq_in, nal);
+        eq = ptl_handle2eq(eqh, nal);
         if (eq == NULL) {
-                ret->rc = PTL_INV_EQ;
-        } else if (eq->eq_refcount != 0) {
-                ret->rc = PTL_EQ_INUSE;
+                LIB_UNLOCK(nal, flags);
+                return (PTL_EQ_INVALID);
+        }
+
+        if (eq->eq_refcount != 0) {
+                LIB_UNLOCK(nal, flags);
+                return (PTL_EQ_IN_USE);
+        }
+
+        /* stash for free after lock dropped */
+        events  = eq->eq_events;
+        size    = eq->eq_size;
+        addrkey = eq->eq_addrkey;
+
+        lib_invalidate_handle (nal, &eq->eq_lh);
+        list_del (&eq->eq_list);
+        lib_eq_free (nal, eq);
+
+        LIB_UNLOCK(nal, flags);
+
+        if (nal->libnal_unmap != NULL) {
+                struct iovec iov = {
+                        .iov_base = events,
+                        .iov_len = size * sizeof(ptl_event_t)};
+
+                nal->libnal_unmap(nal, 1, &iov, &addrkey);
+        }
+
+        PORTAL_FREE(events, size * sizeof (ptl_event_t));
+
+        return (PTL_OK);
+}
+
+int
+lib_get_event (lib_eq_t *eq, ptl_event_t *ev)
+{
+        int          new_index = eq->eq_deq_seq & (eq->eq_size - 1);
+        ptl_event_t *new_event = &eq->eq_events[new_index];
+        int          rc;
+        ENTRY;
+
+        CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n",
+               new_event, eq->eq_deq_seq, eq->eq_size);
+
+        if (PTL_SEQ_GT (eq->eq_deq_seq, new_event->sequence)) {
+                RETURN(PTL_EQ_EMPTY);
+        }
+
+        /* We've got a new event... */
+        *ev = *new_event;
+
+        /* ...but did it overwrite an event we've not seen yet? */
+        if (eq->eq_deq_seq == new_event->sequence) {
+                rc = PTL_OK;
         } else {
-                if (nal->cb_unmap != NULL) {
-                        struct iovec iov = {
-                                .iov_base = eq->base,
-                                .iov_len = eq->size * sizeof (ptl_event_t) };
-                        
-                        nal->cb_unmap(nal, 1, &iov, &eq->eq_addrkey);
+                CERROR("Event Queue Overflow: eq seq %lu ev seq %lu\n",
+                       eq->eq_deq_seq, new_event->sequence);
+                rc = PTL_EQ_DROPPED;
+        }
+
+        eq->eq_deq_seq = new_event->sequence + 1;
+        RETURN(rc);
+}
+
+
+int
+lib_api_eq_poll (nal_t *apinal, 
+                 ptl_handle_eq_t *eventqs, int neq, int timeout_ms,
+                 ptl_event_t *event, int *which)
+{
+        lib_nal_t       *nal = apinal->nal_data;
+        lib_ni_t        *ni = &nal->libnal_ni;
+        unsigned long    flags;
+        int              i;
+        int              rc;
+#ifdef __KERNEL__
+        wait_queue_t     wq;
+        unsigned long    now;
+#else
+        struct timeval   then;
+        struct timeval   now;
+        struct timespec  ts;
+#endif
+        ENTRY;
+
+        LIB_LOCK(nal, flags);
+
+        for (;;) {
+                for (i = 0; i < neq; i++) {
+                        lib_eq_t *eq = ptl_handle2eq(&eventqs[i], nal);
+
+                        rc = lib_get_event (eq, event);
+                        if (rc != PTL_EQ_EMPTY) {
+                                LIB_UNLOCK(nal, flags);
+                                *which = i;
+                                RETURN(rc);
+                        }
+                }
+                
+                if (timeout_ms == 0) {
+                        LIB_UNLOCK (nal, flags);
+                        RETURN (PTL_EQ_EMPTY);
                 }
 
-                lib_invalidate_handle (nal, &eq->eq_lh);
-                list_del (&eq->eq_list);
-                lib_eq_free (nal, eq);
-                ret->rc = PTL_OK;
-        }
+                /* Some architectures force us to do spin locking/unlocking
+                 * in the same stack frame, means we can abstract the
+                 * locking here */
+#ifdef __KERNEL__
+                init_waitqueue_entry(&wq, current);
+                set_current_state(TASK_INTERRUPTIBLE);
+                add_wait_queue(&ni->ni_waitq, &wq);
 
-        state_unlock (nal, &flags);
+                LIB_UNLOCK(nal, flags);
 
-        return (ret->rc);
+                if (timeout_ms < 0) {
+                        schedule ();
+                } else {
+                        now = jiffies;
+                        schedule_timeout((timeout_ms * HZ)/1000);
+                        timeout_ms -= ((jiffies - now) * 1000)/HZ;
+                        if (timeout_ms < 0)
+                                timeout_ms = 0;
+                }
+                
+                LIB_LOCK(nal, flags);
+#else
+                if (timeout_ms < 0) {
+                        pthread_cond_wait(&ni->ni_cond, &ni->ni_mutex);
+                } else {
+                        gettimeofday(&then, NULL);
+                        
+                        ts.tv_sec = then.tv_sec + timeout_ms/1000;
+                        ts.tv_nsec = then.tv_usec * 1000 + 
+                                     (timeout_ms%1000) * 1000000;
+                        if (ts.tv_nsec >= 1000000000) {
+                                ts.tv_sec++;
+                                ts.tv_nsec -= 1000000000;
+                        }
+                        
+                        pthread_cond_timedwait(&ni->ni_cond,
+                                               &ni->ni_mutex, &ts);
+                        
+                        gettimeofday(&now, NULL);
+                        timeout_ms -= (now.tv_sec - then.tv_sec) * 1000 +
+                                      (now.tv_usec - then.tv_usec) / 1000;
+                        
+                        if (timeout_ms < 0)
+                                timeout_ms = 0;
+                }
+#endif
+        }
 }
index d4d8860..9d97bc1 100644 (file)
 #ifndef PTL_USE_LIB_FREELIST
 
 int
-kportal_descriptor_setup (nal_cb_t *nal)
+kportal_descriptor_setup (lib_nal_t *nal,
+                          ptl_ni_limits_t *requested_limits,
+                          ptl_ni_limits_t *actual_limits)
 {
+        /* Ignore requested limits! */
+        actual_limits->max_mes = INT_MAX;
+        actual_limits->max_mds = INT_MAX;
+        actual_limits->max_eqs = INT_MAX;
+
         return PTL_OK;
 }
 
 void
-kportal_descriptor_cleanup (nal_cb_t *nal)
+kportal_descriptor_cleanup (lib_nal_t *nal)
 {
 }
 #else
 
 int
-lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int n, int size)
+lib_freelist_init (lib_nal_t *nal, lib_freelist_t *fl, int n, int size)
 {
         char *space;
 
@@ -61,9 +68,9 @@ lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int n, int size)
 
         size += offsetof (lib_freeobj_t, fo_contents);
 
-        space = nal->cb_malloc (nal, n * size);
+        PORTAL_ALLOC(space, n * size);
         if (space == NULL)
-                return (PTL_NOSPACE);
+                return (PTL_NO_SPACE);
 
         INIT_LIST_HEAD (&fl->fl_list);
         fl->fl_objs = space;
@@ -81,7 +88,7 @@ lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int n, int size)
 }
 
 void
-lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl)
+lib_freelist_fini (lib_nal_t *nal, lib_freelist_t *fl)
 {
         struct list_head *el;
         int               count;
@@ -95,55 +102,67 @@ lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl)
 
         LASSERT (count == fl->fl_nobjs);
 
-        nal->cb_free (nal, fl->fl_objs, fl->fl_nobjs * fl->fl_objsize);
+        PORTAL_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize);
         memset (fl, 0, sizeof (fl));
 }
 
 int
-kportal_descriptor_setup (nal_cb_t *nal)
+kportal_descriptor_setup (lib_nal_t *nal,
+                          ptl_ni_limits_t *requested_limits,
+                          ptl_ni_limits_t *actual_limits)
 {
         /* NB on failure caller must still call kportal_descriptor_cleanup */
         /*               ******                                            */
-        int rc;
-
-        memset (&nal->ni.ni_free_mes,  0, sizeof (nal->ni.ni_free_mes));
-        memset (&nal->ni.ni_free_msgs, 0, sizeof (nal->ni.ni_free_msgs));
-        memset (&nal->ni.ni_free_mds,  0, sizeof (nal->ni.ni_free_mds));
-        memset (&nal->ni.ni_free_eqs,  0, sizeof (nal->ni.ni_free_eqs));
-
-        rc = lib_freelist_init (nal, &nal->ni.ni_free_mes,
+        lib_ni_t  *ni = &nal->libnal_ni;
+        int        rc;
+
+        memset (&ni->ni_free_mes,  0, sizeof (ni->ni_free_mes));
+        memset (&ni->ni_free_msgs, 0, sizeof (ni->ni_free_msgs));
+        memset (&ni->ni_free_mds,  0, sizeof (ni->ni_free_mds));
+        memset (&ni->ni_free_eqs,  0, sizeof (ni->ni_free_eqs));
+
+        /* Ignore requested limits! */
+        actual_limits->max_mes = MAX_MES;
+        actual_limits->max_mds = MAX_MDS;
+        actual_limits->max_eqs = MAX_EQS;
+        /* Hahahah what a load of bollocks.  There's nowhere to
+         * specify the max # messages in-flight */
+
+        rc = lib_freelist_init (nal, &ni->ni_free_mes,
                                 MAX_MES, sizeof (lib_me_t));
         if (rc != PTL_OK)
                 return (rc);
 
-        rc = lib_freelist_init (nal, &nal->ni.ni_free_msgs,
+        rc = lib_freelist_init (nal, &ni->ni_free_msgs,
                                 MAX_MSGS, sizeof (lib_msg_t));
         if (rc != PTL_OK)
                 return (rc);
 
-        rc = lib_freelist_init (nal, &nal->ni.ni_free_mds,
+        rc = lib_freelist_init (nal, &ni->ni_free_mds,
                                 MAX_MDS, sizeof (lib_md_t));
         if (rc != PTL_OK)
                 return (rc);
 
-        rc = lib_freelist_init (nal, &nal->ni.ni_free_eqs,
+        rc = lib_freelist_init (nal, &ni->ni_free_eqs,
                                 MAX_EQS, sizeof (lib_eq_t));
         return (rc);
 }
 
 void
-kportal_descriptor_cleanup (nal_cb_t *nal)
+kportal_descriptor_cleanup (lib_nal_t *nal)
 {
-        lib_freelist_fini (nal, &nal->ni.ni_free_mes);
-        lib_freelist_fini (nal, &nal->ni.ni_free_msgs);
-        lib_freelist_fini (nal, &nal->ni.ni_free_mds);
-        lib_freelist_fini (nal, &nal->ni.ni_free_eqs);
+        lib_ni_t   *ni = &nal->libnal_ni;
+        
+        lib_freelist_fini (nal, &ni->ni_free_mes);
+        lib_freelist_fini (nal, &ni->ni_free_msgs);
+        lib_freelist_fini (nal, &ni->ni_free_mds);
+        lib_freelist_fini (nal, &ni->ni_free_eqs);
 }
 
 #endif
 
 __u64
-lib_create_interface_cookie (nal_cb_t *nal)
+lib_create_interface_cookie (lib_nal_t *nal)
 {
         /* NB the interface cookie in wire handles guards against delayed
          * replies and ACKs appearing valid in a new instance of the same
@@ -164,9 +183,9 @@ lib_create_interface_cookie (nal_cb_t *nal)
 }
 
 int
-lib_setup_handle_hash (nal_cb_t *nal) 
+lib_setup_handle_hash (lib_nal_t *nal) 
 {
-        lib_ni_t *ni = &nal->ni;
+        lib_ni_t *ni = &nal->libnal_ni;
         int       i;
         
         /* Arbitrary choice of hash table size */
@@ -175,11 +194,10 @@ lib_setup_handle_hash (nal_cb_t *nal)
 #else
         ni->ni_lh_hash_size = (MAX_MES + MAX_MDS + MAX_EQS)/4;
 #endif
-        ni->ni_lh_hash_table = 
-                (struct list_head *)nal->cb_malloc (nal, ni->ni_lh_hash_size
-                                                    * sizeof (struct list_head));
+        PORTAL_ALLOC(ni->ni_lh_hash_table,
+                     ni->ni_lh_hash_size * sizeof (struct list_head));
         if (ni->ni_lh_hash_table == NULL)
-                return (PTL_NOSPACE);
+                return (PTL_NO_SPACE);
         
         for (i = 0; i < ni->ni_lh_hash_size; i++)
                 INIT_LIST_HEAD (&ni->ni_lh_hash_table[i]);
@@ -190,22 +208,22 @@ lib_setup_handle_hash (nal_cb_t *nal)
 }
 
 void
-lib_cleanup_handle_hash (nal_cb_t *nal)
+lib_cleanup_handle_hash (lib_nal_t *nal)
 {
-        lib_ni_t *ni = &nal->ni;
+        lib_ni_t *ni = &nal->libnal_ni;
 
         if (ni->ni_lh_hash_table == NULL)
                 return;
         
-        nal->cb_free (nal, ni->ni_lh_hash_table,
-                      ni->ni_lh_hash_size * sizeof (struct list_head));
+        PORTAL_FREE(ni->ni_lh_hash_table,
+                    ni->ni_lh_hash_size * sizeof (struct list_head));
 }
 
 lib_handle_t *
-lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type) 
+lib_lookup_cookie (lib_nal_t *nal, __u64 cookie, int type) 
 {
         /* ALWAYS called with statelock held */
-        lib_ni_t            *ni = &nal->ni;
+        lib_ni_t            *ni = &nal->libnal_ni;
         struct list_head    *list;
         struct list_head    *el;
         unsigned int         hash;
@@ -227,10 +245,10 @@ lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type)
 }
 
 void
-lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type) 
+lib_initialise_handle (lib_nal_t *nal, lib_handle_t *lh, int type) 
 {
         /* ALWAYS called with statelock held */
-        lib_ni_t       *ni = &nal->ni;
+        lib_ni_t       *ni = &nal->libnal_ni;
         unsigned int    hash;
 
         LASSERT (type >= 0 && type < PTL_COOKIE_TYPES);
@@ -242,99 +260,131 @@ lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type)
 }
 
 void
-lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh)
+lib_invalidate_handle (lib_nal_t *nal, lib_handle_t *lh)
 {
         list_del (&lh->lh_hash_chain);
 }
 
 int
-lib_init(nal_cb_t * nal, ptl_nid_t nid, ptl_pid_t pid, int gsize,
-         ptl_pt_index_t ptl_size, ptl_ac_index_t acl_size)
+lib_init(lib_nal_t *libnal, nal_t *apinal, 
+         ptl_process_id_t process_id,
+         ptl_ni_limits_t *requested_limits,
+         ptl_ni_limits_t *actual_limits)
 {
         int       rc = PTL_OK;
-        lib_ni_t *ni = &nal->ni;
-        int i;
+        lib_ni_t *ni = &libnal->libnal_ni;
+        int       ptl_size;
+        int       i;
         ENTRY;
 
         /* NB serialised in PtlNIInit() */
 
-        if (ni->refcnt != 0) {                       /* already initialised */
-                ni->refcnt++;
-                goto out;
-        }
-
         lib_assert_wire_constants ();
-        
-        /*
-         * Allocate the portal table for this interface
-         * and all per-interface objects.
-         */
-        memset(&ni->counters, 0, sizeof(lib_counters_t));
 
-        rc = kportal_descriptor_setup (nal);
+        /* Setup the API nal with the lib API handling functions */
+        apinal->nal_get_id    = lib_api_get_id;
+        apinal->nal_ni_status = lib_api_ni_status;
+        apinal->nal_ni_dist   = lib_api_ni_dist;
+        apinal->nal_fail_nid  = lib_api_fail_nid;
+        apinal->nal_me_attach = lib_api_me_attach;
+        apinal->nal_me_insert = lib_api_me_insert;
+        apinal->nal_me_unlink = lib_api_me_unlink;
+        apinal->nal_md_attach = lib_api_md_attach;
+        apinal->nal_md_bind   = lib_api_md_bind;
+        apinal->nal_md_unlink = lib_api_md_unlink;
+        apinal->nal_md_update = lib_api_md_update;
+        apinal->nal_eq_alloc  = lib_api_eq_alloc;
+        apinal->nal_eq_free   = lib_api_eq_free;
+        apinal->nal_eq_poll   = lib_api_eq_poll;
+        apinal->nal_put       = lib_api_put;
+        apinal->nal_get       = lib_api_get;
+
+        apinal->nal_data      = libnal;
+        ni->ni_api            = apinal;
+
+        rc = kportal_descriptor_setup (libnal, requested_limits, 
+                                       &ni->ni_actual_limits);
         if (rc != PTL_OK)
                 goto out;
 
+        memset(&ni->ni_counters, 0, sizeof(lib_counters_t));
+
         INIT_LIST_HEAD (&ni->ni_active_msgs);
         INIT_LIST_HEAD (&ni->ni_active_mds);
         INIT_LIST_HEAD (&ni->ni_active_eqs);
-
         INIT_LIST_HEAD (&ni->ni_test_peers);
 
-        ni->ni_interface_cookie = lib_create_interface_cookie (nal);
+#ifdef __KERNEL__
+        spin_lock_init (&ni->ni_lock);
+        init_waitqueue_head (&ni->ni_waitq);
+#else
+        pthread_mutex_init(&ni->ni_mutex, NULL);
+        pthread_cond_init(&ni->ni_cond, NULL);
+#endif
+
+        ni->ni_interface_cookie = lib_create_interface_cookie (libnal);
         ni->ni_next_object_cookie = 0;
-        rc = lib_setup_handle_hash (nal);
+        rc = lib_setup_handle_hash (libnal);
         if (rc != PTL_OK)
                 goto out;
         
-        ni->nid = nid;
-        ni->pid = pid;
-
-        ni->num_nodes = gsize;
-        ni->tbl.size = ptl_size;
-
-        ni->tbl.tbl = nal->cb_malloc(nal, sizeof(struct list_head) * ptl_size);
-        if (ni->tbl.tbl == NULL) {
-                rc = PTL_NOSPACE;
+        ni->ni_pid = process_id;
+
+        if (requested_limits != NULL)
+                ptl_size = requested_limits->max_pt_index + 1;
+        else
+                ptl_size = 64;
+
+        ni->ni_portals.size = ptl_size;
+        PORTAL_ALLOC(ni->ni_portals.tbl,
+                     ptl_size * sizeof(struct list_head));
+        if (ni->ni_portals.tbl == NULL) {
+                rc = PTL_NO_SPACE;
                 goto out;
         }
 
         for (i = 0; i < ptl_size; i++)
-                INIT_LIST_HEAD(&(ni->tbl.tbl[i]));
+                INIT_LIST_HEAD(&(ni->ni_portals.tbl[i]));
+
+        /* max_{mes,mds,eqs} set in kportal_descriptor_setup */
+
+        /* We don't have an access control table! */
+        ni->ni_actual_limits.max_ac_index = -1;
+
+        ni->ni_actual_limits.max_pt_index = ptl_size - 1;
+        ni->ni_actual_limits.max_md_iovecs = PTL_MD_MAX_IOV;
+        ni->ni_actual_limits.max_me_list = INT_MAX;
+
+        /* We don't support PtlGetPut! */
+        ni->ni_actual_limits.max_getput_md = 0;
 
-        ni->debug = PTL_DEBUG_NONE;
-        ni->up = 1;
-        ni->refcnt++;
+        if (actual_limits != NULL)
+                *actual_limits = ni->ni_actual_limits;
 
  out:
         if (rc != PTL_OK) {
-                lib_cleanup_handle_hash (nal);
-                kportal_descriptor_cleanup (nal);
+                lib_cleanup_handle_hash (libnal);
+                kportal_descriptor_cleanup (libnal);
         }
 
         RETURN (rc);
 }
 
 int
-lib_fini(nal_cb_t * nal)
+lib_fini(lib_nal_t *nal)
 {
-        lib_ni_t *ni = &nal->ni;
+        lib_ni_t *ni = &nal->libnal_ni;
         int       idx;
 
-        ni->refcnt--;
-
-        if (ni->refcnt != 0)
-                goto out;
-
-        /* NB no stat_lock() since this is the last reference.  The NAL
+        /* NB no state_lock() since this is the last reference.  The NAL
          * should have shut down already, so it should be safe to unlink
          * and free all descriptors, even those that appear committed to a
          * network op (eg MD with non-zero pending count)
          */
 
-        for (idx = 0; idx < ni->tbl.size; idx++)
-                while (!list_empty (&ni->tbl.tbl[idx])) {
-                        lib_me_t *me = list_entry (ni->tbl.tbl[idx].next,
+        for (idx = 0; idx < ni->ni_portals.size; idx++)
+                while (!list_empty (&ni->ni_portals.tbl[idx])) {
+                        lib_me_t *me = list_entry (ni->ni_portals.tbl[idx].next,
                                                    lib_me_t, me_list);
 
                         CERROR ("Active me %p on exit\n", me);
@@ -369,12 +419,16 @@ lib_fini(nal_cb_t * nal)
                 lib_msg_free (nal, msg);
         }
 
-        nal->cb_free(nal, ni->tbl.tbl, sizeof(struct list_head) * ni->tbl.size);
-        ni->up = 0;
+        PORTAL_FREE(ni->ni_portals.tbl,  
+                    ni->ni_portals.size * sizeof(struct list_head));
 
         lib_cleanup_handle_hash (nal);
         kportal_descriptor_cleanup (nal);
 
- out:
+#ifndef __KERNEL__
+        pthread_mutex_destroy(&ni->ni_mutex);
+        pthread_cond_destroy(&ni->ni_cond);
+#endif
+
         return (PTL_OK);
 }
index a1ed583..6deadb8 100644 (file)
 #endif
 
 #include <portals/lib-p30.h>
-#include <portals/arg-blocks.h>
 
-/*
- * must be called with state lock held
- */
-void lib_md_unlink(nal_cb_t * nal, lib_md_t * md)
+/* must be called with state lock held */
+void
+lib_md_unlink(lib_nal_t *nal, lib_md_t *md)
 {
-        lib_me_t *me = md->me;
+        if ((md->md_flags & PTL_MD_FLAG_ZOMBIE) == 0) {
+                /* first unlink attempt... */
+                lib_me_t *me = md->me;
+
+                md->md_flags |= PTL_MD_FLAG_ZOMBIE;
+
+                /* Disassociate from ME (if any), and unlink it if it was created
+                 * with PTL_UNLINK */
+                if (me != NULL) {
+                        me->md = NULL;
+                        if (me->unlink == PTL_UNLINK)
+                                lib_me_unlink(nal, me);
+                }
+
+                /* ensure all future handle lookups fail */
+                lib_invalidate_handle(nal, &md->md_lh);
+        }
 
         if (md->pending != 0) {
                 CDEBUG(D_NET, "Queueing unlink of md %p\n", md);
-                md->md_flags |= PTL_MD_FLAG_UNLINK;
                 return;
         }
 
         CDEBUG(D_NET, "Unlinking md %p\n", md);
 
         if ((md->options & PTL_MD_KIOV) != 0) {
-                if (nal->cb_unmap_pages != NULL)
-                        nal->cb_unmap_pages (nal, md->md_niov, md->md_iov.kiov, 
-                                             &md->md_addrkey);
-        } else if (nal->cb_unmap != NULL)
-                nal->cb_unmap (nal, md->md_niov, md->md_iov.iov, 
-                               &md->md_addrkey);
-
-        if (me) {
-                me->md = NULL;
-                if (me->unlink == PTL_UNLINK)
-                        lib_me_unlink(nal, me);
+                if (nal->libnal_unmap_pages != NULL)
+                        nal->libnal_unmap_pages (nal, 
+                                                 md->md_niov, 
+                                                 md->md_iov.kiov, 
+                                                 &md->md_addrkey);
+        } else if (nal->libnal_unmap != NULL) {
+                nal->libnal_unmap (nal, 
+                                   md->md_niov, md->md_iov.iov, 
+                                   &md->md_addrkey);
         }
 
-        if (md->eq != NULL)
-        {
+        if (md->eq != NULL) {
                 md->eq->eq_refcount--;
                 LASSERT (md->eq->eq_refcount >= 0);
         }
 
-        lib_invalidate_handle (nal, &md->md_lh);
         list_del (&md->md_list);
         lib_md_free(nal, md);
 }
 
 /* must be called with state lock held */
-static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private,
-                        ptl_md_t *md, ptl_handle_eq_t *eqh, int unlink)
+static int
+lib_md_build(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd, int unlink)
 {
-        const int     max_size_opts = PTL_MD_AUTO_UNLINK |
-                                      PTL_MD_MAX_SIZE;
         lib_eq_t     *eq = NULL;
         int           rc;
         int           i;
+        int           niov;
+        int           total_length = 0;
 
         /* NB we are passed an allocated, but uninitialised/active md.
          * if we return success, caller may lib_md_unlink() it.
          * otherwise caller may only lib_md_free() it.
          */
 
-        if (!PtlHandleEqual (*eqh, PTL_EQ_NONE)) {
-                eq = ptl_handle2eq(eqh, nal);
+        if (!PtlHandleIsEqual (umd->eq_handle, PTL_EQ_NONE)) {
+                eq = ptl_handle2eq(&umd->eq_handle, nal);
                 if (eq == NULL)
-                        return PTL_INV_EQ;
+                        return PTL_EQ_INVALID;
         }
 
-        /* Must check this _before_ allocation.  Also, note that non-iov
-         * MDs must set md_niov to 0. */
-        LASSERT((md->options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0 ||
-                md->niov <= PTL_MD_MAX_IOV);
-
-        if ((md->options & max_size_opts) != 0 && /* max size used */
-            (md->max_size < 0 || md->max_size > md->length)) // illegal max_size
-                return PTL_INV_MD;
-
-        new->me = NULL;
-        new->start = md->start;
-        new->length = md->length;
-        new->offset = 0;
-        new->max_size = md->max_size;
-        new->unlink = unlink;
-        new->options = md->options;
-        new->user_ptr = md->user_ptr;
-        new->eq = eq;
-        new->threshold = md->threshold;
-        new->pending = 0;
-        new->md_flags = 0;
-
-        if ((md->options & PTL_MD_IOV) != 0) {
-                int total_length = 0;
-
-                if ((md->options & PTL_MD_KIOV) != 0) /* Can't specify both */
-                        return PTL_INV_MD; 
-
-                new->md_niov = md->niov;
-                
-                if (nal->cb_read (nal, private, new->md_iov.iov, md->start,
-                                  md->niov * sizeof (new->md_iov.iov[0])))
-                        return PTL_SEGV;
-
-                for (i = 0; i < new->md_niov; i++) {
+        /* This implementation doesn't know how to create START events or
+         * disable END events.  Best to LASSERT our caller is compliant so
+         * we find out quickly...  */
+        LASSERT (eq == NULL ||
+                 ((umd->options & PTL_MD_EVENT_START_DISABLE) != 0 &&
+                  (umd->options & PTL_MD_EVENT_END_DISABLE) == 0));
+
+        lmd->me = NULL;
+        lmd->start = umd->start;
+        lmd->offset = 0;
+        lmd->max_size = umd->max_size;
+        lmd->options = umd->options;
+        lmd->user_ptr = umd->user_ptr;
+        lmd->eq = eq;
+        lmd->threshold = umd->threshold;
+        lmd->pending = 0;
+        lmd->md_flags = (unlink == PTL_UNLINK) ? PTL_MD_FLAG_AUTO_UNLINK : 0;
+
+        if ((umd->options & PTL_MD_IOVEC) != 0) {
+
+                if ((umd->options & PTL_MD_KIOV) != 0) /* Can't specify both */
+                        return PTL_MD_ILLEGAL; 
+
+                lmd->md_niov = niov = umd->length;
+                memcpy(lmd->md_iov.iov, umd->start,
+                       niov * sizeof (lmd->md_iov.iov[0]));
+
+                for (i = 0; i < niov; i++) {
                         /* We take the base address on trust */
-                        if (new->md_iov.iov[i].iov_len <= 0) /* invalid length */
-                                return PTL_VAL_FAILED;
+                        if (lmd->md_iov.iov[i].iov_len <= 0) /* invalid length */
+                                return PTL_MD_ILLEGAL;
 
-                        total_length += new->md_iov.iov[i].iov_len;
+                        total_length += lmd->md_iov.iov[i].iov_len;
                 }
 
-                if (md->length > total_length)
-                        return PTL_IOV_TOO_SMALL;
-                
-                if (nal->cb_map != NULL) {
-                        rc = nal->cb_map (nal, new->md_niov, new->md_iov.iov, 
-                                          &new->md_addrkey);
+                lmd->length = total_length;
+
+                if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */
+                    (umd->max_size < 0 || 
+                     umd->max_size > total_length)) // illegal max_size
+                        return PTL_MD_ILLEGAL;
+
+                if (nal->libnal_map != NULL) {
+                        rc = nal->libnal_map (nal, niov, lmd->md_iov.iov, 
+                                              &lmd->md_addrkey);
                         if (rc != PTL_OK)
                                 return (rc);
                 }
-        } else if ((md->options & PTL_MD_KIOV) != 0) {
+        } else if ((umd->options & PTL_MD_KIOV) != 0) {
 #ifndef __KERNEL__
-                return PTL_INV_MD;
-#else
-                int total_length = 0;
-                
+                return PTL_MD_ILLEGAL;
+#else                
                 /* Trap attempt to use paged I/O if unsupported early. */
-                if (nal->cb_send_pages == NULL ||
-                    nal->cb_recv_pages == NULL)
-                        return PTL_INV_MD;
+                if (nal->libnal_send_pages == NULL ||
+                    nal->libnal_recv_pages == NULL)
+                        return PTL_MD_INVALID;
 
-                new->md_niov = md->niov;
+                lmd->md_niov = niov = umd->length;
+                memcpy(lmd->md_iov.kiov, umd->start,
+                       niov * sizeof (lmd->md_iov.kiov[0]));
 
-                if (nal->cb_read (nal, private, new->md_iov.kiov, md->start,
-                                  md->niov * sizeof (new->md_iov.kiov[0])))
-                        return PTL_SEGV;
-                
-                for (i = 0; i < new->md_niov; i++) {
+                for (i = 0; i < niov; i++) {
                         /* We take the page pointer on trust */
-                        if (new->md_iov.kiov[i].kiov_offset + 
-                            new->md_iov.kiov[i].kiov_len > PAGE_SIZE )
+                        if (lmd->md_iov.kiov[i].kiov_offset + 
+                            lmd->md_iov.kiov[i].kiov_len > PAGE_SIZE )
                                 return PTL_VAL_FAILED; /* invalid length */
 
-                        total_length += new->md_iov.kiov[i].kiov_len;
+                        total_length += lmd->md_iov.kiov[i].kiov_len;
                 }
 
-                if (md->length > total_length)
-                        return PTL_IOV_TOO_SMALL;
+                lmd->length = total_length;
+
+                if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */
+                    (umd->max_size < 0 || 
+                     umd->max_size > total_length)) // illegal max_size
+                        return PTL_MD_ILLEGAL;
 
-                if (nal->cb_map_pages != NULL) {
-                        rc = nal->cb_map_pages (nal, new->md_niov, new->md_iov.kiov, 
-                                                &new->md_addrkey);
+                if (nal->libnal_map_pages != NULL) {
+                        rc = nal->libnal_map_pages (nal, niov, lmd->md_iov.kiov, 
+                                                    &lmd->md_addrkey);
                         if (rc != PTL_OK)
                                 return (rc);
                 }
 #endif
         } else {   /* contiguous */
-                new->md_niov = 1;
-                new->md_iov.iov[0].iov_base = md->start;
-                new->md_iov.iov[0].iov_len = md->length;
-
-                if (nal->cb_map != NULL) {
-                        rc = nal->cb_map (nal, new->md_niov, new->md_iov.iov, 
-                                          &new->md_addrkey);
+                lmd->length = umd->length;
+                lmd->md_niov = niov = 1;
+                lmd->md_iov.iov[0].iov_base = umd->start;
+                lmd->md_iov.iov[0].iov_len = umd->length;
+
+                if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */
+                    (umd->max_size < 0 || 
+                     umd->max_size > umd->length)) // illegal max_size
+                        return PTL_MD_ILLEGAL;
+
+                if (nal->libnal_map != NULL) {
+                        rc = nal->libnal_map (nal, niov, lmd->md_iov.iov, 
+                                              &lmd->md_addrkey);
                         if (rc != PTL_OK)
                                 return (rc);
                 }
@@ -198,140 +210,125 @@ static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private,
                 eq->eq_refcount++;
 
         /* It's good; let handle2md succeed and add to active mds */
-        lib_initialise_handle (nal, &new->md_lh, PTL_COOKIE_TYPE_MD);
-        list_add (&new->md_list, &nal->ni.ni_active_mds);
+        lib_initialise_handle (nal, &lmd->md_lh, PTL_COOKIE_TYPE_MD);
+        list_add (&lmd->md_list, &nal->libnal_ni.ni_active_mds);
 
         return PTL_OK;
 }
 
 /* must be called with state lock held */
-void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md, ptl_md_t * new)
+void
+lib_md_deconstruct(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd)
 {
         /* NB this doesn't copy out all the iov entries so when a
          * discontiguous MD is copied out, the target gets to know the
          * original iov pointer (in start) and the number of entries it had
          * and that's all.
          */
-        new->start = md->start;
-        new->length = md->length;
-        new->threshold = md->threshold;
-        new->max_size = md->max_size;
-        new->options = md->options;
-        new->user_ptr = md->user_ptr;
-        ptl_eq2handle(&new->eventq, md->eq);
-        new->niov = ((md->options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0) ? 0 : md->md_niov;
+        umd->start = lmd->start;
+        umd->length = ((lmd->options & (PTL_MD_IOVEC | PTL_MD_KIOV)) == 0) ?
+                      lmd->length : lmd->md_niov;
+        umd->threshold = lmd->threshold;
+        umd->max_size = lmd->max_size;
+        umd->options = lmd->options;
+        umd->user_ptr = lmd->user_ptr;
+        ptl_eq2handle(&umd->eq_handle, nal, lmd->eq);
 }
 
-int do_PtlMDAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+int 
+lib_api_md_attach(nal_t *apinal, ptl_handle_me_t *meh,
+                  ptl_md_t *umd, ptl_unlink_t unlink, 
+                  ptl_handle_md_t *handle)
 {
-        /*
-         * Incoming:
-         *      ptl_handle_me_t current_in
-         *      ptl_md_t md_in
-         *      ptl_unlink_t unlink_in
-         *
-         * Outgoing:
-         *      ptl_handle_md_t         * handle_out
-         */
-
-        PtlMDAttach_in *args = v_args;
-        PtlMDAttach_out *ret = v_ret;
-        lib_me_t *me;
-        lib_md_t *md;
+        lib_nal_t    *nal = apinal->nal_data;
+        lib_me_t     *me;
+        lib_md_t     *md;
         unsigned long flags;
+        int           rc;
 
-        if ((args->md_in.options & (PTL_MD_KIOV | PTL_MD_IOV)) != 0 &&
-            args->md_in.niov > PTL_MD_MAX_IOV) /* too many fragments */
-                return (ret->rc = PTL_IOV_TOO_MANY);
+        if ((umd->options & (PTL_MD_KIOV | PTL_MD_IOVEC)) != 0 &&
+            umd->length > PTL_MD_MAX_IOV) /* too many fragments */
+                return PTL_IOV_INVALID;
 
-        md = lib_md_alloc(nal, &args->md_in);
+        md = lib_md_alloc(nal, umd);
         if (md == NULL)
-                return (ret->rc = PTL_NOSPACE);
+                return PTL_NO_SPACE;
 
-        state_lock(nal, &flags);
+        LIB_LOCK(nal, flags);
 
-        me = ptl_handle2me(&args->me_in, nal);
+        me = ptl_handle2me(meh, nal);
         if (me == NULL) {
-                ret->rc = PTL_INV_ME;
+                rc = PTL_ME_INVALID;
         } else if (me->md != NULL) {
-                ret->rc = PTL_INUSE;
+                rc = PTL_ME_IN_USE;
         } else {
-                ret->rc = lib_md_build(nal, md, private, &args->md_in,
-                                       &args->eq_in, args->unlink_in);
-
-                if (ret->rc == PTL_OK) {
+                rc = lib_md_build(nal, md, umd, unlink);
+                if (rc == PTL_OK) {
                         me->md = md;
                         md->me = me;
 
-                        ptl_md2handle(&ret->handle_out, md);
+                        ptl_md2handle(handle, nal, md);
 
-                        state_unlock (nal, &flags);
+                        LIB_UNLOCK(nal, flags);
                         return (PTL_OK);
                 }
         }
 
         lib_md_free (nal, md);
 
-        state_unlock (nal, &flags);
-        return (ret->rc);
+        LIB_UNLOCK(nal, flags);
+        return (rc);
 }
 
-int do_PtlMDBind(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+int
+lib_api_md_bind(nal_t *apinal, 
+                ptl_md_t *umd, ptl_unlink_t unlink,
+                ptl_handle_md_t *handle)
 {
-        /*
-         * Incoming:
-         *      ptl_handle_ni_t ni_in
-         *      ptl_md_t md_in
-         *
-         * Outgoing:
-         *      ptl_handle_md_t         * handle_out
-         */
-
-        PtlMDBind_in *args = v_args;
-        PtlMDBind_out *ret = v_ret;
-        lib_md_t *md;
+        lib_nal_t    *nal = apinal->nal_data;
+        lib_md_t     *md;
         unsigned long flags;
+        int           rc;
 
-        if ((args->md_in.options & (PTL_MD_KIOV | PTL_MD_IOV)) != 0 &&
-            args->md_in.niov > PTL_MD_MAX_IOV) /* too many fragments */
-                return (ret->rc = PTL_IOV_TOO_MANY);
+        if ((umd->options & (PTL_MD_KIOV | PTL_MD_IOVEC)) != 0 &&
+            umd->length > PTL_MD_MAX_IOV) /* too many fragments */
+                return PTL_IOV_INVALID;
 
-        md = lib_md_alloc(nal, &args->md_in);
+        md = lib_md_alloc(nal, umd);
         if (md == NULL)
-                return (ret->rc = PTL_NOSPACE);
+                return PTL_NO_SPACE;
 
-        state_lock(nal, &flags);
+        LIB_LOCK(nal, flags);
 
-        ret->rc = lib_md_build(nal, md, private,
-                               &args->md_in, &args->eq_in, PTL_UNLINK);
+        rc = lib_md_build(nal, md, umd, unlink);
 
-        if (ret->rc == PTL_OK) {
-                ptl_md2handle(&ret->handle_out, md);
+        if (rc == PTL_OK) {
+                ptl_md2handle(handle, nal, md);
 
-                state_unlock(nal, &flags);
+                LIB_UNLOCK(nal, flags);
                 return (PTL_OK);
         }
 
         lib_md_free (nal, md);
 
-        state_unlock(nal, &flags);
-        return (ret->rc);
+        LIB_UNLOCK(nal, flags);
+        return (rc);
 }
 
-int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+int
+lib_api_md_unlink (nal_t *apinal, ptl_handle_md_t *mdh)
 {
-        PtlMDUnlink_in  *args = v_args;
-        PtlMDUnlink_out *ret = v_ret;
+        lib_nal_t       *nal = apinal->nal_data;
         ptl_event_t      ev;
         lib_md_t        *md;
         unsigned long    flags;
 
-        state_lock(nal, &flags);
+        LIB_LOCK(nal, flags);
 
-        md = ptl_handle2md(&args->md_in, nal);
+        md = ptl_handle2md(mdh, nal);
         if (md == NULL) {
-                state_unlock(nal, &flags);
-                return (ret->rc = PTL_INV_MD);
+                LIB_UNLOCK(nal, flags);
+                return PTL_MD_INVALID;
         }
 
         /* If the MD is busy, lib_md_unlink just marks it for deletion, and
@@ -343,104 +340,87 @@ int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
                 memset(&ev, 0, sizeof(ev));
 
                 ev.type = PTL_EVENT_UNLINK;
-                ev.status = PTL_OK;
+                ev.ni_fail_type = PTL_OK;
                 ev.unlinked = 1;
-                lib_md_deconstruct(nal, md, &ev.mem_desc);
+                lib_md_deconstruct(nal, md, &ev.md);
+                ptl_md2handle(&ev.md_handle, nal, md);
                 
-                lib_enq_event_locked(nal, private, md->eq, &ev);
+                lib_enq_event_locked(nal, NULL, md->eq, &ev);
         }
 
-        lib_md_deconstruct(nal, md, &ret->status_out);
         lib_md_unlink(nal, md);
-        ret->rc = PTL_OK;
-
-        state_unlock(nal, &flags);
 
-        return (PTL_OK);
+        LIB_UNLOCK(nal, flags);
+        return PTL_OK;
 }
 
-int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args,
-                            void *v_ret)
+int
+lib_api_md_update (nal_t *apinal,
+                   ptl_handle_md_t *mdh,
+                   ptl_md_t *oldumd, ptl_md_t *newumd,
+                   ptl_handle_eq_t *testqh)
 {
-        /*
-         * Incoming:
-         *      ptl_handle_md_t md_in
-         *      ptl_md_t                * old_inout
-         *      ptl_md_t                * new_inout
-         *      ptl_handle_eq_t testq_in
-         *      ptl_seq_t               sequence_in
-         *
-         * Outgoing:
-         *      ptl_md_t                * old_inout
-         *      ptl_md_t                * new_inout
-         */
-        PtlMDUpdate_internal_in *args = v_args;
-        PtlMDUpdate_internal_out *ret = v_ret;
-        lib_md_t *md;
-        lib_eq_t *test_eq = NULL;
-        ptl_md_t *new = &args->new_inout;
+        lib_nal_t    *nal = apinal->nal_data;
+        lib_md_t     *md;
+        lib_eq_t     *test_eq = NULL;
         unsigned long flags;
+        int           rc;
 
-        state_lock(nal, &flags);
+        LIB_LOCK(nal, flags);
 
-        md = ptl_handle2md(&args->md_in, nal);
+        md = ptl_handle2md(mdh, nal);
         if (md == NULL) {
-                 ret->rc = PTL_INV_MD;
+                 rc = PTL_MD_INVALID;
                  goto out;
         }
 
-        if (args->old_inout_valid)
-                lib_md_deconstruct(nal, md, &ret->old_inout);
+        if (oldumd != NULL)
+                lib_md_deconstruct(nal, md, oldumd);
 
-        if (!args->new_inout_valid) {
-                ret->rc = PTL_OK;
+        if (newumd == NULL) {
+                rc = PTL_OK;
                 goto out;
         }
 
-        /* XXX fttb, the new MD must be the same type wrt fragmentation */
-        if (((new->options ^ md->options) & 
-             (PTL_MD_IOV | PTL_MD_KIOV)) != 0) {
-                ret->rc = PTL_INV_MD;
-                goto out;
-        }
-
-        if (new->niov > md->md_niov) {
-                ret->rc = PTL_IOV_TOO_MANY;
+        /* XXX fttb, the new MD must be the same "shape" wrt fragmentation,
+         * since we simply overwrite the old lib-md */
+        if ((((newumd->options ^ md->options) & 
+              (PTL_MD_IOVEC | PTL_MD_KIOV)) != 0) ||
+            ((newumd->options & (PTL_MD_IOVEC | PTL_MD_KIOV)) != 0 && 
+             newumd->length != md->md_niov)) {
+                rc = PTL_IOV_INVALID;
                 goto out;
         } 
 
-        if (new->niov < md->md_niov) {
-                ret->rc = PTL_IOV_TOO_SMALL;
-                goto out;
-        }
-
-        if (!PtlHandleEqual (args->testq_in, PTL_EQ_NONE)) {
-                test_eq = ptl_handle2eq(&args->testq_in, nal);
+        if (!PtlHandleIsEqual (*testqh, PTL_EQ_NONE)) {
+                test_eq = ptl_handle2eq(testqh, nal);
                 if (test_eq == NULL) {
-                        ret->rc = PTL_INV_EQ;
+                        rc = PTL_EQ_INVALID;
                         goto out;
                 }
         }
 
         if (md->pending != 0) {
-                        ret->rc = PTL_NOUPDATE;
-                        goto out;
+                rc = PTL_MD_NO_UPDATE;
+                goto out;
         }
 
         if (test_eq == NULL ||
-            test_eq->sequence == args->sequence_in) {
+            test_eq->eq_deq_seq == test_eq->eq_enq_seq) {
                 lib_me_t *me = md->me;
+                int       unlink = (md->md_flags & PTL_MD_FLAG_AUTO_UNLINK) ?
+                                   PTL_UNLINK : PTL_RETAIN;
 
                 // #warning this does not track eq refcounts properly 
-                ret->rc = lib_md_build(nal, md, private,
-                                       new, &new->eventq, md->unlink);
+                rc = lib_md_build(nal, md, newumd, unlink);
 
                 md->me = me;
         } else {
-                ret->rc = PTL_NOUPDATE;
+                rc = PTL_MD_NO_UPDATE;
         }
 
  out:
-        state_unlock(nal, &flags);
-        return (ret->rc);
+        LIB_UNLOCK(nal, flags);
+
+        return rc;
 }
index 31ac214..9665b4f 100644 (file)
 #endif
 
 #include <portals/lib-p30.h>
-#include <portals/arg-blocks.h>
 
-static void lib_me_dump(nal_cb_t * nal, lib_me_t * me);
-
-int do_PtlMEAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+int
+lib_api_me_attach(nal_t *apinal,
+                  ptl_pt_index_t portal,
+                  ptl_process_id_t match_id, 
+                  ptl_match_bits_t match_bits, 
+                  ptl_match_bits_t ignore_bits,
+                  ptl_unlink_t unlink, ptl_ins_pos_t pos,
+                  ptl_handle_me_t *handle)
 {
-        PtlMEAttach_in *args = v_args;
-        PtlMEAttach_out *ret = v_ret;
-        lib_ni_t *ni = &nal->ni;
-        lib_ptl_t *tbl = &ni->tbl;
+        lib_nal_t    *nal = apinal->nal_data;
+        lib_ni_t     *ni = &nal->libnal_ni;
+        lib_ptl_t    *tbl = &ni->ni_portals;
+        lib_me_t     *me;
         unsigned long flags;
-        lib_me_t *me;
 
-        if (args->index_in >= tbl->size)
-                return ret->rc = PTL_INV_PTINDEX;
+        if (portal >= tbl->size)
+                return PTL_PT_INDEX_INVALID;
 
         /* Should check for valid matchid, but not yet */
-        if (0)
-                return ret->rc = PTL_INV_PROC;
 
         me = lib_me_alloc (nal);
         if (me == NULL)
-                return (ret->rc = PTL_NOSPACE);
+                return PTL_NO_SPACE;
 
-        state_lock(nal, &flags);
+        LIB_LOCK(nal, flags);
 
-        me->match_id = args->match_id_in;
-        me->match_bits = args->match_bits_in;
-        me->ignore_bits = args->ignore_bits_in;
-        me->unlink = args->unlink_in;
+        me->match_id = match_id;
+        me->match_bits = match_bits;
+        me->ignore_bits = ignore_bits;
+        me->unlink = unlink;
         me->md = NULL;
 
         lib_initialise_handle (nal, &me->me_lh, PTL_COOKIE_TYPE_ME);
 
-        if (args->position_in == PTL_INS_AFTER)
-                list_add_tail(&me->me_list, &(tbl->tbl[args->index_in]));
+        if (pos == PTL_INS_AFTER)
+                list_add_tail(&me->me_list, &(tbl->tbl[portal]));
         else
-                list_add(&me->me_list, &(tbl->tbl[args->index_in]));
+                list_add(&me->me_list, &(tbl->tbl[portal]));
 
-        ptl_me2handle(&ret->handle_out, me);
+        ptl_me2handle(handle, nal, me);
 
-        state_unlock(nal, &flags);
+        LIB_UNLOCK(nal, flags);
 
-        return ret->rc = PTL_OK;
+        return PTL_OK;
 }
 
-int do_PtlMEInsert(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+int
+lib_api_me_insert(nal_t *apinal,
+                  ptl_handle_me_t *current_meh,
+                  ptl_process_id_t match_id, 
+                  ptl_match_bits_t match_bits, 
+                  ptl_match_bits_t ignore_bits,
+                  ptl_unlink_t unlink, ptl_ins_pos_t pos,
+                  ptl_handle_me_t *handle)
 {
-        PtlMEInsert_in *args = v_args;
-        PtlMEInsert_out *ret = v_ret;
+        lib_nal_t    *nal = apinal->nal_data;
+        lib_me_t     *current_me;
+        lib_me_t     *new_me;
         unsigned long flags;
-        lib_me_t *me;
-        lib_me_t *new;
 
-        new = lib_me_alloc (nal);
-        if (new == NULL)
-                return (ret->rc = PTL_NOSPACE);
+        new_me = lib_me_alloc (nal);
+        if (new_me == NULL)
+                return PTL_NO_SPACE;
 
         /* Should check for valid matchid, but not yet */
 
-        state_lock(nal, &flags);
+        LIB_LOCK(nal, flags);
 
-        me = ptl_handle2me(&args->current_in, nal);
-        if (me == NULL) {
-                lib_me_free (nal, new);
+        current_me = ptl_handle2me(current_meh, nal);
+        if (current_me == NULL) {
+                lib_me_free (nal, new_me);
 
-                state_unlock (nal, &flags);
-                return (ret->rc = PTL_INV_ME);
+                LIB_UNLOCK(nal, flags);
+                return PTL_ME_INVALID;
         }
 
-        new->match_id = args->match_id_in;
-        new->match_bits = args->match_bits_in;
-        new->ignore_bits = args->ignore_bits_in;
-        new->unlink = args->unlink_in;
-        new->md = NULL;
+        new_me->match_id = match_id;
+        new_me->match_bits = match_bits;
+        new_me->ignore_bits = ignore_bits;
+        new_me->unlink = unlink;
+        new_me->md = NULL;
 
-        lib_initialise_handle (nal, &new->me_lh, PTL_COOKIE_TYPE_ME);
+        lib_initialise_handle (nal, &new_me->me_lh, PTL_COOKIE_TYPE_ME);
 
-        if (args->position_in == PTL_INS_AFTER)
-                list_add_tail(&new->me_list, &me->me_list);
+        if (pos == PTL_INS_AFTER)
+                list_add_tail(&new_me->me_list, &current_me->me_list);
         else
-                list_add(&new->me_list, &me->me_list);
+                list_add(&new_me->me_list, &current_me->me_list);
 
-        ptl_me2handle(&ret->handle_out, new);
+        ptl_me2handle(handle, nal, new_me);
 
-        state_unlock(nal, &flags);
+        LIB_UNLOCK(nal, flags);
 
-        return ret->rc = PTL_OK;
+        return PTL_OK;
 }
 
-int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+int
+lib_api_me_unlink (nal_t *apinal, ptl_handle_me_t *meh)
 {
-        PtlMEUnlink_in *args = v_args;
-        PtlMEUnlink_out *ret = v_ret;
+        lib_nal_t    *nal = apinal->nal_data;
         unsigned long flags;
-        lib_me_t *me;
+        lib_me_t     *me;
+        int           rc;
 
-        state_lock(nal, &flags);
+        LIB_LOCK(nal, flags);
 
-        me = ptl_handle2me(&args->current_in, nal);
+        me = ptl_handle2me(meh, nal);
         if (me == NULL) {
-                ret->rc = PTL_INV_ME;
+                rc = PTL_ME_INVALID;
         } else {
                 lib_me_unlink(nal, me);
-                ret->rc = PTL_OK;
+                rc = PTL_OK;
         }
 
-        state_unlock(nal, &flags);
+        LIB_UNLOCK(nal, flags);
 
-        return (ret->rc);
+        return (rc);
 }
 
 /* call with state_lock please */
-void lib_me_unlink(nal_cb_t *nal, lib_me_t *me)
+void 
+lib_me_unlink(lib_nal_t *nal, lib_me_t *me)
 {
-        lib_ni_t *ni = &nal->ni;
-
-        if (ni->debug & PTL_DEBUG_UNLINK) {
-                ptl_handle_any_t handle;
-                ptl_me2handle(&handle, me);
-        }
-
         list_del (&me->me_list);
 
         if (me->md) {
@@ -164,64 +166,20 @@ void lib_me_unlink(nal_cb_t *nal, lib_me_t *me)
         lib_me_free(nal, me);
 }
 
-int do_PtlTblDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
-{
-        PtlTblDump_in *args = v_args;
-        PtlTblDump_out *ret = v_ret;
-        lib_ptl_t *tbl = &nal->ni.tbl;
-        ptl_handle_any_t handle;
-        struct list_head *tmp;
-        unsigned long flags;
-
-        if (args->index_in < 0 || args->index_in >= tbl->size)
-                return ret->rc = PTL_INV_PTINDEX;
-
-        nal->cb_printf(nal, "Portal table index %d\n", args->index_in);
-
-        state_lock(nal, &flags);
-        list_for_each(tmp, &(tbl->tbl[args->index_in])) {
-                lib_me_t *me = list_entry(tmp, lib_me_t, me_list);
-                ptl_me2handle(&handle, me);
-                lib_me_dump(nal, me);
-        }
-        state_unlock(nal, &flags);
-
-        return ret->rc = PTL_OK;
-}
-
-int do_PtlMEDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+#if 0
+static void 
+lib_me_dump(lib_nal_t *nal, lib_me_t * me)
 {
-        PtlMEDump_in *args = v_args;
-        PtlMEDump_out *ret = v_ret;
-        lib_me_t *me;
-        unsigned long flags;
-
-        state_lock(nal, &flags);
-
-        me = ptl_handle2me(&args->current_in, nal);
-        if (me == NULL) {
-                ret->rc = PTL_INV_ME;
-        } else {
-                lib_me_dump(nal, me);
-                ret->rc = PTL_OK;
-        }
+        CWARN("Match Entry %p ("LPX64")\n", me, 
+              me->me_lh.lh_cookie);
 
-        state_unlock(nal, &flags);
+        CWARN("\tMatch/Ignore\t= %016lx / %016lx\n",
+              me->match_bits, me->ignore_bits);
 
-        return ret->rc;
-}
-
-static void lib_me_dump(nal_cb_t * nal, lib_me_t * me)
-{
-        nal->cb_printf(nal, "Match Entry %p ("LPX64")\n", me, 
-                       me->me_lh.lh_cookie);
-
-        nal->cb_printf(nal, "\tMatch/Ignore\t= %016lx / %016lx\n",
-                       me->match_bits, me->ignore_bits);
-
-        nal->cb_printf(nal, "\tMD\t= %p\n", me->md);
-        nal->cb_printf(nal, "\tprev\t= %p\n",
-                       list_entry(me->me_list.prev, lib_me_t, me_list));
-        nal->cb_printf(nal, "\tnext\t= %p\n",
-                       list_entry(me->me_list.next, lib_me_t, me_list));
+        CWARN("\tMD\t= %p\n", me->md);
+        CWARN("\tprev\t= %p\n",
+              list_entry(me->me_list.prev, lib_me_t, me_list));
+        CWARN("\tnext\t= %p\n",
+              list_entry(me->me_list.next, lib_me_t, me_list));
 }
+#endif
index ecd543c..d584f1c 100644 (file)
 #endif
 #include <portals/p30.h>
 #include <portals/lib-p30.h>
-#include <portals/arg-blocks.h>
 
-/*
- * Right now it does not check access control lists.
- *
- * We only support one MD per ME, which is how the Portals 3.1 spec is written.
- * All previous complication is removed.
- */
+/* forward ref */
+static void lib_commit_md (lib_nal_t *nal, lib_md_t *md, lib_msg_t *msg);
 
-static lib_me_t *
-lib_find_me(nal_cb_t *nal, int index, int op_mask, ptl_nid_t src_nid,
-            ptl_pid_t src_pid, ptl_size_t rlength, ptl_size_t roffset,
-            ptl_match_bits_t match_bits, ptl_size_t *mlength_out,
-            ptl_size_t *offset_out, int *unlink_out)
+static lib_md_t *
+lib_match_md(lib_nal_t *nal, int index, int op_mask, 
+             ptl_nid_t src_nid, ptl_pid_t src_pid, 
+             ptl_size_t rlength, ptl_size_t roffset,
+             ptl_match_bits_t match_bits, lib_msg_t *msg,
+             ptl_size_t *mlength_out, ptl_size_t *offset_out)
 {
-        lib_ni_t         *ni = &nal->ni;
-        struct list_head *match_list = &ni->tbl.tbl[index];
+        lib_ni_t         *ni = &nal->libnal_ni;
+        struct list_head *match_list = &ni->ni_portals.tbl[index];
         struct list_head *tmp;
         lib_me_t         *me;
         lib_md_t         *md;
         ptl_size_t        mlength;
         ptl_size_t        offset;
-
         ENTRY;
 
         CDEBUG (D_NET, "Request from "LPU64".%d of length %d into portal %d "
                 "MB="LPX64"\n", src_nid, src_pid, rlength, index, match_bits);
 
-        if (index < 0 || index >= ni->tbl.size) {
+        if (index < 0 || index >= ni->ni_portals.size) {
                 CERROR("Invalid portal %d not in [0-%d]\n",
-                       index, ni->tbl.size);
+                       index, ni->ni_portals.size);
                 goto failed;
         }
 
@@ -75,18 +70,21 @@ lib_find_me(nal_cb_t *nal, int index, int op_mask, ptl_nid_t src_nid,
 
                 LASSERT (me == md->me);
 
-                /* MD deactivated */
-                if (md->threshold == 0)
-                        continue;
-
                 /* mismatched MD op */
                 if ((md->options & op_mask) == 0)
                         continue;
 
+                /* MD exhausted */
+                if (lib_md_exhausted(md))
+                        continue;
+
                 /* mismatched ME nid/pid? */
                 if (me->match_id.nid != PTL_NID_ANY &&
                     me->match_id.nid != src_nid)
                         continue;
+                
+                CDEBUG(D_NET, "match_id.pid [%x], src_pid [%x]\n",
+                       me->match_id.pid, src_pid);
 
                 if (me->match_id.pid != PTL_PID_ANY &&
                     me->match_id.pid != src_pid)
@@ -103,10 +101,12 @@ lib_find_me(nal_cb_t *nal, int index, int op_mask, ptl_nid_t src_nid,
                 else
                         offset = roffset;
 
-                mlength = md->length - offset;
-                if ((md->options & PTL_MD_MAX_SIZE) != 0 &&
-                    mlength > md->max_size)
+                if ((md->options & PTL_MD_MAX_SIZE) != 0) {
                         mlength = md->max_size;
+                        LASSERT (md->offset + mlength <= md->length);
+                } else {
+                        mlength = md->length - offset;
+                }
 
                 if (rlength <= mlength) {        /* fits in allowed space */
                         mlength = rlength;
@@ -118,78 +118,103 @@ lib_find_me(nal_cb_t *nal, int index, int op_mask, ptl_nid_t src_nid,
                         goto failed;
                 }
 
+                /* Commit to this ME/MD */
+                CDEBUG(D_NET, "Incoming %s index %x from "LPU64"/%u of "
+                       "length %d/%d into md "LPX64" [%d] + %d\n", 
+                       (op_mask == PTL_MD_OP_PUT) ? "put" : "get",
+                       index, src_nid, src_pid, mlength, rlength, 
+                       md->md_lh.lh_cookie, md->md_niov, offset);
+
+                lib_commit_md(nal, md, msg);
                 md->offset = offset + mlength;
 
+                /* NB Caller sets ev.type and ev.hdr_data */
+                msg->ev.initiator.nid = src_nid;
+                msg->ev.initiator.pid = src_pid;
+                msg->ev.pt_index = index;
+                msg->ev.match_bits = match_bits;
+                msg->ev.rlength = rlength;
+                msg->ev.mlength = mlength;
+                msg->ev.offset = offset;
+
+                lib_md_deconstruct(nal, md, &msg->ev.md);
+                ptl_md2handle(&msg->ev.md_handle, nal, md);
+
                 *offset_out = offset;
                 *mlength_out = mlength;
-                *unlink_out = ((md->options & PTL_MD_AUTO_UNLINK) != 0 &&
-                               md->offset >= (md->length - md->max_size));
-                RETURN (me);
+
+                /* Auto-unlink NOW, so the ME gets unlinked if required.
+                 * We bumped md->pending above so the MD just gets flagged
+                 * for unlink when it is finalized. */
+                if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINK) != 0 &&
+                    lib_md_exhausted(md))
+                        lib_md_unlink(nal, md);
+
+                RETURN (md);
         }
 
  failed:
         CERROR (LPU64": Dropping %s from "LPU64".%d portal %d match "LPX64
                 " offset %d length %d: no match\n",
-                ni->nid, (op_mask == PTL_MD_OP_GET) ? "GET" : "PUT",
+                ni->ni_pid.nid, (op_mask == PTL_MD_OP_GET) ? "GET" : "PUT",
                 src_nid, src_pid, index, match_bits, roffset, rlength);
         RETURN(NULL);
 }
 
-int do_PtlFailNid (nal_cb_t *nal, void *private, void *v_args, void *v_ret)
+int lib_api_fail_nid (nal_t *apinal, ptl_nid_t nid, unsigned int threshold)
 {
-        PtlFailNid_in     *args = v_args;
-        PtlFailNid_out    *ret  = v_ret;
+        lib_nal_t         *nal = apinal->nal_data;
         lib_test_peer_t   *tp;
         unsigned long      flags;
         struct list_head  *el;
         struct list_head  *next;
         struct list_head   cull;
         
-        if (args->threshold != 0) {
+        if (threshold != 0) {
                 /* Adding a new entry */
-                tp = (lib_test_peer_t *)nal->cb_malloc (nal, sizeof (*tp));
+                PORTAL_ALLOC(tp, sizeof(*tp));
                 if (tp == NULL)
-                        return (ret->rc = PTL_FAIL);
+                        return PTL_NO_SPACE;
                 
-                tp->tp_nid = args->nid;
-                tp->tp_threshold = args->threshold;
+                tp->tp_nid = nid;
+                tp->tp_threshold = threshold;
                 
-                state_lock (nal, &flags);
-                list_add (&tp->tp_list, &nal->ni.ni_test_peers);
-                state_unlock (nal, &flags);
-                return (ret->rc = PTL_OK);
+                LIB_LOCK(nal, flags);
+                list_add_tail (&tp->tp_list, &nal->libnal_ni.ni_test_peers);
+                LIB_UNLOCK(nal, flags);
+                return PTL_OK;
         }
         
         /* removing entries */
         INIT_LIST_HEAD (&cull);
         
-        state_lock (nal, &flags);
+        LIB_LOCK(nal, flags);
 
-        list_for_each_safe (el, next, &nal->ni.ni_test_peers) {
+        list_for_each_safe (el, next, &nal->libnal_ni.ni_test_peers) {
                 tp = list_entry (el, lib_test_peer_t, tp_list);
                 
                 if (tp->tp_threshold == 0 ||    /* needs culling anyway */
-                    args->nid == PTL_NID_ANY || /* removing all entries */
-                    tp->tp_nid == args->nid)    /* matched this one */
+                    nid == PTL_NID_ANY ||       /* removing all entries */
+                    tp->tp_nid == nid)          /* matched this one */
                 {
                         list_del (&tp->tp_list);
                         list_add (&tp->tp_list, &cull);
                 }
         }
         
-        state_unlock (nal, &flags);
+        LIB_UNLOCK(nal, flags);
                 
         while (!list_empty (&cull)) {
                 tp = list_entry (cull.next, lib_test_peer_t, tp_list);
 
                 list_del (&tp->tp_list);
-                nal->cb_free (nal, tp, sizeof (*tp));
+                PORTAL_FREE(tp, sizeof (*tp));
         }
-        return (ret->rc = PTL_OK);
+        return PTL_OK;
 }
 
 static int
-fail_peer (nal_cb_t *nal, ptl_nid_t nid, int outgoing) 
+fail_peer (lib_nal_t *nal, ptl_nid_t nid, int outgoing) 
 {
         lib_test_peer_t  *tp;
         struct list_head *el;
@@ -200,9 +225,9 @@ fail_peer (nal_cb_t *nal, ptl_nid_t nid, int outgoing)
 
         INIT_LIST_HEAD (&cull);
         
-        state_lock (nal, &flags);
+        LIB_LOCK (nal, flags);
 
-        list_for_each_safe (el, next, &nal->ni.ni_test_peers) {
+        list_for_each_safe (el, next, &nal->libnal_ni.ni_test_peers) {
                 tp = list_entry (el, lib_test_peer_t, tp_list);
 
                 if (tp->tp_threshold == 0) {
@@ -234,13 +259,13 @@ fail_peer (nal_cb_t *nal, ptl_nid_t nid, int outgoing)
                 }
         }
         
-        state_unlock (nal, &flags);
+        LIB_UNLOCK (nal, flags);
 
         while (!list_empty (&cull)) {
                 tp = list_entry (cull.next, lib_test_peer_t, tp_list);
                 list_del (&tp->tp_list);
                 
-                nal->cb_free (nal, tp, sizeof (*tp));
+                PORTAL_FREE(tp, sizeof (*tp));
         }
 
         return (fail);
@@ -531,52 +556,52 @@ lib_extract_kiov (int dst_niov, ptl_kiov_t *dst,
 #endif
 
 ptl_err_t
-lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
+lib_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
           ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen)
 {
         if (mlen == 0)
-                return (nal->cb_recv(nal, private, msg,
-                                     0, NULL,
-                                     offset, mlen, rlen));
+                return (nal->libnal_recv(nal, private, msg,
+                                         0, NULL,
+                                         offset, mlen, rlen));
 
         if ((md->options & PTL_MD_KIOV) == 0)
-                return (nal->cb_recv(nal, private, msg,
-                                     md->md_niov, md->md_iov.iov, 
-                                     offset, mlen, rlen));
+                return (nal->libnal_recv(nal, private, msg,
+                                         md->md_niov, md->md_iov.iov, 
+                                         offset, mlen, rlen));
 
-        return (nal->cb_recv_pages(nal, private, msg, 
-                                   md->md_niov, md->md_iov.kiov,
-                                   offset, mlen, rlen));
+        return (nal->libnal_recv_pages(nal, private, msg, 
+                                       md->md_niov, md->md_iov.kiov,
+                                       offset, mlen, rlen));
 }
 
 ptl_err_t
-lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
+lib_send (lib_nal_t *nal, void *private, lib_msg_t *msg,
           ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
           lib_md_t *md, ptl_size_t offset, ptl_size_t len) 
 {
         if (len == 0)
-                return (nal->cb_send(nal, private, msg,
-                                     hdr, type, nid, pid,
-                                     0, NULL,
-                                     offset, len));
+                return (nal->libnal_send(nal, private, msg,
+                                         hdr, type, nid, pid,
+                                         0, NULL,
+                                         offset, len));
         
         if ((md->options & PTL_MD_KIOV) == 0)
-                return (nal->cb_send(nal, private, msg, 
-                                     hdr, type, nid, pid,
-                                     md->md_niov, md->md_iov.iov,
-                                     offset, len));
-
-        return (nal->cb_send_pages(nal, private, msg, 
-                                   hdr, type, nid, pid,
-                                   md->md_niov, md->md_iov.kiov,
-                                   offset, len));
+                return (nal->libnal_send(nal, private, msg, 
+                                         hdr, type, nid, pid,
+                                         md->md_niov, md->md_iov.iov,
+                                         offset, len));
+
+        return (nal->libnal_send_pages(nal, private, msg, 
+                                       hdr, type, nid, pid,
+                                       md->md_niov, md->md_iov.kiov,
+                                       offset, len));
 }
 
 static void
-lib_commit_md (nal_cb_t *nal, lib_md_t *md, lib_msg_t *msg)
+lib_commit_md (lib_nal_t *nal, lib_md_t *md, lib_msg_t *msg)
 {
-        /* ALWAYS called holding the state_lock */
-        lib_counters_t *counters = &nal->ni.counters;
+        /* ALWAYS called holding the LIB_LOCK */
+        lib_counters_t *counters = &nal->libnal_ni.ni_counters;
 
         /* Here, we commit the MD to a network OP by marking it busy and
          * decrementing its threshold.  Come what may, the network "owns"
@@ -593,11 +618,11 @@ lib_commit_md (nal_cb_t *nal, lib_md_t *md, lib_msg_t *msg)
         if (counters->msgs_alloc > counters->msgs_max)
                 counters->msgs_max = counters->msgs_alloc;
 
-        list_add (&msg->msg_list, &nal->ni.ni_active_msgs);
+        list_add (&msg->msg_list, &nal->libnal_ni.ni_active_msgs);
 }
 
 static void
-lib_drop_message (nal_cb_t *nal, void *private, ptl_hdr_t *hdr)
+lib_drop_message (lib_nal_t *nal, void *private, ptl_hdr_t *hdr)
 {
         unsigned long flags;
 
@@ -605,10 +630,10 @@ lib_drop_message (nal_cb_t *nal, void *private, ptl_hdr_t *hdr)
          * to receive (init_msg() not called) and therefore can't cause an
          * event. */
         
-        state_lock(nal, &flags);
-        nal->ni.counters.drop_count++;
-        nal->ni.counters.drop_length += hdr->payload_length;
-        state_unlock(nal, &flags);
+        LIB_LOCK(nal, flags);
+        nal->libnal_ni.ni_counters.drop_count++;
+        nal->libnal_ni.ni_counters.drop_length += hdr->payload_length;
+        LIB_UNLOCK(nal, flags);
 
         /* NULL msg => if NAL calls lib_finalize it will be a noop */
         (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length);
@@ -622,146 +647,98 @@ lib_drop_message (nal_cb_t *nal, void *private, ptl_hdr_t *hdr)
  *
  */
 static ptl_err_t
-parse_put(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
+parse_put(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
 {
-        lib_ni_t        *ni = &nal->ni;
+        lib_ni_t        *ni = &nal->libnal_ni;
         ptl_size_t       mlength = 0;
         ptl_size_t       offset = 0;
-        int              unlink = 0;
         ptl_err_t        rc;
-        lib_me_t        *me;
         lib_md_t        *md;
         unsigned long    flags;
                 
         /* Convert put fields to host byte order */
-        hdr->msg.put.match_bits = NTOH__u64 (hdr->msg.put.match_bits);
-        hdr->msg.put.ptl_index = NTOH__u32 (hdr->msg.put.ptl_index);
-        hdr->msg.put.offset = NTOH__u32 (hdr->msg.put.offset);
-
-        state_lock(nal, &flags);
-
-        me = lib_find_me(nal, hdr->msg.put.ptl_index, PTL_MD_OP_PUT,
-                         hdr->src_nid, hdr->src_pid,
-                         hdr->payload_length, hdr->msg.put.offset,
-                         hdr->msg.put.match_bits,
-                         &mlength, &offset, &unlink);
-        if (me == NULL) {
-                state_unlock(nal, &flags);
+        hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits);
+        hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index);
+        hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset);
+
+        LIB_LOCK(nal, flags);
+
+        md = lib_match_md(nal, hdr->msg.put.ptl_index, PTL_MD_OP_PUT,
+                          hdr->src_nid, hdr->src_pid,
+                          hdr->payload_length, hdr->msg.put.offset,
+                          hdr->msg.put.match_bits, msg,
+                          &mlength, &offset);
+        if (md == NULL) {
+                LIB_UNLOCK(nal, flags);
                 return (PTL_FAIL);
         }
 
-        md = me->md;
-        CDEBUG(D_NET, "Incoming put index %x from "LPU64"/%u of length %d/%d "
-               "into md "LPX64" [%d] + %d\n", hdr->msg.put.ptl_index,
-               hdr->src_nid, hdr->src_pid, mlength, hdr->payload_length, 
-               md->md_lh.lh_cookie, md->md_niov, offset);
-
-        lib_commit_md(nal, md, msg);
-
-        msg->ev.type = PTL_EVENT_PUT;
-        msg->ev.initiator.nid = hdr->src_nid;
-        msg->ev.initiator.pid = hdr->src_pid;
-        msg->ev.portal = hdr->msg.put.ptl_index;
-        msg->ev.match_bits = hdr->msg.put.match_bits;
-        msg->ev.rlength = hdr->payload_length;
-        msg->ev.mlength = mlength;
-        msg->ev.offset = offset;
+        msg->ev.type = PTL_EVENT_PUT_END;
         msg->ev.hdr_data = hdr->msg.put.hdr_data;
 
-        lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
-
         if (!ptl_is_wire_handle_none(&hdr->msg.put.ack_wmd) &&
             !(md->options & PTL_MD_ACK_DISABLE)) {
                 msg->ack_wmd = hdr->msg.put.ack_wmd;
         }
 
-        ni->counters.recv_count++;
-        ni->counters.recv_length += mlength;
+        ni->ni_counters.recv_count++;
+        ni->ni_counters.recv_length += mlength;
 
-        /* only unlink after MD's pending count has been bumped in
-         * lib_commit_md() otherwise lib_me_unlink() will nuke it */
-        if (unlink)
-                lib_me_unlink (nal, me);
-
-        state_unlock(nal, &flags);
+        LIB_UNLOCK(nal, flags);
 
         rc = lib_recv(nal, private, msg, md, offset, mlength,
                       hdr->payload_length);
         if (rc != PTL_OK)
                 CERROR(LPU64": error on receiving PUT from "LPU64": %d\n",
-                       ni->nid, hdr->src_nid, rc);
+                       ni->ni_pid.nid, hdr->src_nid, rc);
 
         return (rc);
 }
 
 static ptl_err_t
-parse_get(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
+parse_get(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
 {
-        lib_ni_t        *ni = &nal->ni;
+        lib_ni_t        *ni = &nal->libnal_ni;
         ptl_size_t       mlength = 0;
         ptl_size_t       offset = 0;
-        int              unlink = 0;
-        lib_me_t        *me;
         lib_md_t        *md;
         ptl_hdr_t        reply;
         unsigned long    flags;
         int              rc;
 
         /* Convert get fields to host byte order */
-        hdr->msg.get.match_bits = NTOH__u64 (hdr->msg.get.match_bits);
-        hdr->msg.get.ptl_index = NTOH__u32 (hdr->msg.get.ptl_index);
-        hdr->msg.get.sink_length = NTOH__u32 (hdr->msg.get.sink_length);
-        hdr->msg.get.src_offset = NTOH__u32 (hdr->msg.get.src_offset);
-
-        state_lock(nal, &flags);
-
-        me = lib_find_me(nal, hdr->msg.get.ptl_index, PTL_MD_OP_GET,
-                         hdr->src_nid, hdr->src_pid,
-                         hdr->msg.get.sink_length, hdr->msg.get.src_offset,
-                         hdr->msg.get.match_bits,
-                         &mlength, &offset, &unlink);
-        if (me == NULL) {
-                state_unlock(nal, &flags);
+        hdr->msg.get.match_bits = le64_to_cpu(hdr->msg.get.match_bits);
+        hdr->msg.get.ptl_index = le32_to_cpu(hdr->msg.get.ptl_index);
+        hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length);
+        hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset);
+
+        LIB_LOCK(nal, flags);
+
+        md = lib_match_md(nal, hdr->msg.get.ptl_index, PTL_MD_OP_GET,
+                          hdr->src_nid, hdr->src_pid,
+                          hdr->msg.get.sink_length, hdr->msg.get.src_offset,
+                          hdr->msg.get.match_bits, msg,
+                          &mlength, &offset);
+        if (md == NULL) {
+                LIB_UNLOCK(nal, flags);
                 return (PTL_FAIL);
         }
 
-        md = me->md;
-        CDEBUG(D_NET, "Incoming get index %d from "LPU64".%u of length %d/%d "
-               "from md "LPX64" [%d] + %d\n", hdr->msg.get.ptl_index,
-               hdr->src_nid, hdr->src_pid, mlength, hdr->payload_length, 
-               md->md_lh.lh_cookie, md->md_niov, offset);
-
-        lib_commit_md(nal, md, msg);
-
-        msg->ev.type = PTL_EVENT_GET;
-        msg->ev.initiator.nid = hdr->src_nid;
-        msg->ev.initiator.pid = hdr->src_pid;
-        msg->ev.portal = hdr->msg.get.ptl_index;
-        msg->ev.match_bits = hdr->msg.get.match_bits;
-        msg->ev.rlength = hdr->payload_length;
-        msg->ev.mlength = mlength;
-        msg->ev.offset = offset;
+        msg->ev.type = PTL_EVENT_GET_END;
         msg->ev.hdr_data = 0;
 
-        lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
-
-        ni->counters.send_count++;
-        ni->counters.send_length += mlength;
+        ni->ni_counters.send_count++;
+        ni->ni_counters.send_length += mlength;
 
-        /* only unlink after MD's refcount has been bumped in
-         * lib_commit_md() otherwise lib_me_unlink() will nuke it */
-        if (unlink)
-                lib_me_unlink (nal, me);
-
-        state_unlock(nal, &flags);
+        LIB_UNLOCK(nal, flags);
 
         memset (&reply, 0, sizeof (reply));
-        reply.type     = HTON__u32 (PTL_MSG_REPLY);
-        reply.dest_nid = HTON__u64 (hdr->src_nid);
-        reply.src_nid  = HTON__u64 (ni->nid);
-        reply.dest_pid = HTON__u32 (hdr->src_pid);
-        reply.src_pid  = HTON__u32 (ni->pid);
-        reply.payload_length = HTON__u32 (mlength);
+        reply.type     = cpu_to_le32(PTL_MSG_REPLY);
+        reply.dest_nid = cpu_to_le64(hdr->src_nid);
+        reply.dest_pid = cpu_to_le32(hdr->src_pid);
+        reply.src_nid  = cpu_to_le64(ni->ni_pid.nid);
+        reply.src_pid  = cpu_to_le32(ni->ni_pid.pid);
+        reply.payload_length = cpu_to_le32(mlength);
 
         reply.msg.reply.dst_wmd = hdr->msg.get.return_wmd;
 
@@ -772,7 +749,7 @@ parse_get(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
                        hdr->src_nid, hdr->src_pid, md, offset, mlength);
         if (rc != PTL_OK)
                 CERROR(LPU64": Unable to send REPLY for GET from "LPU64": %d\n",
-                       ni->nid, hdr->src_nid, rc);
+                       ni->ni_pid.nid, hdr->src_nid, rc);
 
         /* Discard any junk after the hdr */
         (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length);
@@ -781,27 +758,27 @@ parse_get(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
 }
 
 static ptl_err_t
-parse_reply(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
+parse_reply(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
 {
-        lib_ni_t        *ni = &nal->ni;
+        lib_ni_t        *ni = &nal->libnal_ni;
         lib_md_t        *md;
         int              rlength;
         int              length;
         unsigned long    flags;
         ptl_err_t        rc;
 
-        state_lock(nal, &flags);
+        LIB_LOCK(nal, flags);
 
         /* NB handles only looked up by creator (no flips) */
         md = ptl_wire_handle2md(&hdr->msg.reply.dst_wmd, nal);
         if (md == NULL || md->threshold == 0) {
                 CERROR (LPU64": Dropping REPLY from "LPU64" for %s MD "LPX64"."LPX64"\n",
-                        ni->nid, hdr->src_nid,
+                        ni->ni_pid.nid, hdr->src_nid,
                         md == NULL ? "invalid" : "inactive",
                         hdr->msg.reply.dst_wmd.wh_interface_cookie,
                         hdr->msg.reply.dst_wmd.wh_object_cookie);
 
-                state_unlock(nal, &flags);
+                LIB_UNLOCK(nal, flags);
                 return (PTL_FAIL);
         }
 
@@ -813,10 +790,10 @@ parse_reply(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
                 if ((md->options & PTL_MD_TRUNCATE) == 0) {
                         CERROR (LPU64": Dropping REPLY from "LPU64
                                 " length %d for MD "LPX64" would overflow (%d)\n",
-                                ni->nid, hdr->src_nid, length,
+                                ni->ni_pid.nid, hdr->src_nid, length,
                                 hdr->msg.reply.dst_wmd.wh_object_cookie,
                                 md->length);
-                        state_unlock(nal, &flags);
+                        LIB_UNLOCK(nal, flags);
                         return (PTL_FAIL);
                 }
                 length = md->length;
@@ -828,56 +805,57 @@ parse_reply(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
 
         lib_commit_md(nal, md, msg);
 
-        msg->ev.type = PTL_EVENT_REPLY;
+        msg->ev.type = PTL_EVENT_REPLY_END;
         msg->ev.initiator.nid = hdr->src_nid;
         msg->ev.initiator.pid = hdr->src_pid;
         msg->ev.rlength = rlength;
         msg->ev.mlength = length;
         msg->ev.offset = 0;
 
-        lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        lib_md_deconstruct(nal, md, &msg->ev.md);
+        ptl_md2handle(&msg->ev.md_handle, nal, md);
 
-        ni->counters.recv_count++;
-        ni->counters.recv_length += length;
+        ni->ni_counters.recv_count++;
+        ni->ni_counters.recv_length += length;
 
-        state_unlock(nal, &flags);
+        LIB_UNLOCK(nal, flags);
 
         rc = lib_recv(nal, private, msg, md, 0, length, rlength);
         if (rc != PTL_OK)
                 CERROR(LPU64": error on receiving REPLY from "LPU64": %d\n",
-                       ni->nid, hdr->src_nid, rc);
+                       ni->ni_pid.nid, hdr->src_nid, rc);
 
         return (rc);
 }
 
 static ptl_err_t
-parse_ack(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
+parse_ack(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
 {
-        lib_ni_t      *ni = &nal->ni;
+        lib_ni_t      *ni = &nal->libnal_ni;
         lib_md_t      *md;
         unsigned long  flags;
 
         /* Convert ack fields to host byte order */
-        hdr->msg.ack.match_bits = NTOH__u64 (hdr->msg.ack.match_bits);
-        hdr->msg.ack.mlength = NTOH__u32 (hdr->msg.ack.mlength);
+        hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits);
+        hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength);
 
-        state_lock(nal, &flags);
+        LIB_LOCK(nal, flags);
 
         /* NB handles only looked up by creator (no flips) */
         md = ptl_wire_handle2md(&hdr->msg.ack.dst_wmd, nal);
         if (md == NULL || md->threshold == 0) {
                 CDEBUG(D_INFO, LPU64": Dropping ACK from "LPU64" to %s MD "
-                       LPX64"."LPX64"\n", ni->nid, hdr->src_nid, 
+                       LPX64"."LPX64"\n", ni->ni_pid.nid, hdr->src_nid, 
                        (md == NULL) ? "invalid" : "inactive",
                        hdr->msg.ack.dst_wmd.wh_interface_cookie,
                        hdr->msg.ack.dst_wmd.wh_object_cookie);
 
-                state_unlock(nal, &flags);
+                LIB_UNLOCK(nal, flags);
                 return (PTL_FAIL);
         }
 
         CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n",
-               ni->nid, hdr->src_nid, 
+               ni->ni_pid.nid, hdr->src_nid, 
                hdr->msg.ack.dst_wmd.wh_object_cookie);
 
         lib_commit_md(nal, md, msg);
@@ -888,11 +866,12 @@ parse_ack(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
         msg->ev.mlength = hdr->msg.ack.mlength;
         msg->ev.match_bits = hdr->msg.ack.match_bits;
 
-        lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        lib_md_deconstruct(nal, md, &msg->ev.md);
+        ptl_md2handle(&msg->ev.md_handle, nal, md);
 
-        ni->counters.recv_count++;
+        ni->ni_counters.recv_count++;
 
-        state_unlock(nal, &flags);
+        LIB_UNLOCK(nal, flags);
         
         /* We have received and matched up the ack OK, create the
          * completion event now... */
@@ -923,129 +902,154 @@ hdr_type_string (ptl_hdr_t *hdr)
         }
 }
 
-void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr)
+void print_hdr(lib_nal_t *nal, ptl_hdr_t * hdr)
 {
         char *type_str = hdr_type_string (hdr);
 
-        nal->cb_printf(nal, "P3 Header at %p of type %s\n", hdr, type_str);
-        nal->cb_printf(nal, "    From nid/pid %Lu/%Lu", hdr->src_nid,
-                       hdr->src_pid);
-        nal->cb_printf(nal, "    To nid/pid %Lu/%Lu\n", hdr->dest_nid,
-                       hdr->dest_pid);
+        CWARN("P3 Header at %p of type %s\n", hdr, type_str);
+        CWARN("    From nid/pid "LPX64"/%u", hdr->src_nid, hdr->src_pid);
+        CWARN("    To nid/pid "LPX64"/%u\n", hdr->dest_nid, hdr->dest_pid);
 
         switch (hdr->type) {
         default:
                 break;
 
         case PTL_MSG_PUT:
-                nal->cb_printf(nal,
-                               "    Ptl index %d, ack md "LPX64"."LPX64", "
-                               "match bits "LPX64"\n",
-                               hdr->msg.put.ptl_index,
-                               hdr->msg.put.ack_wmd.wh_interface_cookie,
-                               hdr->msg.put.ack_wmd.wh_object_cookie,
-                               hdr->msg.put.match_bits);
-                nal->cb_printf(nal,
-                               "    Length %d, offset %d, hdr data "LPX64"\n",
-                               hdr->payload_length, hdr->msg.put.offset,
-                               hdr->msg.put.hdr_data);
+                CWARN("    Ptl index %d, ack md "LPX64"."LPX64", "
+                      "match bits "LPX64"\n",
+                      hdr->msg.put.ptl_index,
+                      hdr->msg.put.ack_wmd.wh_interface_cookie,
+                      hdr->msg.put.ack_wmd.wh_object_cookie,
+                      hdr->msg.put.match_bits);
+                CWARN("    Length %d, offset %d, hdr data "LPX64"\n",
+                      hdr->payload_length, hdr->msg.put.offset,
+                      hdr->msg.put.hdr_data);
                 break;
 
         case PTL_MSG_GET:
-                nal->cb_printf(nal,
-                               "    Ptl index %d, return md "LPX64"."LPX64", "
-                               "match bits "LPX64"\n", hdr->msg.get.ptl_index,
-                               hdr->msg.get.return_wmd.wh_interface_cookie,
-                               hdr->msg.get.return_wmd.wh_object_cookie,
-                               hdr->msg.get.match_bits);
-                nal->cb_printf(nal,
-                               "    Length %d, src offset %d\n",
-                               hdr->msg.get.sink_length,
-                               hdr->msg.get.src_offset);
+                CWARN("    Ptl index %d, return md "LPX64"."LPX64", "
+                      "match bits "LPX64"\n", hdr->msg.get.ptl_index,
+                      hdr->msg.get.return_wmd.wh_interface_cookie,
+                      hdr->msg.get.return_wmd.wh_object_cookie,
+                      hdr->msg.get.match_bits);
+                CWARN("    Length %d, src offset %d\n",
+                      hdr->msg.get.sink_length,
+                      hdr->msg.get.src_offset);
                 break;
 
         case PTL_MSG_ACK:
-                nal->cb_printf(nal, "    dst md "LPX64"."LPX64", "
-                               "manipulated length %d\n",
-                               hdr->msg.ack.dst_wmd.wh_interface_cookie,
-                               hdr->msg.ack.dst_wmd.wh_object_cookie,
-                               hdr->msg.ack.mlength);
+                CWARN("    dst md "LPX64"."LPX64", "
+                      "manipulated length %d\n",
+                      hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                      hdr->msg.ack.dst_wmd.wh_object_cookie,
+                      hdr->msg.ack.mlength);
                 break;
 
         case PTL_MSG_REPLY:
-                nal->cb_printf(nal, "    dst md "LPX64"."LPX64", "
-                               "length %d\n",
-                               hdr->msg.reply.dst_wmd.wh_interface_cookie,
-                               hdr->msg.reply.dst_wmd.wh_object_cookie,
-                               hdr->payload_length);
+                CWARN("    dst md "LPX64"."LPX64", "
+                      "length %d\n",
+                      hdr->msg.reply.dst_wmd.wh_interface_cookie,
+                      hdr->msg.reply.dst_wmd.wh_object_cookie,
+                      hdr->payload_length);
         }
 
 }                               /* end of print_hdr() */
 
 
-void 
-lib_parse(nal_cb_t *nal, ptl_hdr_t *hdr, void *private)
+ptl_err_t
+lib_parse(lib_nal_t *nal, ptl_hdr_t *hdr, void *private)
 {
         unsigned long  flags;
         ptl_err_t      rc;
         lib_msg_t     *msg;
+
+        /* NB we return PTL_OK if we manage to parse the header and believe
+         * it looks OK.  Anything that goes wrong with receiving the
+         * message after that point is the responsibility of the NAL */
         
         /* convert common fields to host byte order */
-        hdr->dest_nid = NTOH__u64 (hdr->dest_nid);
-        hdr->src_nid = NTOH__u64 (hdr->src_nid);
-        hdr->dest_pid = NTOH__u32 (hdr->dest_pid);
-        hdr->src_pid = NTOH__u32 (hdr->src_pid);
-        hdr->type = NTOH__u32 (hdr->type);
-        hdr->payload_length = NTOH__u32(hdr->payload_length);
-#if 0
-        nal->cb_printf(nal, "%d: lib_parse: nal=%p hdr=%p type=%d\n",
-                       nal->ni.nid, nal, hdr, hdr->type);
-        print_hdr(nal, hdr);
-#endif
-        if (hdr->type == PTL_MSG_HELLO) {
+        hdr->type = le32_to_cpu(hdr->type);
+        hdr->src_nid = le64_to_cpu(hdr->src_nid);
+        hdr->src_pid = le32_to_cpu(hdr->src_pid);
+        hdr->dest_pid = le32_to_cpu(hdr->dest_pid);
+        hdr->payload_length = le32_to_cpu(hdr->payload_length);
+
+        switch (hdr->type) {
+        case PTL_MSG_HELLO: {
                 /* dest_nid is really ptl_magicversion_t */
                 ptl_magicversion_t *mv = (ptl_magicversion_t *)&hdr->dest_nid;
 
-                CERROR (LPU64": Dropping unexpected HELLO message: "
+                mv->magic = le32_to_cpu(mv->magic);
+                mv->version_major = le16_to_cpu(mv->version_major);
+                mv->version_minor = le16_to_cpu(mv->version_minor);
+
+                if (mv->magic == PORTALS_PROTO_MAGIC &&
+                    mv->version_major == PORTALS_PROTO_VERSION_MAJOR &&
+                    mv->version_minor == PORTALS_PROTO_VERSION_MINOR) {
+                        CWARN (LPU64": Dropping unexpected HELLO message: "
+                               "magic %d, version %d.%d from "LPD64"\n",
+                               nal->libnal_ni.ni_pid.nid, mv->magic, 
+                               mv->version_major, mv->version_minor,
+                               hdr->src_nid);
+
+                        /* it's good but we don't want it */
+                        lib_drop_message(nal, private, hdr);
+                        return PTL_OK;
+                }
+
+                /* we got garbage */
+                CERROR (LPU64": Bad HELLO message: "
                         "magic %d, version %d.%d from "LPD64"\n",
-                        nal->ni.nid, mv->magic, 
+                        nal->libnal_ni.ni_pid.nid, mv->magic, 
                         mv->version_major, mv->version_minor,
                         hdr->src_nid);
-                lib_drop_message(nal, private, hdr);
-                return;
+                return PTL_FAIL;
         }
-        
-        if (hdr->dest_nid != nal->ni.nid) {
-                CERROR(LPU64": Dropping %s message from "LPU64" to "LPU64
-                       " (not me)\n", nal->ni.nid, hdr_type_string (hdr),
-                       hdr->src_nid, hdr->dest_nid);
-                lib_drop_message(nal, private, hdr);
-                return;
+
+        case PTL_MSG_ACK:
+        case PTL_MSG_PUT:
+        case PTL_MSG_GET:
+        case PTL_MSG_REPLY:
+                hdr->dest_nid = le64_to_cpu(hdr->dest_nid);
+                if (hdr->dest_nid != nal->libnal_ni.ni_pid.nid) {
+                        CERROR(LPU64": BAD dest NID in %s message from"
+                               LPU64" to "LPU64" (not me)\n", 
+                               nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr),
+                               hdr->src_nid, hdr->dest_nid);
+                        return PTL_FAIL;
+                }
+                break;
+
+        default:
+                CERROR(LPU64": Bad message type 0x%x from "LPU64"\n",
+                       nal->libnal_ni.ni_pid.nid, hdr->type, hdr->src_nid);
+                return PTL_FAIL;
         }
 
-        if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
+        /* We've decided we're not receiving garbage since we can parse the
+         * header.  We will return PTL_OK come what may... */
+
+        if (!list_empty (&nal->libnal_ni.ni_test_peers) && /* normally we don't */
             fail_peer (nal, hdr->src_nid, 0))      /* shall we now? */
         {
                 CERROR(LPU64": Dropping incoming %s from "LPU64
                        ": simulated failure\n",
-                       nal->ni.nid, hdr_type_string (hdr), 
+                       nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr), 
                        hdr->src_nid);
                 lib_drop_message(nal, private, hdr);
-                return;
+                return PTL_OK;
         }
 
         msg = lib_msg_alloc(nal);
         if (msg == NULL) {
                 CERROR(LPU64": Dropping incoming %s from "LPU64
                        ": can't allocate a lib_msg_t\n",
-                       nal->ni.nid, hdr_type_string (hdr), 
+                       nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr), 
                        hdr->src_nid);
                 lib_drop_message(nal, private, hdr);
-                return;
+                return PTL_OK;
         }
 
-        do_gettimeofday(&msg->ev.arrival_time);
-
         switch (hdr->type) {
         case PTL_MSG_ACK:
                 rc = parse_ack(nal, hdr, private, msg);
@@ -1060,10 +1064,8 @@ lib_parse(nal_cb_t *nal, ptl_hdr_t *hdr, void *private)
                 rc = parse_reply(nal, hdr, private, msg);
                 break;
         default:
-                CERROR(LPU64": Dropping <unknown> message from "LPU64
-                       ": Bad type=0x%x\n",  nal->ni.nid, hdr->src_nid,
-                       hdr->type);
-                rc = PTL_FAIL;
+                LASSERT(0);
+                rc = PTL_FAIL;                  /* no compiler warning please */
                 break;
         }
                 
@@ -1072,138 +1074,129 @@ lib_parse(nal_cb_t *nal, ptl_hdr_t *hdr, void *private)
                         /* committed... */
                         lib_finalize(nal, private, msg, rc);
                 } else {
-                        state_lock(nal, &flags);
-                        lib_msg_free(nal, msg); /* expects state_lock held */
-                        state_unlock(nal, &flags);
+                        LIB_LOCK(nal, flags);
+                        lib_msg_free(nal, msg); /* expects LIB_LOCK held */
+                        LIB_UNLOCK(nal, flags);
 
                         lib_drop_message(nal, private, hdr);
                 }
         }
+
+        return PTL_OK;
+        /* That's "OK I can parse it", not "OK I like it" :) */
 }
 
 int 
-do_PtlPut(nal_cb_t *nal, void *private, void *v_args, void *v_ret)
+lib_api_put(nal_t *apinal, ptl_handle_md_t *mdh, 
+            ptl_ack_req_t ack, ptl_process_id_t *id,
+            ptl_pt_index_t portal, ptl_ac_index_t ac,
+            ptl_match_bits_t match_bits, 
+            ptl_size_t offset, ptl_hdr_data_t hdr_data)
 {
-        /*
-         * Incoming:
-         *      ptl_handle_md_t md_in
-         *      ptl_ack_req_t ack_req_in
-         *      ptl_process_id_t target_in
-         *      ptl_pt_index_t portal_in
-         *      ptl_ac_index_t cookie_in
-         *      ptl_match_bits_t match_bits_in
-         *      ptl_size_t offset_in
-         *
-         * Outgoing:
-         */
-
-        PtlPut_in        *args = v_args;
-        ptl_process_id_t *id = &args->target_in;
-        PtlPut_out       *ret = v_ret;
-        lib_ni_t         *ni = &nal->ni;
+        lib_nal_t        *nal = apinal->nal_data;
+        lib_ni_t         *ni = &nal->libnal_ni;
         lib_msg_t        *msg;
         ptl_hdr_t         hdr;
         lib_md_t         *md;
         unsigned long     flags;
         int               rc;
         
-        if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
+        if (!list_empty (&ni->ni_test_peers) && /* normally we don't */
             fail_peer (nal, id->nid, 1))           /* shall we now? */
         {
-                CERROR(LPU64": Dropping PUT to "LPU64": simulated failure\n",
-                       nal->ni.nid, id->nid);
-                return (ret->rc = PTL_INV_PROC);
+                CERROR("Dropping PUT to "LPU64": simulated failure\n",
+                       id->nid);
+                return PTL_PROCESS_INVALID;
         }
 
         msg = lib_msg_alloc(nal);
         if (msg == NULL) {
                 CERROR(LPU64": Dropping PUT to "LPU64": ENOMEM on lib_msg_t\n",
-                       ni->nid, id->nid);
-                return (ret->rc = PTL_NOSPACE);
+                       ni->ni_pid.nid, id->nid);
+                return PTL_NO_SPACE;
         }
 
-        state_lock(nal, &flags);
+        LIB_LOCK(nal, flags);
 
-        md = ptl_handle2md(&args->md_in, nal);
+        md = ptl_handle2md(mdh, nal);
         if (md == NULL || md->threshold == 0) {
                 lib_msg_free(nal, msg);
-                state_unlock(nal, &flags);
+                LIB_UNLOCK(nal, flags);
         
-                return (ret->rc = PTL_INV_MD);
+                return PTL_MD_INVALID;
         }
 
-        CDEBUG(D_NET, "PtlPut -> %Lu: %lu\n", (unsigned long long)id->nid,
-               (unsigned long)id->pid);
+        CDEBUG(D_NET, "PtlPut -> "LPX64"\n", id->nid);
 
         memset (&hdr, 0, sizeof (hdr));
-        hdr.type     = HTON__u32 (PTL_MSG_PUT);
-        hdr.dest_nid = HTON__u64 (id->nid);
-        hdr.src_nid  = HTON__u64 (ni->nid);
-        hdr.dest_pid = HTON__u32 (id->pid);
-        hdr.src_pid  = HTON__u32 (ni->pid);
-        hdr.payload_length = HTON__u32 (md->length);
+        hdr.type     = cpu_to_le32(PTL_MSG_PUT);
+        hdr.dest_nid = cpu_to_le64(id->nid);
+        hdr.dest_pid = cpu_to_le32(id->pid);
+        hdr.src_nid  = cpu_to_le64(ni->ni_pid.nid);
+        hdr.src_pid  = cpu_to_le32(ni->ni_pid.pid);
+        hdr.payload_length = cpu_to_le32(md->length);
 
         /* NB handles only looked up by creator (no flips) */
-        if (args->ack_req_in == PTL_ACK_REQ) {
+        if (ack == PTL_ACK_REQ) {
                 hdr.msg.put.ack_wmd.wh_interface_cookie = ni->ni_interface_cookie;
                 hdr.msg.put.ack_wmd.wh_object_cookie = md->md_lh.lh_cookie;
         } else {
                 hdr.msg.put.ack_wmd = PTL_WIRE_HANDLE_NONE;
         }
 
-        hdr.msg.put.match_bits = HTON__u64 (args->match_bits_in);
-        hdr.msg.put.ptl_index = HTON__u32 (args->portal_in);
-        hdr.msg.put.offset = HTON__u32 (args->offset_in);
-        hdr.msg.put.hdr_data = args->hdr_data_in;
+        hdr.msg.put.match_bits = cpu_to_le64(match_bits);
+        hdr.msg.put.ptl_index = cpu_to_le32(portal);
+        hdr.msg.put.offset = cpu_to_le32(offset);
+        hdr.msg.put.hdr_data = hdr_data;
 
         lib_commit_md(nal, md, msg);
         
-        msg->ev.type = PTL_EVENT_SENT;
-        msg->ev.initiator.nid = ni->nid;
-        msg->ev.initiator.pid = ni->pid;
-        msg->ev.portal = args->portal_in;
-        msg->ev.match_bits = args->match_bits_in;
+        msg->ev.type = PTL_EVENT_SEND_END;
+        msg->ev.initiator.nid = ni->ni_pid.nid;
+        msg->ev.initiator.pid = ni->ni_pid.pid;
+        msg->ev.pt_index = portal;
+        msg->ev.match_bits = match_bits;
         msg->ev.rlength = md->length;
         msg->ev.mlength = md->length;
-        msg->ev.offset = args->offset_in;
-        msg->ev.hdr_data = args->hdr_data_in;
+        msg->ev.offset = offset;
+        msg->ev.hdr_data = hdr_data;
 
-        lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        lib_md_deconstruct(nal, md, &msg->ev.md);
+        ptl_md2handle(&msg->ev.md_handle, nal, md);
 
-        ni->counters.send_count++;
-        ni->counters.send_length += md->length;
+        ni->ni_counters.send_count++;
+        ni->ni_counters.send_length += md->length;
 
-        state_unlock(nal, &flags);
+        LIB_UNLOCK(nal, flags);
         
-        rc = lib_send (nal, private, msg, &hdr, PTL_MSG_PUT,
+        rc = lib_send (nal, NULL, msg, &hdr, PTL_MSG_PUT,
                        id->nid, id->pid, md, 0, md->length);
         if (rc != PTL_OK) {
-                CERROR(LPU64": error sending PUT to "LPU64": %d\n",
-                       ni->nid, id->nid, rc);
-                lib_finalize (nal, private, msg, rc);
+                CERROR("Error sending PUT to "LPX64": %d\n",
+                       id->nid, rc);
+                lib_finalize (nal, NULL, msg, rc);
         }
         
         /* completion will be signalled by an event */
-        return ret->rc = PTL_OK;
+        return PTL_OK;
 }
 
 lib_msg_t * 
-lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, lib_md_t *getmd)
+lib_create_reply_msg (lib_nal_t *nal, ptl_nid_t peer_nid, lib_msg_t *getmsg)
 {
         /* The NAL can DMA direct to the GET md (i.e. no REPLY msg).  This
-         * returns a msg the NAL can pass to lib_finalize() so that a REPLY
-         * event still occurs. 
+         * returns a msg for the NAL to pass to lib_finalize() when the sink
+         * data has been received.
          *
-         * CAVEAT EMPTOR: 'getmd' is passed by pointer so it MUST be valid.
-         * This can only be guaranteed while a lib_msg_t holds a reference
-         * on it (ie. pending > 0), so best call this before the
-         * lib_finalize() of the original GET. */
+         * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when
+         * lib_finalize() is called on it, so the NAL must call this first */
 
-        lib_ni_t        *ni = &nal->ni;
+        lib_ni_t        *ni = &nal->libnal_ni;
         lib_msg_t       *msg = lib_msg_alloc(nal);
+        lib_md_t        *getmd = getmsg->md;
         unsigned long    flags;
 
-        state_lock(nal, &flags);
+        LIB_LOCK(nal, flags);
 
         LASSERT (getmd->pending > 0);
 
@@ -1225,143 +1218,132 @@ lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, lib_md_t *getmd)
 
         lib_commit_md (nal, getmd, msg);
 
-        msg->ev.type = PTL_EVENT_REPLY;
+        msg->ev.type = PTL_EVENT_REPLY_END;
         msg->ev.initiator.nid = peer_nid;
         msg->ev.initiator.pid = 0;      /* XXX FIXME!!! */
         msg->ev.rlength = msg->ev.mlength = getmd->length;
         msg->ev.offset = 0;
 
-        lib_md_deconstruct(nal, getmd, &msg->ev.mem_desc);
+        lib_md_deconstruct(nal, getmd, &msg->ev.md);
+        ptl_md2handle(&msg->ev.md_handle, nal, getmd);
 
-        ni->counters.recv_count++;
-        ni->counters.recv_length += getmd->length;
+        ni->ni_counters.recv_count++;
+        ni->ni_counters.recv_length += getmd->length;
 
-        state_unlock(nal, &flags);
+        LIB_UNLOCK(nal, flags);
 
         return msg;
 
  drop_msg:
         lib_msg_free(nal, msg);
  drop:
-        nal->ni.counters.drop_count++;
-        nal->ni.counters.drop_length += getmd->length;
+        nal->libnal_ni.ni_counters.drop_count++;
+        nal->libnal_ni.ni_counters.drop_length += getmd->length;
 
-        state_unlock (nal, &flags);
+        LIB_UNLOCK (nal, flags);
 
         return NULL;
 }
 
 int 
-do_PtlGet(nal_cb_t *nal, void *private, void *v_args, void *v_ret)
+lib_api_get(nal_t *apinal, ptl_handle_md_t *mdh, ptl_process_id_t *id,
+            ptl_pt_index_t portal, ptl_ac_index_t ac,
+            ptl_match_bits_t match_bits, ptl_size_t offset)
 {
-        /*
-         * Incoming:
-         *      ptl_handle_md_t md_in
-         *      ptl_process_id_t target_in
-         *      ptl_pt_index_t portal_in
-         *      ptl_ac_index_t cookie_in
-         *      ptl_match_bits_t match_bits_in
-         *      ptl_size_t offset_in
-         *
-         * Outgoing:
-         */
-
-        PtlGet_in        *args = v_args;
-        ptl_process_id_t *id = &args->target_in;
-        PtlGet_out       *ret = v_ret;
-        lib_ni_t         *ni = &nal->ni;
+        lib_nal_t        *nal = apinal->nal_data;
+        lib_ni_t         *ni = &nal->libnal_ni;
         lib_msg_t        *msg;
         ptl_hdr_t         hdr;
         lib_md_t         *md;
         unsigned long     flags;
         int               rc;
         
-        if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
+        if (!list_empty (&ni->ni_test_peers) && /* normally we don't */
             fail_peer (nal, id->nid, 1))           /* shall we now? */
         {
-                CERROR(LPU64": Dropping PUT to "LPU64": simulated failure\n",
-                       nal->ni.nid, id->nid);
-                return (ret->rc = PTL_INV_PROC);
+                CERROR("Dropping PUT to "LPX64": simulated failure\n",
+                       id->nid);
+                return PTL_PROCESS_INVALID;
         }
 
         msg = lib_msg_alloc(nal);
         if (msg == NULL) {
-                CERROR(LPU64": Dropping GET to "LPU64": ENOMEM on lib_msg_t\n",
-                       ni->nid, id->nid);
-                return (ret->rc = PTL_NOSPACE);
+                CERROR("Dropping GET to "LPU64": ENOMEM on lib_msg_t\n",
+                       id->nid);
+                return PTL_NO_SPACE;
         }
 
-        state_lock(nal, &flags);
+        LIB_LOCK(nal, flags);
 
-        md = ptl_handle2md(&args->md_in, nal);
+        md = ptl_handle2md(mdh, nal);
         if (md == NULL || !md->threshold) {
                 lib_msg_free(nal, msg);
-                state_unlock(nal, &flags);
+                LIB_UNLOCK(nal, flags);
 
-                return ret->rc = PTL_INV_MD;
+                return PTL_MD_INVALID;
         }
 
         CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid,
                (unsigned long)id->pid);
 
         memset (&hdr, 0, sizeof (hdr));
-        hdr.type     = HTON__u32 (PTL_MSG_GET);
-        hdr.dest_nid = HTON__u64 (id->nid);
-        hdr.src_nid  = HTON__u64 (ni->nid);
-        hdr.dest_pid = HTON__u32 (id->pid);
-        hdr.src_pid  = HTON__u32 (ni->pid);
+        hdr.type     = cpu_to_le32(PTL_MSG_GET);
+        hdr.dest_nid = cpu_to_le64(id->nid);
+        hdr.dest_pid = cpu_to_le32(id->pid);
+        hdr.src_nid  = cpu_to_le64(ni->ni_pid.nid);
+        hdr.src_pid  = cpu_to_le32(ni->ni_pid.pid);
         hdr.payload_length = 0;
 
         /* NB handles only looked up by creator (no flips) */
         hdr.msg.get.return_wmd.wh_interface_cookie = ni->ni_interface_cookie;
         hdr.msg.get.return_wmd.wh_object_cookie = md->md_lh.lh_cookie;
 
-        hdr.msg.get.match_bits = HTON__u64 (args->match_bits_in);
-        hdr.msg.get.ptl_index = HTON__u32 (args->portal_in);
-        hdr.msg.get.src_offset = HTON__u32 (args->offset_in);
-        hdr.msg.get.sink_length = HTON__u32 (md->length);
+        hdr.msg.get.match_bits = cpu_to_le64(match_bits);
+        hdr.msg.get.ptl_index = cpu_to_le32(portal);
+        hdr.msg.get.src_offset = cpu_to_le32(offset);
+        hdr.msg.get.sink_length = cpu_to_le32(md->length);
 
         lib_commit_md(nal, md, msg);
 
-        msg->ev.type = PTL_EVENT_SENT;
-        msg->ev.initiator.nid = ni->nid;
-        msg->ev.initiator.pid = ni->pid;
-        msg->ev.portal = args->portal_in;
-        msg->ev.match_bits = args->match_bits_in;
+        msg->ev.type = PTL_EVENT_SEND_END;
+        msg->ev.initiator = ni->ni_pid;
+        msg->ev.pt_index = portal;
+        msg->ev.match_bits = match_bits;
         msg->ev.rlength = md->length;
         msg->ev.mlength = md->length;
-        msg->ev.offset = args->offset_in;
+        msg->ev.offset = offset;
         msg->ev.hdr_data = 0;
 
-        lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        lib_md_deconstruct(nal, md, &msg->ev.md);
+        ptl_md2handle(&msg->ev.md_handle, nal, md);
 
-        ni->counters.send_count++;
+        ni->ni_counters.send_count++;
 
-        state_unlock(nal, &flags);
+        LIB_UNLOCK(nal, flags);
 
-        rc = lib_send (nal, private, msg, &hdr, PTL_MSG_GET,
+        rc = lib_send (nal, NULL, msg, &hdr, PTL_MSG_GET,
                        id->nid, id->pid, NULL, 0, 0);
         if (rc != PTL_OK) {
                 CERROR(LPU64": error sending GET to "LPU64": %d\n",
-                       ni->nid, id->nid, rc);
-                lib_finalize (nal, private, msg, rc);
+                       ni->ni_pid.nid, id->nid, rc);
+                lib_finalize (nal, NULL, msg, rc);
         }
         
         /* completion will be signalled by an event */
-        return ret->rc = PTL_OK;
+        return PTL_OK;
 }
 
 void lib_assert_wire_constants (void)
 {
         /* Wire protocol assertions generated by 'wirecheck'
-         * running on Linux robert.bartonsoftware.com 2.4.20-18.9 #1 Thu May 29 06:54:41 EDT 2003 i68
-         * with gcc version 3.2.2 20030222 (Red Hat Linux 3.2.2-5) */
+         * running on Linux mdevi 2.4.21-p4smp-55chaos #1 SMP Tue Jun 8 14:38:44 PDT 2004 i686 i686 i
+         * with gcc version 3.2.3 20030502 (Red Hat Linux 3.2.3-34) */
 
 
         /* Constants... */
         LASSERT (PORTALS_PROTO_MAGIC == 0xeebc0ded);
-        LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0);
-        LASSERT (PORTALS_PROTO_VERSION_MINOR == 3);
+        LASSERT (PORTALS_PROTO_VERSION_MAJOR == 1);
+        LASSERT (PORTALS_PROTO_VERSION_MINOR == 0);
         LASSERT (PTL_MSG_ACK == 0);
         LASSERT (PTL_MSG_PUT == 1);
         LASSERT (PTL_MSG_GET == 2);
@@ -1370,76 +1352,76 @@ void lib_assert_wire_constants (void)
 
         /* Checks for struct ptl_handle_wire_t */
         LASSERT ((int)sizeof(ptl_handle_wire_t) == 16);
-        LASSERT (offsetof(ptl_handle_wire_t, wh_interface_cookie) == 0);
+        LASSERT ((int)offsetof(ptl_handle_wire_t, wh_interface_cookie) == 0);
         LASSERT ((int)sizeof(((ptl_handle_wire_t *)0)->wh_interface_cookie) == 8);
-        LASSERT (offsetof(ptl_handle_wire_t, wh_object_cookie) == 8);
+        LASSERT ((int)offsetof(ptl_handle_wire_t, wh_object_cookie) == 8);
         LASSERT ((int)sizeof(((ptl_handle_wire_t *)0)->wh_object_cookie) == 8);
 
         /* Checks for struct ptl_magicversion_t */
         LASSERT ((int)sizeof(ptl_magicversion_t) == 8);
-        LASSERT (offsetof(ptl_magicversion_t, magic) == 0);
+        LASSERT ((int)offsetof(ptl_magicversion_t, magic) == 0);
         LASSERT ((int)sizeof(((ptl_magicversion_t *)0)->magic) == 4);
-        LASSERT (offsetof(ptl_magicversion_t, version_major) == 4);
+        LASSERT ((int)offsetof(ptl_magicversion_t, version_major) == 4);
         LASSERT ((int)sizeof(((ptl_magicversion_t *)0)->version_major) == 2);
-        LASSERT (offsetof(ptl_magicversion_t, version_minor) == 6);
+        LASSERT ((int)offsetof(ptl_magicversion_t, version_minor) == 6);
         LASSERT ((int)sizeof(((ptl_magicversion_t *)0)->version_minor) == 2);
 
         /* Checks for struct ptl_hdr_t */
         LASSERT ((int)sizeof(ptl_hdr_t) == 72);
-        LASSERT (offsetof(ptl_hdr_t, dest_nid) == 0);
+        LASSERT ((int)offsetof(ptl_hdr_t, dest_nid) == 0);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->dest_nid) == 8);
-        LASSERT (offsetof(ptl_hdr_t, src_nid) == 8);
+        LASSERT ((int)offsetof(ptl_hdr_t, src_nid) == 8);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->src_nid) == 8);
-        LASSERT (offsetof(ptl_hdr_t, dest_pid) == 16);
+        LASSERT ((int)offsetof(ptl_hdr_t, dest_pid) == 16);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->dest_pid) == 4);
-        LASSERT (offsetof(ptl_hdr_t, src_pid) == 20);
+        LASSERT ((int)offsetof(ptl_hdr_t, src_pid) == 20);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->src_pid) == 4);
-        LASSERT (offsetof(ptl_hdr_t, type) == 24);
+        LASSERT ((int)offsetof(ptl_hdr_t, type) == 24);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->type) == 4);
-        LASSERT (offsetof(ptl_hdr_t, payload_length) == 28);
+        LASSERT ((int)offsetof(ptl_hdr_t, payload_length) == 28);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->payload_length) == 4);
-        LASSERT (offsetof(ptl_hdr_t, msg) == 32);
+        LASSERT ((int)offsetof(ptl_hdr_t, msg) == 32);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg) == 40);
 
         /* Ack */
-        LASSERT (offsetof(ptl_hdr_t, msg.ack.dst_wmd) == 32);
+        LASSERT ((int)offsetof(ptl_hdr_t, msg.ack.dst_wmd) == 32);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.ack.dst_wmd) == 16);
-        LASSERT (offsetof(ptl_hdr_t, msg.ack.match_bits) == 48);
+        LASSERT ((int)offsetof(ptl_hdr_t, msg.ack.match_bits) == 48);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.ack.match_bits) == 8);
-        LASSERT (offsetof(ptl_hdr_t, msg.ack.mlength) == 56);
+        LASSERT ((int)offsetof(ptl_hdr_t, msg.ack.mlength) == 56);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.ack.mlength) == 4);
 
         /* Put */
-        LASSERT (offsetof(ptl_hdr_t, msg.put.ack_wmd) == 32);
+        LASSERT ((int)offsetof(ptl_hdr_t, msg.put.ack_wmd) == 32);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.ack_wmd) == 16);
-        LASSERT (offsetof(ptl_hdr_t, msg.put.match_bits) == 48);
+        LASSERT ((int)offsetof(ptl_hdr_t, msg.put.match_bits) == 48);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.match_bits) == 8);
-        LASSERT (offsetof(ptl_hdr_t, msg.put.hdr_data) == 56);
+        LASSERT ((int)offsetof(ptl_hdr_t, msg.put.hdr_data) == 56);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.hdr_data) == 8);
-        LASSERT (offsetof(ptl_hdr_t, msg.put.ptl_index) == 64);
+        LASSERT ((int)offsetof(ptl_hdr_t, msg.put.ptl_index) == 64);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.ptl_index) == 4);
-        LASSERT (offsetof(ptl_hdr_t, msg.put.offset) == 68);
+        LASSERT ((int)offsetof(ptl_hdr_t, msg.put.offset) == 68);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.offset) == 4);
 
         /* Get */
-        LASSERT (offsetof(ptl_hdr_t, msg.get.return_wmd) == 32);
+        LASSERT ((int)offsetof(ptl_hdr_t, msg.get.return_wmd) == 32);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.return_wmd) == 16);
-        LASSERT (offsetof(ptl_hdr_t, msg.get.match_bits) == 48);
+        LASSERT ((int)offsetof(ptl_hdr_t, msg.get.match_bits) == 48);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.match_bits) == 8);
-        LASSERT (offsetof(ptl_hdr_t, msg.get.ptl_index) == 56);
+        LASSERT ((int)offsetof(ptl_hdr_t, msg.get.ptl_index) == 56);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.ptl_index) == 4);
-        LASSERT (offsetof(ptl_hdr_t, msg.get.src_offset) == 60);
+        LASSERT ((int)offsetof(ptl_hdr_t, msg.get.src_offset) == 60);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.src_offset) == 4);
-        LASSERT (offsetof(ptl_hdr_t, msg.get.sink_length) == 64);
+        LASSERT ((int)offsetof(ptl_hdr_t, msg.get.sink_length) == 64);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.sink_length) == 4);
 
         /* Reply */
-        LASSERT (offsetof(ptl_hdr_t, msg.reply.dst_wmd) == 32);
+        LASSERT ((int)offsetof(ptl_hdr_t, msg.reply.dst_wmd) == 32);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.reply.dst_wmd) == 16);
 
         /* Hello */
-        LASSERT (offsetof(ptl_hdr_t, msg.hello.incarnation) == 32);
+        LASSERT ((int)offsetof(ptl_hdr_t, msg.hello.incarnation) == 32);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.hello.incarnation) == 8);
-        LASSERT (offsetof(ptl_hdr_t, msg.hello.type) == 40);
+        LASSERT ((int)offsetof(ptl_hdr_t, msg.hello.type) == 40);
         LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.hello.type) == 4);
 }
index 04c69b1..54e89bc 100644 (file)
 #include <portals/lib-p30.h>
 
 void
-lib_enq_event_locked (nal_cb_t *nal, void *private, 
+lib_enq_event_locked (lib_nal_t *nal, void *private, 
                       lib_eq_t *eq, ptl_event_t *ev)
 {
         ptl_event_t  *eq_slot;
-        int           rc;
-        
-        ev->sequence = eq->sequence++; /* Allocate the next queue slot */
-
-        /* size must be a power of 2 to handle a wrapped sequence # */
-        LASSERT (eq->size != 0 &&
-                 eq->size == LOWEST_BIT_SET (eq->size));
-        eq_slot = eq->base + (ev->sequence & (eq->size - 1));
-
-        /* Copy the event into the allocated slot, ensuring all the rest of
-         * the event's contents have been copied _before_ the sequence
-         * number gets updated.  A processes 'getting' an event waits on
-         * the next queue slot's sequence to be 'new'.  When it is, _all_
-         * other event fields had better be consistent.  I assert
-         * 'sequence' is the last member, so I only need a 2 stage copy. */
 
-        LASSERT(sizeof (ptl_event_t) ==
-                offsetof(ptl_event_t, sequence) + sizeof(ev->sequence));
-
-        rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev,
-                            offsetof (ptl_event_t, sequence));
-        LASSERT (rc == PTL_OK);
-
-#ifdef __KERNEL__
-        barrier();
-#endif
-        /* Updating the sequence number is what makes the event 'new' NB if
-         * the cb_write below isn't atomic, this could cause a race with
-         * PtlEQGet */
-        rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence,
-                           (void *)&ev->sequence,sizeof (ev->sequence));
-        LASSERT (rc == PTL_OK);
+        /* Allocate the next queue slot */
+        ev->link = ev->sequence = eq->eq_enq_seq++;
+        /* NB we don't support START events yet and we don't create a separate
+         * UNLINK event unless an explicit unlink succeeds, so the link
+         * sequence is pretty useless */
 
+        /* We don't support different uid/jids yet */
+        ev->uid = 0;
+        ev->jid = 0;
+        
+        /* size must be a power of 2 to handle sequence # overflow */
+        LASSERT (eq->eq_size != 0 &&
+                 eq->eq_size == LOWEST_BIT_SET (eq->eq_size));
+        eq_slot = eq->eq_events + (ev->sequence & (eq->eq_size - 1));
+
+        /* There is no race since both event consumers and event producers
+         * take the LIB_LOCK(), so we don't screw around with memory
+         * barriers, setting the sequence number last or weird structure
+         * layout assertions. */
+        *eq_slot = *ev;
+
+        /* Call the callback handler (if any) */
+        if (eq->eq_callback != NULL)
+                eq->eq_callback (eq_slot);
+
+        /* Wake anyone sleeping for an event (see lib-eq.c) */
 #ifdef __KERNEL__
-        barrier();
+        if (waitqueue_active(&nal->libnal_ni.ni_waitq))
+                wake_up_all(&nal->libnal_ni.ni_waitq);
+#else
+        pthread_cond_broadcast(&nal->libnal_ni.ni_cond);
 #endif
-
-        if (nal->cb_callback != NULL)
-                nal->cb_callback(nal, private, eq, ev);
-        else if (eq->event_callback != NULL)
-                eq->event_callback(ev);
 }
 
 void 
-lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status)
+lib_finalize (lib_nal_t *nal, void *private, lib_msg_t *msg, ptl_err_t status)
 {
         lib_md_t     *md;
         int           unlink;
@@ -89,10 +81,6 @@ lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status)
         int           rc;
         ptl_hdr_t     ack;
 
-        /* ni went down while processing this message */
-        if (nal->ni.up == 0)
-                return;
-
         if (msg == NULL)
                 return;
 
@@ -100,19 +88,19 @@ lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status)
         if (status == PTL_OK &&
             !ptl_is_wire_handle_none(&msg->ack_wmd)) {
 
-                LASSERT(msg->ev.type == PTL_EVENT_PUT);
+                LASSERT(msg->ev.type == PTL_EVENT_PUT_END);
 
                 memset (&ack, 0, sizeof (ack));
-                ack.type     = HTON__u32 (PTL_MSG_ACK);
-                ack.dest_nid = HTON__u64 (msg->ev.initiator.nid);
-                ack.src_nid  = HTON__u64 (nal->ni.nid);
-                ack.dest_pid = HTON__u32 (msg->ev.initiator.pid);
-                ack.src_pid  = HTON__u32 (nal->ni.pid);
+                ack.type     = cpu_to_le32(PTL_MSG_ACK);
+                ack.dest_nid = cpu_to_le64(msg->ev.initiator.nid);
+                ack.dest_pid = cpu_to_le32(msg->ev.initiator.pid);
+                ack.src_nid  = cpu_to_le64(nal->libnal_ni.ni_pid.nid);
+                ack.src_pid  = cpu_to_le32(nal->libnal_ni.ni_pid.pid);
                 ack.payload_length = 0;
 
                 ack.msg.ack.dst_wmd = msg->ack_wmd;
                 ack.msg.ack.match_bits = msg->ev.match_bits;
-                ack.msg.ack.mlength = HTON__u32 (msg->ev.mlength);
+                ack.msg.ack.mlength = cpu_to_le32(msg->ev.mlength);
 
                 rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK,
                                msg->ev.initiator.nid, msg->ev.initiator.pid, 
@@ -126,18 +114,23 @@ lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status)
 
         md = msg->md;
 
-        state_lock(nal, &flags);
+        LIB_LOCK(nal, flags);
 
         /* Now it's safe to drop my caller's ref */
         md->pending--;
         LASSERT (md->pending >= 0);
 
         /* Should I unlink this MD? */
-        unlink = (md->pending == 0 &&           /* No other refs */
-                  (md->threshold == 0 ||        /* All ops done */
-                   md->md_flags & PTL_MD_FLAG_UNLINK) != 0); /* black spot */
-
-        msg->ev.status = status;
+        if (md->pending != 0)                   /* other refs */
+                unlink = 0;
+        else if ((md->md_flags & PTL_MD_FLAG_ZOMBIE) != 0)
+                unlink = 1;
+        else if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINK) == 0)
+                unlink = 0;
+        else
+                unlink = lib_md_exhausted(md);
+
+        msg->ev.ni_fail_type = status;
         msg->ev.unlinked = unlink;
 
         if (md->eq != NULL)
@@ -147,8 +140,8 @@ lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status)
                 lib_md_unlink(nal, md);
 
         list_del (&msg->msg_list);
-        nal->ni.counters.msgs_alloc--;
+        nal->libnal_ni.ni_counters.msgs_alloc--;
         lib_msg_free(nal, msg);
 
-        state_unlock(nal, &flags);
+        LIB_UNLOCK(nal, flags);
 }
index 9e90576..0f298a0 100644 (file)
 
 #define DEBUG_SUBSYSTEM S_PORTALS
 #include <portals/lib-p30.h>
-#include <portals/arg-blocks.h>
 
 #define MAX_DIST 18446744073709551615ULL
 
-int do_PtlNIDebug(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+int lib_api_ni_status (nal_t *apinal, ptl_sr_index_t sr_idx,
+                       ptl_sr_value_t *status)
 {
-        PtlNIDebug_in *args = v_args;
-        PtlNIDebug_out *ret = v_ret;
-        lib_ni_t *ni = &nal->ni;
-
-        ret->rc = ni->debug;
-        ni->debug = args->mask_in;
-
-        return 0;
-}
-
-int do_PtlNIStatus(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
-{
-        /*
-         * Incoming:
-         *      ptl_handle_ni_t interface_in
-         *      ptl_sr_index_t register_in
-         *
-         * Outgoing:
-         *      ptl_sr_value_t          * status_out
-         */
-
-        PtlNIStatus_in *args = v_args;
-        PtlNIStatus_out *ret = v_ret;
-        lib_ni_t *ni = &nal->ni;
-        lib_counters_t *count = &ni->counters;
-
-        if (!args)
-                return ret->rc = PTL_SEGV;
-
-        ret->rc = PTL_OK;
-        ret->status_out = 0;
-
-        /*
-         * I hate this sort of code....  Hash tables, offset lists?
-         * Treat the counters as an array of ints?
-         */
-        if (args->register_in == PTL_SR_DROP_COUNT)
-                ret->status_out = count->drop_count;
-
-        else if (args->register_in == PTL_SR_DROP_LENGTH)
-                ret->status_out = count->drop_length;
-
-        else if (args->register_in == PTL_SR_RECV_COUNT)
-                ret->status_out = count->recv_count;
-
-        else if (args->register_in == PTL_SR_RECV_LENGTH)
-                ret->status_out = count->recv_length;
-
-        else if (args->register_in == PTL_SR_SEND_COUNT)
-                ret->status_out = count->send_count;
-
-        else if (args->register_in == PTL_SR_SEND_LENGTH)
-                ret->status_out = count->send_length;
-
-        else if (args->register_in == PTL_SR_MSGS_MAX)
-                ret->status_out = count->msgs_max;
-        else
-                ret->rc = PTL_INV_SR_INDX;
-
-        return ret->rc;
+        lib_nal_t      *nal = apinal->nal_data;
+        lib_ni_t       *ni = &nal->libnal_ni;
+        lib_counters_t *count = &ni->ni_counters;
+
+        switch (sr_idx) {
+        case PTL_SR_DROP_COUNT:
+                *status = count->drop_count;
+                return PTL_OK;
+        case PTL_SR_DROP_LENGTH:
+                *status = count->drop_length;
+                return PTL_OK;
+        case PTL_SR_RECV_COUNT:
+                *status = count->recv_count;
+                return PTL_OK;
+        case PTL_SR_RECV_LENGTH:
+                *status = count->recv_length;
+                return PTL_OK;
+        case PTL_SR_SEND_COUNT:
+                *status = count->send_count;
+                return PTL_OK;
+        case PTL_SR_SEND_LENGTH:
+                *status = count->send_length;
+                return PTL_OK;
+        case PTL_SR_MSGS_MAX:
+                *status = count->msgs_max;
+                return PTL_OK;
+        default:
+                *status = 0;
+                return PTL_SR_INDEX_INVALID;
+        }
 }
 
 
-int do_PtlNIDist(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+int lib_api_ni_dist (nal_t *apinal, ptl_process_id_t *pid, unsigned long *dist)
 {
-        /*
-         * Incoming:
-         *      ptl_handle_ni_t interface_in
-         *      ptl_process_id_t process_in
-
-         *
-         * Outgoing:
-         *      unsigned long   * distance_out
-
-         */
-
-        PtlNIDist_in *args = v_args;
-        PtlNIDist_out *ret = v_ret;
-
-        unsigned long dist;
-        ptl_process_id_t id_in = args->process_in;
-        ptl_nid_t nid;
-        int rc;
-
-        nid = id_in.nid;
-
-        if ((rc = nal->cb_dist(nal, nid, &dist)) != 0) {
-                ret->distance_out = (unsigned long) MAX_DIST;
-                return PTL_INV_PROC;
-        }
-
-        ret->distance_out = dist;
+        lib_nal_t *nal = apinal->nal_data;
 
-        return ret->rc = PTL_OK;
+        return (nal->libnal_dist(nal, pid->nid, dist));
 }
index 12eebb5..ff2a601 100644 (file)
@@ -35,24 +35,12 @@ extern int getpid(void);
 #       include <unistd.h>
 #endif
 #include <portals/lib-p30.h>
-#include <portals/arg-blocks.h>
 
-int do_PtlGetId(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+int
+lib_api_get_id(nal_t *apinal, ptl_process_id_t *pid)
 {
-        /*
-         * Incoming:
-         *      ptl_handle_ni_t handle_in
-         *
-         * Outgoing:
-         *      ptl_process_id_t        * id_out
-         *      ptl_id_t                * gsize_out
-         */
-
-        PtlGetId_out *ret = v_ret;
-        lib_ni_t *ni = &nal->ni;
-
-        ret->id_out.nid = ni->nid;
-        ret->id_out.pid = ni->pid;
-
-        return ret->rc = PTL_OK;
+        lib_nal_t *nal = apinal->nal_data;
+        
+        *pid = nal->libnal_ni.ni_pid;
+        return PTL_OK;
 }
index eb41dfd..61ef372 100644 (file)
@@ -83,7 +83,8 @@ static int kportal_ioctl(struct portal_ioctl_data *data,
 
                 CDEBUG (D_IOCTL, "Getting nid for nal [%d]\n", data->ioc_nal);
 
-                err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, NULL, &nih);
+                err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL,
+                                NULL, &nih);
                 if (!(err == PTL_OK || err == PTL_IFACE_DUP))
                         RETURN (-EINVAL);
 
@@ -104,7 +105,8 @@ static int kportal_ioctl(struct portal_ioctl_data *data,
                 CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n",
                         data->ioc_nal, data->ioc_nid, data->ioc_count);
 
-                err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, NULL, &nih);
+                err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL,
+                                NULL, &nih);
                 if (!(err == PTL_OK || err == PTL_IFACE_DUP))
                         return (-EINVAL);
 
index ad4dd87..a1397d2 100644 (file)
@@ -42,7 +42,7 @@ struct name2num {
         { "elan",        QSWNAL},
         { "tcp",         SOCKNAL},
         { "gm",          GMNAL},
-        { "ib",          IBNAL},
+        { "ib",          OPENIBNAL},
         { NULL,          -1}
 };
 
index 6fcd83a..448ab1f 100644 (file)
@@ -49,13 +49,6 @@ kpr_router_interface_t kpr_router_interface = {
        kprri_deregister:       kpr_deregister_nal,
 };
 
-kpr_control_interface_t kpr_control_interface = {
-       kprci_add_route:        kpr_add_route,
-       kprci_del_route:        kpr_del_route,
-       kprci_get_route:        kpr_get_route,
-       kprci_notify:           kpr_sys_notify,
-};
-
 int
 kpr_register_nal (kpr_nal_interface_t *nalif, void **argp)
 {
@@ -290,18 +283,9 @@ kpr_shutdown_nal (void *arg)
        LASSERT (!ne->kpne_shutdown);
        LASSERT (!in_interrupt());
 
-       write_lock_irqsave (&kpr_rwlock, flags); /* locking a bit spurious... */
+       write_lock_irqsave (&kpr_rwlock, flags);
        ne->kpne_shutdown = 1;
-       write_unlock_irqrestore (&kpr_rwlock, flags); /* except it's a memory barrier */
-
-       while (atomic_read (&ne->kpne_refcount) != 0)
-       {
-               CDEBUG (D_NET, "Waiting for refcount on NAL %d to reach zero (%d)\n",
-                       ne->kpne_interface.kprni_nalid, atomic_read (&ne->kpne_refcount));
-
-               set_current_state (TASK_UNINTERRUPTIBLE);
-               schedule_timeout (HZ);
-       }
+       write_unlock_irqrestore (&kpr_rwlock, flags);
 }
 
 void
@@ -313,15 +297,22 @@ kpr_deregister_nal (void *arg)
         CDEBUG (D_NET, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid);
 
        LASSERT (ne->kpne_shutdown);            /* caller must have issued shutdown already */
-       LASSERT (atomic_read (&ne->kpne_refcount) == 0); /* can't be busy */
        LASSERT (!in_interrupt());
 
        write_lock_irqsave (&kpr_rwlock, flags);
-
        list_del (&ne->kpne_list);
-
        write_unlock_irqrestore (&kpr_rwlock, flags);
 
+        /* Wait until all outstanding messages/notifications have completed */
+       while (atomic_read (&ne->kpne_refcount) != 0)
+       {
+               CDEBUG (D_NET, "Waiting for refcount on NAL %d to reach zero (%d)\n",
+                       ne->kpne_interface.kprni_nalid, atomic_read (&ne->kpne_refcount));
+
+               set_current_state (TASK_UNINTERRUPTIBLE);
+               schedule_timeout (HZ);
+       }
+
        PORTAL_FREE (ne, sizeof (*ne));
         PORTAL_MODULE_UNUSE;
 }
@@ -378,12 +369,15 @@ kpr_lookup_target (void *arg, ptl_nid_t target_nid, int nob,
 
         CDEBUG (D_NET, "lookup "LPX64" from NAL %d\n", target_nid, 
                 ne->kpne_interface.kprni_nalid);
-
-       if (ne->kpne_shutdown)          /* caller is shutting down */
-               return (-ENOENT);
+        LASSERT (!in_interrupt());
 
        read_lock (&kpr_rwlock);
 
+       if (ne->kpne_shutdown) {        /* caller is shutting down */
+                read_unlock (&kpr_rwlock);
+               return (-ENOENT);
+        }
+
        /* Search routes for one that has a gateway to target_nid on the callers network */
 
         list_for_each (e, &kpr_routes) {
@@ -453,25 +447,26 @@ kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd)
        struct list_head    *e;
         kpr_route_entry_t   *re;
         kpr_nal_entry_t     *tmp_ne;
+        int                  rc;
 
         CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %d\n", fwd,
                 target_nid, src_ne->kpne_interface.kprni_nalid);
 
         LASSERT (nob == lib_kiov_nob (fwd->kprfd_niov, fwd->kprfd_kiov));
-        
-        atomic_inc (&kpr_queue_depth);
-       atomic_inc (&src_ne->kpne_refcount); /* source nal is busy until fwd completes */
+        LASSERT (!in_interrupt());
+
+       read_lock (&kpr_rwlock);
 
         kpr_fwd_packets++;                   /* (loose) stats accounting */
         kpr_fwd_bytes += nob + sizeof(ptl_hdr_t);
 
-       if (src_ne->kpne_shutdown)           /* caller is shutting down */
+       if (src_ne->kpne_shutdown) {         /* caller is shutting down */
+                rc = -ESHUTDOWN;
                goto out;
+        }
 
        fwd->kprfd_router_arg = src_ne;      /* stash caller's nal entry */
 
-       read_lock (&kpr_rwlock);
-
        /* Search routes for one that has a gateway to target_nid NOT on the caller's network */
 
         list_for_each (e, &kpr_routes) {
@@ -508,7 +503,9 @@ kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd)
                 kpr_update_weight (ge, nob);
 
                 fwd->kprfd_gateway_nid = ge->kpge_nid;
-                atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd completes */
+                atomic_inc (&src_ne->kpne_refcount); /* source and dest nals are */
+                atomic_inc (&dst_ne->kpne_refcount); /* busy until fwd completes */
+                atomic_inc (&kpr_queue_depth);
 
                 read_unlock (&kpr_rwlock);
 
@@ -521,18 +518,16 @@ kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd)
                 return;
        }
 
-        read_unlock (&kpr_rwlock);
+        rc = -EHOSTUNREACH;
  out:
         kpr_fwd_errors++;
 
-        CDEBUG (D_NET, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd,
-                target_nid, src_ne->kpne_interface.kprni_nalid);
+        CDEBUG (D_NET, "Failed to forward [%p] "LPX64" from NAL %d: %d\n", 
+                fwd, target_nid, src_ne->kpne_interface.kprni_nalid, rc);
 
-       /* Can't find anywhere to forward to */
-       (fwd->kprfd_callback)(fwd->kprfd_callback_arg, -EHOSTUNREACH);
+       (fwd->kprfd_callback)(fwd->kprfd_callback_arg, rc);
 
-        atomic_dec (&kpr_queue_depth);
-       atomic_dec (&src_ne->kpne_refcount);
+        read_unlock (&kpr_rwlock);
 }
 
 void
@@ -635,7 +630,7 @@ kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid,
 
 int
 kpr_sys_notify (int gateway_nalid, ptl_nid_t gateway_nid,
-            int alive, time_t when)
+                int alive, time_t when)
 {
         return (kpr_do_notify (0, gateway_nalid, gateway_nid, alive, when));
 }
@@ -695,11 +690,12 @@ kpr_del_route (int gw_nalid, ptl_nid_t gw_nid,
 }
 
 int
-kpr_get_route (int idx, int *gateway_nalid, ptl_nid_t *gateway_nid,
-               ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, int *alive)
+kpr_get_route (int idx, __u32 *gateway_nalid, ptl_nid_t *gateway_nid,
+               ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, __u32 *alive)
 {
        struct list_head  *e;
 
+        LASSERT (!in_interrupt());
        read_lock(&kpr_rwlock);
 
         for (e = kpr_routes.next; e != &kpr_routes; e = e->next) {
@@ -723,11 +719,67 @@ kpr_get_route (int idx, int *gateway_nalid, ptl_nid_t *gateway_nid,
         return (-ENOENT);
 }
 
+static int 
+kpr_nal_cmd(struct portals_cfg *pcfg, void * private)
+{
+        int err = -EINVAL;
+        ENTRY;
+
+        switch(pcfg->pcfg_command) {
+        default:
+                CDEBUG(D_IOCTL, "Inappropriate cmd: %d\n", pcfg->pcfg_command);
+                break;
+                
+        case NAL_CMD_ADD_ROUTE:
+                CDEBUG(D_IOCTL, "Adding route: [%d] "LPU64" : "LPU64" - "LPU64"\n",
+                       pcfg->pcfg_nal, pcfg->pcfg_nid, 
+                       pcfg->pcfg_nid2, pcfg->pcfg_nid3);
+                err = kpr_add_route(pcfg->pcfg_gw_nal, pcfg->pcfg_nid,
+                                    pcfg->pcfg_nid2, pcfg->pcfg_nid3);
+                break;
+
+        case NAL_CMD_DEL_ROUTE:
+                CDEBUG (D_IOCTL, "Removing routes via [%d] "LPU64" : "LPU64" - "LPU64"\n",
+                        pcfg->pcfg_gw_nal, pcfg->pcfg_nid, 
+                        pcfg->pcfg_nid2, pcfg->pcfg_nid3);
+                err = kpr_del_route (pcfg->pcfg_gw_nal, pcfg->pcfg_nid,
+                                     pcfg->pcfg_nid2, pcfg->pcfg_nid3);
+                break;
+
+        case NAL_CMD_NOTIFY_ROUTER: {
+                CDEBUG (D_IOCTL, "Notifying peer [%d] "LPU64" %s @ %ld\n",
+                        pcfg->pcfg_gw_nal, pcfg->pcfg_nid,
+                        pcfg->pcfg_flags ? "Enabling" : "Disabling",
+                        (time_t)pcfg->pcfg_nid3);
+                
+                err = kpr_sys_notify (pcfg->pcfg_gw_nal, pcfg->pcfg_nid,
+                                      pcfg->pcfg_flags, (time_t)pcfg->pcfg_nid3);
+                break;
+        }
+                
+        case NAL_CMD_GET_ROUTE:
+                CDEBUG (D_IOCTL, "Getting route [%d]\n", pcfg->pcfg_count);
+                err = kpr_get_route(pcfg->pcfg_count, &pcfg->pcfg_gw_nal,
+                                    &pcfg->pcfg_nid, 
+                                    &pcfg->pcfg_nid2, &pcfg->pcfg_nid3,
+                                    &pcfg->pcfg_flags);
+                break;
+        }
+        RETURN(err);
+}
+
+
 static void /*__exit*/
 kpr_finalise (void)
 {
         LASSERT (list_empty (&kpr_nals));
 
+        libcfs_nal_cmd_unregister(ROUTER);
+
+        PORTAL_SYMBOL_UNREGISTER(kpr_router_interface);
+
+        kpr_proc_fini();
+
         while (!list_empty (&kpr_routes)) {
                 kpr_route_entry_t *re = list_entry(kpr_routes.next,
                                                    kpr_route_entry_t,
@@ -737,11 +789,6 @@ kpr_finalise (void)
                 PORTAL_FREE(re, sizeof (*re));
         }
 
-        kpr_proc_fini();
-
-        PORTAL_SYMBOL_UNREGISTER(kpr_router_interface);
-        PORTAL_SYMBOL_UNREGISTER(kpr_control_interface);
-
         CDEBUG(D_MALLOC, "kpr_finalise: kmem back to %d\n",
                atomic_read(&portal_kmemory));
 }
@@ -749,14 +796,21 @@ kpr_finalise (void)
 static int __init
 kpr_initialise (void)
 {
+        int     rc;
+        
         CDEBUG(D_MALLOC, "kpr_initialise: kmem %d\n",
                atomic_read(&portal_kmemory));
 
         kpr_routes_generation = 0;
         kpr_proc_init();
 
+        rc = libcfs_nal_cmd_register(ROUTER, kpr_nal_cmd, NULL);
+        if (rc != 0) {
+                CERROR("Can't register nal cmd handler\n");
+                return (rc);
+        }
+        
         PORTAL_SYMBOL_REGISTER(kpr_router_interface);
-        PORTAL_SYMBOL_REGISTER(kpr_control_interface);
         return (0);
 }
 
@@ -767,5 +821,4 @@ MODULE_LICENSE("GPL");
 module_init (kpr_initialise);
 module_exit (kpr_finalise);
 
-EXPORT_SYMBOL (kpr_control_interface);
 EXPORT_SYMBOL (kpr_router_interface);
index 611d808..27e4983 100644 (file)
@@ -93,20 +93,12 @@ extern void kpr_deregister_nal (void *arg);
 extern void kpr_proc_init (void);
 extern void kpr_proc_fini (void);
 
-extern int kpr_add_route (int gateway_nal, ptl_nid_t gateway_nid, 
-                          ptl_nid_t lo_nid, ptl_nid_t hi_nid);
-extern int kpr_del_route (int gw_nal, ptl_nid_t gw_nid,
-                          ptl_nid_t lo, ptl_nid_t hi);
-extern int kpr_get_route (int idx, int *gateway_nal, ptl_nid_t *gateway_nid, 
-                          ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, int *alive);
-extern int kpr_sys_notify (int gw_nalid, ptl_nid_t gw_nid,
-                           int alive, time_t when);
-
 extern unsigned int       kpr_routes_generation;
 extern unsigned long long kpr_fwd_bytes;
 extern unsigned long      kpr_fwd_packets;
 extern unsigned long      kpr_fwd_errors;
 extern atomic_t           kpr_queue_depth;
+
 extern struct list_head   kpr_routes;
 extern rwlock_t           kpr_rwlock;
 
index 85c0d71..7a3f8a0 100644 (file)
@@ -46,7 +46,7 @@ static struct pingcli_data *client = NULL;
 static int count = 0;
 
 static void
-pingcli_shutdown(int err)
+pingcli_shutdown(ptl_handle_ni_t nih, int err)
 {
         int rc;
 
@@ -70,7 +70,7 @@ pingcli_shutdown(int err)
                         if ((rc = PtlMEUnlink (client->me)))
                                 PDEBUG ("PtlMEUnlink", rc);
                 case 3:
-                        kportal_put_ni (client->args->ioc_nal);
+                        PtlNIFini(nih);
 
                 case 4:
                         /* Free our buffers */
@@ -84,29 +84,27 @@ pingcli_shutdown(int err)
         CDEBUG (D_OTHER, "ping client released resources\n");
 } /* pingcli_shutdown() */
 
-static int pingcli_callback(ptl_event_t *ev)
+static void pingcli_callback(ptl_event_t *ev)
 {
         int i, magic;
-        i = *(int *)(ev->mem_desc.start + ev->offset + sizeof(unsigned));
-        magic = *(int *)(ev->mem_desc.start + ev->offset);
+        i = *(int *)(ev->md.start + ev->offset + sizeof(unsigned));
+        magic = *(int *)(ev->md.start + ev->offset);
 
         if(magic != 0xcafebabe) {
-                printk ("LustreError: Unexpected response \n");
-                return 1;
+                CERROR("Unexpected response %x\n", magic);
         }
 
         if((i == count) || !count)
                 wake_up_process (client->tsk);
         else
-                printk ("LustreError: Received response after timeout for %d\n",i);
-        return 1;
+                CERROR("Received response after timeout for %d\n",i);
 }
 
 
 static struct pingcli_data *
 pingcli_start(struct portal_ioctl_data *args)
 {
-        ptl_handle_ni_t *nip;
+        ptl_handle_ni_t nih = PTL_INVALID_HANDLE;
         unsigned ping_head_magic = PING_HEADER_MAGIC;
         unsigned ping_bulk_magic = PING_BULK_MAGIC;
         int rc;
@@ -127,7 +125,7 @@ pingcli_start(struct portal_ioctl_data *args)
         if (client->outbuf == NULL)
         {
                 CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE);
-                pingcli_shutdown (4);
+                pingcli_shutdown (nih, 4);
                 return (NULL);
         }
 
@@ -136,23 +134,24 @@ pingcli_start(struct portal_ioctl_data *args)
         if (client->inbuf == NULL)
         {
                 CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE);
-                pingcli_shutdown (4);
+                pingcli_shutdown (nih, 4);
                 return (NULL);
         }
 
         /* Aquire and initialize the proper nal for portals. */
-        if ((nip = kportal_get_ni (args->ioc_nal)) == NULL)
+        rc = PtlNIInit(args->ioc_nal, 0, NULL, NULL, &nih);
+        if (rc != PTL_OK || rc != PTL_IFACE_DUP)
         {
                 CERROR ("NAL %d not loaded\n", args->ioc_nal);
-                pingcli_shutdown (4);
+                pingcli_shutdown (nih, 4);
                 return (NULL);
         }
 
         /* Based on the initialization aquire our unique portal ID. */
-        if ((rc = PtlGetId (*nip, &client->myid)))
+        if ((rc = PtlGetId (nih, &client->myid)))
         {
                 CERROR ("PtlGetId error %d\n", rc);
-                pingcli_shutdown (2);
+                pingcli_shutdown (nih, 2);
                 return (NULL);
         }
 
@@ -164,20 +163,20 @@ pingcli_start(struct portal_ioctl_data *args)
         client->id_remote.nid = args->ioc_nid;
         client->id_remote.pid = 0;
 
-        if ((rc = PtlMEAttach (*nip, PTL_PING_CLIENT,
+        if ((rc = PtlMEAttach (nih, PTL_PING_CLIENT,
                    client->id_local, 0, ~0, PTL_RETAIN,
                    PTL_INS_AFTER, &client->me)))
         {
                 CERROR ("PtlMEAttach error %d\n", rc);
-                pingcli_shutdown (2);
+                pingcli_shutdown (nih, 2);
                 return (NULL);
         }
 
         /* Allocate the event queue for this network interface */
-        if ((rc = PtlEQAlloc (*nip, 64, pingcli_callback, &client->eq)))
+        if ((rc = PtlEQAlloc (nih, 64, pingcli_callback, &client->eq)))
         {
                 CERROR ("PtlEQAlloc error %d\n", rc);
-                pingcli_shutdown (2);
+                pingcli_shutdown (nih, 2);
                 return (NULL);
         }
 
@@ -187,35 +186,35 @@ pingcli_start(struct portal_ioctl_data *args)
         client->md_in_head.length    = (args->ioc_size + STDSIZE)
                                                 * count;
         client->md_in_head.threshold = PTL_MD_THRESH_INF;
-        client->md_in_head.options   = PTL_MD_OP_PUT;
+        client->md_in_head.options   = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
         client->md_in_head.user_ptr  = NULL;
-        client->md_in_head.eventq    = client->eq;
+        client->md_in_head.eq_handle = client->eq;
         memset (client->inbuf, 0, (args->ioc_size + STDSIZE) * count);
 
         /* Attach the incoming buffer */
         if ((rc = PtlMDAttach (client->me, client->md_in_head,
                               PTL_UNLINK, &client->md_in_head_h))) {
                 CERROR ("PtlMDAttach error %d\n", rc);
-                pingcli_shutdown (1);
+                pingcli_shutdown (nih, 1);
                 return (NULL);
         }
         /* Setup the outgoing ping header */
         client->md_out_head.start     = client->outbuf;
         client->md_out_head.length    = STDSIZE + args->ioc_size;
         client->md_out_head.threshold = args->ioc_count;
-        client->md_out_head.options   = PTL_MD_OP_PUT;
+        client->md_out_head.options   = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
         client->md_out_head.user_ptr  = NULL;
-        client->md_out_head.eventq    = PTL_EQ_NONE;
+        client->md_out_head.eq_handle = PTL_EQ_NONE;
 
         memcpy (client->outbuf, &ping_head_magic, sizeof(ping_bulk_magic));
 
         count = 0;
 
         /* Bind the outgoing ping header */
-        if ((rc=PtlMDBind (*nip, client->md_out_head,
-                                        &client->md_out_head_h))) {
+        if ((rc=PtlMDBind (nih, client->md_out_head,
+                           PTL_UNLINK, &client->md_out_head_h))) {
                 CERROR ("PtlMDBind error %d\n", rc);
-                pingcli_shutdown (1);
+                pingcli_shutdown (nih, 1);
                 return NULL;
         }
         while ((args->ioc_count - count)) {
@@ -230,20 +229,20 @@ pingcli_start(struct portal_ioctl_data *args)
                 if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ,
                           client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) {
                          PDEBUG ("PtlPut (header)", rc);
-                         pingcli_shutdown (1);
+                         pingcli_shutdown (nih, 1);
                          return NULL;
                 }
-                printk ("Lustre: sent msg no %d", count);
+                CWARN ("Lustre: sent msg no %d", count);
 
                 set_current_state (TASK_INTERRUPTIBLE);
                 rc = schedule_timeout (20 * args->ioc_timeout);
                 if (rc == 0) {
-                        printk ("LustreError:   ::  timeout .....\n");
+                        CERROR ("timeout .....\n");
                 } else {
                         do_gettimeofday (&tv2);
-                        printk("Lustre:   ::  Reply in %u usec\n",
-                                (unsigned)((tv2.tv_sec - tv1.tv_sec)
-                                 * 1000000 +  (tv2.tv_usec - tv1.tv_usec)));
+                        CWARN("Reply in %u usec\n",
+                              (unsigned)((tv2.tv_sec - tv1.tv_sec)
+                                         * 1000000 +  (tv2.tv_usec - tv1.tv_usec)));
                 }
                 count++;
         }
@@ -255,7 +254,7 @@ pingcli_start(struct portal_ioctl_data *args)
                 PORTAL_FREE (client->inbuf,
                                (args->ioc_size + STDSIZE) * args->ioc_count);
 
-        pingcli_shutdown (2);
+        pingcli_shutdown (nih, 2);
 
         /* Success! */
         return NULL;
index 1e40ed8..dec806a 100644 (file)
@@ -81,7 +81,7 @@ static void *pingsrv_shutdown(int err)
                                         PDEBUG ("PtlMEUnlink", rc);
 
                 case 3:
-                        kportal_put_ni (nal);
+                        PtlNIFini (server->ni);
 
                 case 4:
                         
@@ -116,12 +116,12 @@ int pingsrv_thread(void *arg)
                         continue;
                 }
                
-                magic =  *((int *)(server->evnt.mem_desc.start 
+                magic =  *((int *)(server->evnt.md.start 
                                         + server->evnt.offset));
                 
                 
                 if(magic != 0xdeadbeef) {
-                        printk("LustreError: Unexpected Packet to the server\n");
+                        CERROR("Unexpected Packet to the server\n");
                         
                 } 
                 memcpy (server->in_buf, &ping_bulk_magic, sizeof(ping_bulk_magic));
@@ -129,13 +129,13 @@ int pingsrv_thread(void *arg)
                 server->mdout.length    = server->evnt.rlength;
                 server->mdout.start     = server->in_buf;
                 server->mdout.threshold = 1; 
-                server->mdout.options   = PTL_MD_OP_PUT;
+                server->mdout.options   = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
                 server->mdout.user_ptr  = NULL;
-                server->mdout.eventq    = PTL_EQ_NONE;
+                server->mdout.eq_handle = PTL_EQ_NONE;
        
                 /* Bind the outgoing buffer */
                 if ((rc = PtlMDBind (server->ni, server->mdout, 
-                                                &server->mdout_h))) {
+                                     PTL_UNLINK, &server->mdout_h))) {
                          PDEBUG ("PtlMDBind", rc);
                          pingsrv_shutdown (1);
                          return 1;
@@ -145,9 +145,9 @@ int pingsrv_thread(void *arg)
                 server->mdin.start     = server->in_buf;
                 server->mdin.length    = MAXSIZE;
                 server->mdin.threshold = 1; 
-                server->mdin.options   = PTL_MD_OP_PUT;
+                server->mdin.options   = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
                 server->mdin.user_ptr  = NULL;
-                server->mdin.eventq    = server->eq;
+                server->mdin.eq_handle = server->eq;
         
                 if ((rc = PtlMDAttach (server->me, server->mdin,
                         PTL_UNLINK, &server->mdin_h))) {
@@ -167,49 +167,49 @@ int pingsrv_thread(void *arg)
         return 0;    
 }
 
-static int pingsrv_packet(ptl_event_t *ev)
+static void pingsrv_packet(ptl_event_t *ev)
 {
         atomic_inc (&pkt);
         wake_up_process (server->tsk);
-        return 1;
 } /* pingsrv_head() */
 
-static int pingsrv_callback(ptl_event_t *ev)
+static void pingsrv_callback(ptl_event_t *ev)
 {
         
         if (ev == NULL) {
                 CERROR ("null in callback, ev=%p\n", ev);
-                return 0;
+                return;
         }
         server->evnt = *ev;
         
-        printk ("Lustre: received ping from nid "LPX64" "
+        CWARN ("received ping from nid "LPX64" "
                "(off=%u rlen=%u mlen=%u head=%x seq=%d size=%d)\n",
                ev->initiator.nid, ev->offset, ev->rlength, ev->mlength,
-               *((int *)(ev->mem_desc.start + ev->offset)),
-               *((int *)(ev->mem_desc.start + ev->offset + sizeof(unsigned))),
-               *((int *)(ev->mem_desc.start + ev->offset + 2 * 
+               *((int *)(ev->md.start + ev->offset)),
+               *((int *)(ev->md.start + ev->offset + sizeof(unsigned))),
+               *((int *)(ev->md.start + ev->offset + 2 * 
                                sizeof(unsigned))));
         
         packets_valid++;
 
-        return pingsrv_packet(ev);
+        pingsrv_packet(ev);
         
 } /* pingsrv_callback() */
 
 
 static struct pingsrv_data *pingsrv_setup(void)
 {
-        ptl_handle_ni_t *nip;
         int rc;
 
+        server->ni = PTL_INVALID_HANDLE;
+
        /* Aquire and initialize the proper nal for portals. */
-        if ((nip = kportal_get_ni (nal)) == NULL) {
+        rc = PtlNIInit(nal, 0, NULL, NULL, &server->ni);
+        if (!(rc == PTL_OK || rc == PTL_IFACE_DUP)) {
                 CDEBUG (D_OTHER, "NAL %d not loaded\n", nal);
                 return pingsrv_shutdown (4);
         }
 
-        server->ni= *nip;
 
         /* Based on the initialization aquire our unique portal ID. */
         if ((rc = PtlGetId (server->ni, &server->my_id))) {
@@ -229,7 +229,7 @@ static struct pingsrv_data *pingsrv_setup(void)
         }
 
 
-        if ((rc = PtlEQAlloc (server->ni, 1024, pingsrv_callback,
+        if ((rc = PtlEQAlloc (server->ni, 1024, &pingsrv_callback,
                                         &server->eq))) {
                 PDEBUG ("PtlEQAlloc (callback)", rc);
                 return pingsrv_shutdown (2);
@@ -245,9 +245,9 @@ static struct pingsrv_data *pingsrv_setup(void)
         server->mdin.start     = server->in_buf;
         server->mdin.length    = MAXSIZE;
         server->mdin.threshold = 1; 
-        server->mdin.options   = PTL_MD_OP_PUT;
+        server->mdin.options   = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
         server->mdin.user_ptr  = NULL;
-        server->mdin.eventq    = server->eq;
+        server->mdin.eq_handle = server->eq;
         memset (server->in_buf, 0, STDSIZE);
         
         if ((rc = PtlMDAttach (server->me, server->mdin,
@@ -298,7 +298,7 @@ static void /*__exit*/ pingsrv_cleanup(void)
 
 MODULE_PARM(nal, "i");
 MODULE_PARM_DESC(nal, "Use the specified NAL "
-                "(6-kscimacnal, 2-ksocknal, 1-kqswnal)");
+                "(2-ksocknal, 1-kqswnal)");
  
 MODULE_AUTHOR("Brian Behlendorf (LLNL)");
 MODULE_DESCRIPTION("A kernel space ping server for portals testing");
index 64a1dd2..730ba00 100644 (file)
@@ -51,7 +51,7 @@ static struct pingcli_data *client = NULL;
 static int count = 0;
 
 static void
-pingcli_shutdown(int err)
+pingcli_shutdown(ptl_handle_ni_t nih, int err)
 {
         int rc;
 
@@ -72,7 +72,7 @@ pingcli_shutdown(int err)
                         if ((rc = PtlMEUnlink (client->me)))
                                 PDEBUG ("PtlMEUnlink", rc);
                 case 3:
-                        kportal_put_ni (client->args->ioc_nal);
+                        PtlNIFini (nih);
 
                 case 4:
                         /* Free our buffers */
@@ -92,17 +92,16 @@ pingcli_shutdown(int err)
         CDEBUG (D_OTHER, "ping client released resources\n");
 } /* pingcli_shutdown() */
 
-static int pingcli_callback(ptl_event_t *ev)
+static void pingcli_callback(ptl_event_t *ev)
 {
-                wake_up_process (client->tsk);
-        return 1;
+        wake_up_process (client->tsk);
 }
 
 
 static struct pingcli_data *
 pingcli_start(struct portal_ioctl_data *args)
 {
-        const ptl_handle_ni_t *nip;
+        ptl_handle_ni_t nih = PTL_INVALID_HANDLE;
         unsigned ping_head_magic = PING_HEADER_MAGIC;
         char str[PTL_NALFMT_SIZE];
         int rc;
@@ -122,7 +121,7 @@ pingcli_start(struct portal_ioctl_data *args)
         if (client->outbuf == NULL)
         {
                 CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE);
-                pingcli_shutdown (4);
+                pingcli_shutdown (nih, 4);
                 return (NULL);
         }
 
@@ -131,23 +130,24 @@ pingcli_start(struct portal_ioctl_data *args)
         if (client->inbuf == NULL)
         {
                 CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE);
-                pingcli_shutdown (4);
+                pingcli_shutdown (nih, 4);
                 return (NULL);
         }
 
         /* Aquire and initialize the proper nal for portals. */
-        if ((nip = kportal_get_ni (args->ioc_nal)) == NULL)
+        rc = PtlNIInit(args->ioc_nal, 0, NULL, NULL, &nih);
+        if (rc != PTL_OK && rc != PTL_IFACE_DUP)
         {
                 CERROR ("NAL %d not loaded.\n", args->ioc_nal);
-                pingcli_shutdown (4);
+                pingcli_shutdown (nih, 4);
                 return (NULL);
         }
 
         /* Based on the initialization aquire our unique portal ID. */
-        if ((rc = PtlGetId (*nip, &client->myid)))
+        if ((rc = PtlGetId (nih, &client->myid)))
         {
                 CERROR ("PtlGetId error %d\n", rc);
-                pingcli_shutdown (2);
+                pingcli_shutdown (nih, 2);
                 return (NULL);
         }
 
@@ -159,20 +159,20 @@ pingcli_start(struct portal_ioctl_data *args)
         client->id_remote.nid = args->ioc_nid;
         client->id_remote.pid = 0;
 
-        if ((rc = PtlMEAttach (*nip, PTL_PING_CLIENT,
+        if ((rc = PtlMEAttach (nih, PTL_PING_CLIENT,
                    client->id_local, 0, ~0, PTL_RETAIN,
                    PTL_INS_AFTER, &client->me)))
         {
                 CERROR ("PtlMEAttach error %d\n", rc);
-                pingcli_shutdown (2);
+                pingcli_shutdown (nih, 2);
                 return (NULL);
         }
 
         /* Allocate the event queue for this network interface */
-        if ((rc = PtlEQAlloc (*nip, 64, pingcli_callback, &client->eq)))
+        if ((rc = PtlEQAlloc (nih, 64, pingcli_callback, &client->eq)))
         {
                 CERROR ("PtlEQAlloc error %d\n", rc);
-                pingcli_shutdown (2);
+                pingcli_shutdown (nih, 2);
                 return (NULL);
         }
 
@@ -180,16 +180,16 @@ pingcli_start(struct portal_ioctl_data *args)
         client->md_in_head.start     = client->inbuf;
         client->md_in_head.length    = STDSIZE;
         client->md_in_head.threshold = 1;
-        client->md_in_head.options   = PTL_MD_OP_PUT;
+        client->md_in_head.options   = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
         client->md_in_head.user_ptr  = NULL;
-        client->md_in_head.eventq    = client->eq;
+        client->md_in_head.eq_handle = client->eq;
         memset (client->inbuf, 0, STDSIZE);
 
         /* Attach the incoming buffer */
         if ((rc = PtlMDAttach (client->me, client->md_in_head,
                               PTL_UNLINK, &client->md_in_head_h))) {
                 CERROR ("PtlMDAttach error %d\n", rc);
-                pingcli_shutdown (1);
+                pingcli_shutdown (nih, 1);
                 return (NULL);
         }
 
@@ -197,24 +197,24 @@ pingcli_start(struct portal_ioctl_data *args)
         client->md_out_head.start     = client->outbuf;
         client->md_out_head.length    = STDSIZE;
         client->md_out_head.threshold = 1;
-        client->md_out_head.options   = PTL_MD_OP_PUT;
+        client->md_out_head.options   = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
         client->md_out_head.user_ptr  = NULL;
-        client->md_out_head.eventq    = PTL_EQ_NONE;
+        client->md_out_head.eq_handle = PTL_EQ_NONE;
 
         memcpy (client->outbuf, &ping_head_magic, sizeof(ping_head_magic));
 
         /* Bind the outgoing ping header */
-        if ((rc=PtlMDBind (*nip, client->md_out_head,
-                                        &client->md_out_head_h))) {
+        if ((rc=PtlMDBind (nih, client->md_out_head,
+                           PTL_UNLINK, &client->md_out_head_h))) {
                 CERROR ("PtlMDBind error %d\n", rc);
-                pingcli_shutdown (1);
+                pingcli_shutdown (nih, 1);
                 return (NULL);
         }
         /* Put the ping packet */
         if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ,
                          client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) {
                 PDEBUG ("PtlPut (header)", rc);
-                pingcli_shutdown (1);
+                pingcli_shutdown (nih, 1);
                 return NULL;
         }
 
@@ -222,14 +222,14 @@ pingcli_start(struct portal_ioctl_data *args)
         set_current_state (TASK_INTERRUPTIBLE);
         rc = schedule_timeout (20 * args->ioc_timeout);
         if (rc == 0) {
-                printk ("LustreError: Time out on the server\n");
-                pingcli_shutdown (2);
+                CERROR ("Time out on the server\n");
+                pingcli_shutdown (nih, 2);
                 return NULL;
-        } else
-                printk("Lustre: Received respose from the server \n");
-
+        } else {
+                CWARN("Received respose from the server \n");
+        }
 
-        pingcli_shutdown (2);
+        pingcli_shutdown (nih, 2);
 
         /* Success! */
         return NULL;
index b8bda29..f2382d1 100644 (file)
@@ -53,7 +53,7 @@
 
 #define STDSIZE (sizeof(int) + sizeof(int) + 4)
 
-static int nal  = 0;                            // Your NAL,
+static int nal  = PTL_IFACE_DEFAULT;            // Your NAL,
 static unsigned long packets_valid = 0;         // Valid packets 
 static int running = 1;
 atomic_t pkt;
@@ -86,7 +86,7 @@ static void *pingsrv_shutdown(int err)
                                         PDEBUG ("PtlMEUnlink", rc);
 
                 case 3:
-                        kportal_put_ni (nal);
+                        PtlNIFini(server->ni);
 
                 case 4:
                         
@@ -121,13 +121,13 @@ int pingsrv_thread(void *arg)
                 server->mdout.start     = server->in_buf;
                 server->mdout.length    = STDSIZE;
                 server->mdout.threshold = 1; 
-                server->mdout.options   = PTL_MD_OP_PUT;
+                server->mdout.options   = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
                 server->mdout.user_ptr  = NULL;
-                server->mdout.eventq    = PTL_EQ_NONE;
+                server->mdout.eq_handle = PTL_EQ_NONE;
        
                 /* Bind the outgoing buffer */
                 if ((rc = PtlMDBind (server->ni, server->mdout, 
-                                                &server->mdout_h))) {
+                                     PTL_UNLINK, &server->mdout_h))) {
                          PDEBUG ("PtlMDBind", rc);
                          pingsrv_shutdown (1);
                          return 1;
@@ -137,9 +137,9 @@ int pingsrv_thread(void *arg)
                 server->mdin.start     = server->in_buf;
                 server->mdin.length    = STDSIZE;
                 server->mdin.threshold = 1; 
-                server->mdin.options   = PTL_MD_OP_PUT;
+                server->mdin.options   = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
                 server->mdin.user_ptr  = NULL;
-                server->mdin.eventq    = server->eq;
+                server->mdin.eq_handle = server->eq;
         
                 if ((rc = PtlMDAttach (server->me, server->mdin,
                         PTL_UNLINK, &server->mdin_h))) {
@@ -159,47 +159,46 @@ int pingsrv_thread(void *arg)
         return 0;    
 }
 
-static int pingsrv_packet(ptl_event_t *ev)
+static void pingsrv_packet(ptl_event_t *ev)
 {
         atomic_inc (&pkt);
         wake_up_process (server->tsk);
-        return 1;
 } /* pingsrv_head() */
 
-static int pingsrv_callback(ptl_event_t *ev)
+static void pingsrv_callback(ptl_event_t *ev)
 {
         
         if (ev == NULL) {
                 CERROR ("null in callback, ev=%p\n", ev);
-                return 0;
+                return;
         }
         server->evnt = *ev;
         
-        printk ("Lustre: received ping from nid "LPX64" "
-               "(off=%u rlen=%u mlen=%u head=%x)\n",
-               ev->initiator.nid, ev->offset, ev->rlength, ev->mlength,
-               *((int *)(ev->mem_desc.start + ev->offset)));
+        CWARN("Lustre: received ping from nid "LPX64" "
+              "(off=%u rlen=%u mlen=%u head=%x)\n",
+              ev->initiator.nid, ev->offset, ev->rlength, ev->mlength,
+              *((int *)(ev->md.start + ev->offset)));
         
         packets_valid++;
 
-        return pingsrv_packet(ev);
+        pingsrv_packet(ev);
         
 } /* pingsrv_callback() */
 
 
 static struct pingsrv_data *pingsrv_setup(void)
 {
-        ptl_handle_ni_t *nip;
         int rc;
 
        /* Aquire and initialize the proper nal for portals. */
-        if ((nip = kportal_get_ni (nal)) == NULL) {
+        server->ni = PTL_INVALID_HANDLE;
+
+        rc = PtlNIInit(nal, 0, NULL, NULL, &server->ni);
+        if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
                 CDEBUG (D_OTHER, "Nal %d not loaded.\n", nal);
                 return pingsrv_shutdown (4);
         }
 
-        server->ni= *nip;
-
         /* Based on the initialization aquire our unique portal ID. */
         if ((rc = PtlGetId (server->ni, &server->my_id))) {
                 PDEBUG ("PtlGetId", rc);
@@ -234,9 +233,9 @@ static struct pingsrv_data *pingsrv_setup(void)
         server->mdin.start     = server->in_buf;
         server->mdin.length    = STDSIZE;
         server->mdin.threshold = 1; 
-        server->mdin.options   = PTL_MD_OP_PUT;
+        server->mdin.options   = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
         server->mdin.user_ptr  = NULL;
-        server->mdin.eventq    = server->eq;
+        server->mdin.eq_handle = server->eq;
         memset (server->in_buf, 0, STDSIZE);
         
         if ((rc = PtlMDAttach (server->me, server->mdin,
@@ -285,7 +284,7 @@ static void /*__exit*/ pingsrv_cleanup(void)
 
 MODULE_PARM(nal, "i");
 MODULE_PARM_DESC(nal, "Use the specified NAL "
-                "(6-kscimacnal, 2-ksocknal, 1-kqswnal)");
+                "(2-ksocknal, 1-kqswnal)");
  
 MODULE_AUTHOR("Brian Behlendorf (LLNL)");
 MODULE_DESCRIPTION("A kernel space ping server for portals testing");
index 15080b0..3437d39 100644 (file)
@@ -1,13 +1,10 @@
 if LIBLUSTRE
+if !CRAY_PORTALS
 noinst_LIBRARIES = libtcpnal.a
 endif
+endif
 
-noinst_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h     \
-       ipmap.h bridge.h procbridge.h
-
-libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h     \
-       dispatch.h table.h timer.h address.c procapi.c proclib.c        \
-       connection.c tcpnal.c connection.h
-
+noinst_HEADERS =  pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h
+libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
 libtcpnal_a_CPPFLAGS = $(LLCPPFLAGS)
 libtcpnal_a_CFLAGS = $(LLCFLAGS)
index 6507924..f329e2a 100644 (file)
@@ -91,8 +91,8 @@ void set_address(bridge t,ptl_pid_t pidrequest)
     int port;
     if (pidrequest==(unsigned short)PTL_PID_ANY) port = 0;
     else port=pidrequest;
-    t->nal_cb->ni.nid=get_node_id();
-    t->nal_cb->ni.pid=port;
+    t->lib_nal->libnal_ni.ni_pid.nid=get_node_id();
+    t->lib_nal->libnal_ni.ni_pid.pid=port;
 }
 #else
 
@@ -120,10 +120,9 @@ void set_address(bridge t,ptl_pid_t pidrequest)
     in_addr = get_node_id();
 
     t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */
-    t->nal_cb->ni.nid = ((in_addr & PNAL_HOSTID_MASK) 
-                            << PNAL_VNODE_SHIFT)
-        + virtnode;
-
+    t->lib_nal->libnal_ni.ni_pid.nid = ((in_addr & PNAL_HOSTID_MASK) 
+                                        << PNAL_VNODE_SHIFT)
+                                       + virtnode;
     pid=pidrequest;
     /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. */
 #ifdef notyet
@@ -141,6 +140,6 @@ void set_address(bridge t,ptl_pid_t pidrequest)
             return;
         }
     else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT;
-    t->nal_cb->ni.pid=pid;
+    t->lib_nal->libnal_ni.ni_pid.pid=pid;
 }
 #endif
index 9a90ab8..d2f0f2c 100644 (file)
 #define TCPNAL_PROCBRIDGE_H
 
 #include <portals/lib-p30.h>
+#include <portals/nal.h>
+
+#define PTL_IFACE_TCP 1
+#define PTL_IFACE_ER 2
+#define PTL_IFACE_SS 3
+#define PTL_IFACE_MAX 4
 
 typedef struct bridge {
     int alive;
-    nal_cb_t *nal_cb;
+    lib_nal_t *lib_nal;
     void *lower;
     void *local;
     void (*shutdown)(struct bridge *);
@@ -22,12 +28,6 @@ typedef struct bridge {
 } *bridge;
 
 
-nal_t *bridge_init(ptl_interface_t nal,
-                   ptl_pid_t pid_request,
-                   ptl_ni_limits_t *desired,
-                   ptl_ni_limits_t *actual,
-                   int *rc);
-
 typedef int (*nal_initialize)(bridge);
 extern nal_initialize nal_table[PTL_IFACE_MAX];
 
index 7b4cecd..b399fcf 100644 (file)
@@ -201,35 +201,30 @@ static int new_connection(void *z)
     return(1);
 }
 
-/* FIXME assuming little endian, cleanup!! */
-#define __cpu_to_le64(x) ((__u64)(x))
-#define __le64_to_cpu(x) ((__u64)(x))
-#define __cpu_to_le32(x) ((__u32)(x))
-#define __le32_to_cpu(x) ((__u32)(x))
-#define __cpu_to_le16(x) ((__u16)(x))
-#define __le16_to_cpu(x) ((__u16)(x))
-
 extern ptl_nid_t tcpnal_mynid;
 
 int
 tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation)
 {
         int                 rc;
+        int                 nob;
         ptl_hdr_t           hdr;
         ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid;
 
         LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
 
         memset (&hdr, 0, sizeof (hdr));
-        hmv->magic         = __cpu_to_le32 (PORTALS_PROTO_MAGIC);
-        hmv->version_major = __cpu_to_le32 (PORTALS_PROTO_VERSION_MAJOR);
-        hmv->version_minor = __cpu_to_le32 (PORTALS_PROTO_VERSION_MINOR);
+        hmv->magic         = cpu_to_le32(PORTALS_PROTO_MAGIC);
+        hmv->version_major = cpu_to_le32(PORTALS_PROTO_VERSION_MAJOR);
+        hmv->version_minor = cpu_to_le32(PORTALS_PROTO_VERSION_MINOR);
         
-        hdr.src_nid = __cpu_to_le64 (tcpnal_mynid);
-        hdr.type    = __cpu_to_le32 (PTL_MSG_HELLO);
+        hdr.src_nid = cpu_to_le64(tcpnal_mynid);
+        hdr.type    = cpu_to_le32(PTL_MSG_HELLO);
+
+        hdr.msg.hello.type = cpu_to_le32(type);
+        hdr.msg.hello.incarnation = cpu_to_le64(incarnation);
 
-        hdr.msg.hello.type = __cpu_to_le32 (type);
-        hdr.msg.hello.incarnation = 0;
+        /* I don't send any interface info */
 
         /* Assume sufficient socket buffering for this message */
         rc = syscall(SYS_write, sockfd, &hdr, sizeof(hdr));
@@ -244,28 +239,28 @@ tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation)
                 return (rc);
         }
         
-        if (hmv->magic != __le32_to_cpu (PORTALS_PROTO_MAGIC)) {
+        if (hmv->magic != le32_to_cpu(PORTALS_PROTO_MAGIC)) {
                 CERROR ("Bad magic %#08x (%#08x expected) from "LPX64"\n",
-                        __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC, *nid);
+                        cpu_to_le32(hmv->magic), PORTALS_PROTO_MAGIC, *nid);
                 return (-EPROTO);
         }
 
-        if (hmv->version_major != __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR) ||
-            hmv->version_minor != __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR)) {
+        if (hmv->version_major != cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR) ||
+            hmv->version_minor != cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR)) {
                 CERROR ("Incompatible protocol version %d.%d (%d.%d expected)"
                         " from "LPX64"\n",
-                        __le16_to_cpu (hmv->version_major),
-                        __le16_to_cpu (hmv->version_minor),
+                        le16_to_cpu (hmv->version_major),
+                        le16_to_cpu (hmv->version_minor),
                         PORTALS_PROTO_VERSION_MAJOR,
                         PORTALS_PROTO_VERSION_MINOR,
                         *nid);
                 return (-EPROTO);
         }
 
-#if (PORTALS_PROTO_VERSION_MAJOR != 0)
-# error "This code only understands protocol version 0.x"
+#if (PORTALS_PROTO_VERSION_MAJOR != 1)
+# error "This code only understands protocol version 1.x"
 #endif
-        /* version 0 sends magic/version as the dest_nid of a 'hello' header,
+        /* version 1 sends magic/version as the dest_nid of a 'hello' header,
          * so read the rest of it in now... */
 
         rc = syscall(SYS_read, sockfd, hmv + 1, sizeof(hdr) - sizeof(*hmv));
@@ -276,27 +271,49 @@ tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation)
         }
 
         /* ...and check we got what we expected */
-        if (hdr.type != __cpu_to_le32 (PTL_MSG_HELLO) ||
-            hdr.payload_length != __cpu_to_le32 (0)) {
-                CERROR ("Expecting a HELLO hdr with 0 payload,"
+        if (hdr.type != cpu_to_le32 (PTL_MSG_HELLO)) {
+                CERROR ("Expecting a HELLO hdr "
                         " but got type %d with %d payload from "LPX64"\n",
-                        __le32_to_cpu (hdr.type),
-                        __le32_to_cpu (hdr.payload_length), *nid);
+                        le32_to_cpu (hdr.type),
+                        le32_to_cpu (hdr.payload_length), *nid);
                 return (-EPROTO);
         }
 
-        if (__le64_to_cpu(hdr.src_nid) == PTL_NID_ANY) {
+        if (le64_to_cpu(hdr.src_nid) == PTL_NID_ANY) {
                 CERROR("Expecting a HELLO hdr with a NID, but got PTL_NID_ANY\n");
                 return (-EPROTO);
         }
 
         if (*nid == PTL_NID_ANY) {              /* don't know peer's nid yet */
-                *nid = __le64_to_cpu(hdr.src_nid);
-        } else if (*nid != __le64_to_cpu (hdr.src_nid)) {
+                *nid = le64_to_cpu(hdr.src_nid);
+        } else if (*nid != le64_to_cpu (hdr.src_nid)) {
                 CERROR ("Connected to nid "LPX64", but expecting "LPX64"\n",
-                        __le64_to_cpu (hdr.src_nid), *nid);
+                        le64_to_cpu (hdr.src_nid), *nid);
+                return (-EPROTO);
+        }
+
+        /* Ignore any interface info in the payload */
+        nob = le32_to_cpu(hdr.payload_length);
+        if (nob > getpagesize()) {
+                CERROR("Unexpected HELLO payload %d from "LPX64"\n",
+                       nob, *nid);
                 return (-EPROTO);
         }
+        if (nob > 0) {
+                char *space = (char *)malloc(nob);
+                
+                if (space == NULL) {
+                        CERROR("Can't allocate scratch buffer %d\n", nob);
+                        return (-ENOMEM);
+                }
+                
+                rc = syscall(SYS_read, sockfd, space, nob);
+                if (rc <= 0) {
+                        CERROR("Error %d skipping HELLO payload from "
+                               LPX64"\n", rc, *nid);
+                        return (rc);
+                }
+        }
 
         return (0);
 }
@@ -325,6 +342,8 @@ connection force_tcp_connection(manager m,
     int rport;
     ptl_nid_t peernid = PTL_NID_ANY;
 
+    port = tcpnal_acceptor_port;
+
     id[0] = ip;
     id[1] = port;
 
@@ -366,7 +385,7 @@ connection force_tcp_connection(manager m,
                                  sizeof(struct sockaddr_in));
                     if (rc == 0) {
                             break;
-                    } else if (errno != EADDRINUSE) {
+                    } else if (errno != EADDRINUSE && errno != EADDRNOTAVAIL) {
                             perror("Error connecting to remote host");
                             close(fd);
                             goto out;
@@ -411,6 +430,7 @@ out:
     return (conn);
 }
 
+
 /* Function:  bind_socket
  * Arguments: t: the nal state for this interface
  *            port: the port to attempt to bind to
index 34dd070..a8f916d 100644 (file)
@@ -37,3 +37,10 @@ void remove_io_handler (io_handler i);
 void init_unix_timer(void);
 void select_timer_block(when until);
 when now(void);
+
+/*
+ * hacking for CFS internal MPI testing
+ */ 
+#if !CRAY_PORTALS
+#define ENABLE_SELECT_DISPATCH
+#endif
index c27f555..6b471c0 100644 (file)
@@ -60,34 +60,6 @@ void procbridge_wakeup_nal(procbridge p)
     syscall(SYS_write, p->notifier[0], buf, sizeof(buf));
 }
 
-/* Function: forward
- * Arguments: nal_t *nal: pointer to my top-side nal structure
- *            id: the command to pass to the lower layer
- *            args, args_len:pointer to and length of the request
- *            ret, ret_len:  pointer to and size of the result
- * Returns: a portals status code
- *
- * forwards a packaged api call from the 'api' side to the 'library'
- *   side, and collects the result
- */
-static int procbridge_forward(nal_t *n, int id, void *args, size_t args_len,
-                             void *ret, size_t ret_len)
-{
-    bridge b = (bridge) n->nal_data;
-
-    if (id == PTL_FINI) {
-            lib_fini(b->nal_cb);
-
-            if (b->shutdown)
-                (*b->shutdown)(b);
-    }
-
-    lib_dispatch(b->nal_cb, NULL, id, args, ret);
-
-    return (PTL_OK);
-}
-
-
 /* Function: shutdown
  * Arguments: nal: a pointer to my top side nal structure
  *            ni: my network interface index
@@ -95,9 +67,10 @@ static int procbridge_forward(nal_t *n, int id, void *args, size_t args_len,
  * cleanup nal state, reclaim the lower side thread and
  *   its state using PTL_FINI codepoint
  */
-static int procbridge_shutdown(nal_t *n, int ni)
+static void procbridge_shutdown(nal_t *n)
 {
-    bridge b=(bridge)n->nal_data;
+    lib_nal_t *nal = n->nal_data;
+    bridge b=(bridge)nal->libnal_data;
     procbridge p=(procbridge)b->local;
 
     p->nal_flags |= NAL_FLAG_STOPPING;
@@ -114,77 +87,31 @@ static int procbridge_shutdown(nal_t *n, int ni)
     } while (1);
 
     free(p);
-    return(0);
-}
-
-
-/* Function: validate
- *    useless stub
- */
-static int procbridge_validate(nal_t *nal, void *base, size_t extent)
-{
-    return(0);
 }
 
 
-/* FIXME cfs temporary workaround! FIXME
- * global time out value
- */
-int __tcpnal_eqwait_timeout_value = 0;
-int __tcpnal_eqwait_timedout = 0;
-
-/* Function: yield
- * Arguments:  pid:
- *
- *  this function was originally intended to allow the
- *   lower half thread to be scheduled to allow progress. we
- *   overload it to explicitly block until signalled by the
- *   lower half.
- */
-static void procbridge_yield(nal_t *n)
-{
-    bridge b=(bridge)n->nal_data;
-    procbridge p=(procbridge)b->local;
+/* forward decl */
+extern int procbridge_startup (nal_t *, ptl_pid_t,
+                               ptl_ni_limits_t *, ptl_ni_limits_t *);
 
-    pthread_mutex_lock(&p->mutex);
-    if (!__tcpnal_eqwait_timeout_value) {
-        pthread_cond_wait(&p->cond,&p->mutex);
-    } else {
-        struct timeval now;
-        struct timespec timeout;
-
-        gettimeofday(&now, NULL);
-        timeout.tv_sec = now.tv_sec + __tcpnal_eqwait_timeout_value;
-        timeout.tv_nsec = now.tv_usec * 1000;
-
-        __tcpnal_eqwait_timedout =
-                pthread_cond_timedwait(&p->cond, &p->mutex, &timeout);
-    }
-    pthread_mutex_unlock(&p->mutex);
-}
-
-
-static void procbridge_lock(nal_t * nal, unsigned long *flags){}
-static void procbridge_unlock(nal_t * nal, unsigned long *flags){}
 /* api_nal
  *  the interface vector to allow the generic code to access
- *  this nal. this is seperate from the library side nal_cb.
+ *  this nal. this is seperate from the library side lib_nal.
  *  TODO: should be dyanmically allocated
  */
-static nal_t api_nal = {
-    ni:       {0},
+nal_t procapi_nal = {
     nal_data: NULL,
-    forward:  procbridge_forward,
-    shutdown: procbridge_shutdown,
-    validate: procbridge_validate,
-    yield:    procbridge_yield,
-    lock:     procbridge_lock,
-    unlock:   procbridge_unlock
+    nal_ni_init: procbridge_startup,
+    nal_ni_fini: procbridge_shutdown,
 };
 
 ptl_nid_t tcpnal_mynid;
 
-/* Function: procbridge_interface
+#ifdef ENABLE_SELECT_DISPATCH
+procbridge __global_procbridge = NULL;
+#endif
+
+/* Function: procbridge_startup
  *
  * Arguments:  pid: requested process id (port offset)
  *                  PTL_ID_ANY not supported.
@@ -192,65 +119,62 @@ ptl_nid_t tcpnal_mynid;
  *                      and effectively ignored
  *             actual:  limits actually allocated and returned
  *
- * Returns: a pointer to my statically allocated top side NAL
- *          structure
+ * Returns: portals rc
  *
  * initializes the tcp nal. we define unix_failure as an
  * error wrapper to cut down clutter.
  */
-nal_t *procbridge_interface(int num_interface,
-                            ptl_pt_index_t ptl_size,
-                            ptl_ac_index_t acl_size,
-                            ptl_pid_t requested_pid)
+int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid,
+                        ptl_ni_limits_t *requested_limits,
+                        ptl_ni_limits_t *actual_limits)
 {
     nal_init_args_t args;
+
     procbridge p;
     bridge b;
-    static int initialized=0;
-    ptl_ni_limits_t limits = {-1,-1,-1,-1,-1};
+    /* XXX nal_type is purely private to tcpnal here */
     int nal_type = PTL_IFACE_TCP;/* PTL_IFACE_DEFAULT FIXME hack */
 
-    if(initialized) return (&api_nal);
+    LASSERT(nal == &procapi_nal);
 
     init_unix_timer();
 
     b=(bridge)malloc(sizeof(struct bridge));
     p=(procbridge)malloc(sizeof(struct procbridge));
-    api_nal.nal_data=b;
     b->local=p;
 
-    if (ptl_size)
-           limits.max_ptable_index = ptl_size;
-    if (acl_size)
-           limits.max_atable_index = acl_size;
-
     args.nia_requested_pid = requested_pid;
-    args.nia_limits = &limits;
+    args.nia_requested_limits = requested_limits;
+    args.nia_actual_limits = actual_limits;
     args.nia_nal_type = nal_type;
     args.nia_bridge = b;
+    args.nia_apinal = nal;
 
     /* init procbridge */
     pthread_mutex_init(&p->mutex,0);
     pthread_cond_init(&p->cond, 0);
     p->nal_flags = 0;
-    pthread_mutex_init(&p->nal_cb_lock, 0);
 
     /* initialize notifier */
     if (socketpair(AF_UNIX, SOCK_STREAM, 0, p->notifier)) {
         perror("socketpair failed");
-        return NULL;
+        return PTL_FAIL;
     }
 
     if (!register_io_handler(p->notifier[1], READ_HANDLER,
                 procbridge_notifier_handler, p)) {
         perror("fail to register notifier handler");
-        return NULL;
+        return PTL_FAIL;
     }
 
+#ifdef ENABLE_SELECT_DISPATCH
+    __global_procbridge = p;
+#endif
+
     /* create nal thread */
     if (pthread_create(&p->t, NULL, nal_thread, &args)) {
         perror("nal_init: pthread_create");
-        return(NULL);
+        return PTL_FAIL;
     }
 
     do {
@@ -264,10 +188,9 @@ nal_t *procbridge_interface(int num_interface,
     } while (1);
 
     if (p->nal_flags & NAL_FLAG_STOPPED)
-        return (NULL);
+        return PTL_FAIL;
 
-    b->nal_cb->ni.nid = tcpnal_mynid;
-    initialized = 1;
+    b->lib_nal->libnal_ni.ni_pid.nid = tcpnal_mynid;
 
-    return (&api_nal);
+    return PTL_OK;
 }
index 965f83d..1f91ced 100644 (file)
@@ -30,14 +30,15 @@ typedef struct procbridge {
 
     int nal_flags;
 
-    pthread_mutex_t nal_cb_lock;
 } *procbridge;
 
 typedef struct nal_init_args {
     ptl_pid_t        nia_requested_pid;
-    ptl_ni_limits_t *nia_limits;
+    ptl_ni_limits_t *nia_requested_limits;
+    ptl_ni_limits_t *nia_actual_limits;
     int              nia_nal_type;
     bridge           nia_bridge;
+    nal_t           *nia_apinal;
 } nal_init_args_t;
 
 extern void *nal_thread(void *);
@@ -50,10 +51,6 @@ extern void *nal_thread(void *);
 #define MAX_PTLS        128
 
 extern void set_address(bridge t,ptl_pid_t pidrequest);
-extern nal_t *procbridge_interface(int num_interface,
-                                   ptl_pt_index_t ptl_size,
-                                   ptl_ac_index_t acl_size,
-                                   ptl_pid_t requested_pid);
 extern void procbridge_wakeup_nal(procbridge p);
 
 #endif
index 2a5ba0d..7ee7c71 100644 (file)
 /* the following functions are stubs to satisfy the nal definition
    without doing anything particularily useful*/
 
-static ptl_err_t nal_write(nal_cb_t *nal,
-                           void *private,
-                           user_ptr dst_addr,
-                           void *src_addr,
-                           size_t len)
-{
-    memcpy(dst_addr, src_addr, len);
-    return PTL_OK;
-}
-
-static ptl_err_t nal_read(nal_cb_t * nal,
-                          void *private,
-                          void *dst_addr,
-                          user_ptr src_addr,
-                          size_t len)
-{
-       memcpy(dst_addr, src_addr, len);
-       return PTL_OK;
-}
-
-static void *nal_malloc(nal_cb_t *nal,
-                        size_t len)
-{
-    void *buf =  malloc(len);
-    return buf;
-}
-
-static void nal_free(nal_cb_t *nal,
-                     void *buf,
-                     size_t len)
-{
-    free(buf);
-}
-
-static void nal_printf(nal_cb_t *nal,
-                       const char *fmt,
-                       ...)
-{
-    va_list        ap;
-
-    va_start(ap, fmt);
-    vprintf(fmt, ap);
-    va_end(ap);
-}
-
-
-static void nal_cli(nal_cb_t *nal,
-                    unsigned long *flags)
-{
-    bridge b = (bridge) nal->nal_data;
-    procbridge p = (procbridge) b->local;
-
-    pthread_mutex_lock(&p->nal_cb_lock);
-}
-
-
-static void nal_sti(nal_cb_t *nal,
-                    unsigned long *flags)
-{
-    bridge b = (bridge)nal->nal_data;
-    procbridge p = (procbridge) b->local;
-
-    pthread_mutex_unlock(&p->nal_cb_lock);
-}
-
-
-static int nal_dist(nal_cb_t *nal,
+static int nal_dist(lib_nal_t *nal,
                     ptl_nid_t nid,
                     unsigned long *dist)
 {
     return 0;
 }
 
-static void wakeup_topside(void *z)
+static void check_stopping(void *z)
 {
     bridge b = z;
     procbridge p = b->local;
-    int stop;
 
+    if ((p->nal_flags & NAL_FLAG_STOPPING) == 0)
+            return;
+    
     pthread_mutex_lock(&p->mutex);
-    stop = p->nal_flags & NAL_FLAG_STOPPING;
-    if (stop)
-        p->nal_flags |= NAL_FLAG_STOPPED;
+    p->nal_flags |= NAL_FLAG_STOPPED;
     pthread_cond_broadcast(&p->cond);
     pthread_mutex_unlock(&p->mutex);
 
-    if (stop)
-        pthread_exit(0);
+    pthread_exit(0);
 }
 
 
@@ -146,9 +79,6 @@ static void wakeup_topside(void *z)
  *  We define a limit macro to place a ceiling on limits
  *   for syntactic convenience
  */
-#define LIMIT(x,y,max)\
-     if ((unsigned int)x > max) y = max;
-
 extern int tcpnal_init(bridge);
 
 nal_initialize nal_table[PTL_IFACE_MAX]={0,tcpnal_init,0};
@@ -159,46 +89,30 @@ void *nal_thread(void *z)
     bridge b = args->nia_bridge;
     procbridge p=b->local;
     int rc;
-    ptl_pid_t pid_request;
+    ptl_process_id_t process_id;
     int nal_type;
-    ptl_ni_limits_t desired;
-    ptl_ni_limits_t actual;
     
-    b->nal_cb=(nal_cb_t *)malloc(sizeof(nal_cb_t));
-    b->nal_cb->nal_data=b;
-    b->nal_cb->cb_read=nal_read;
-    b->nal_cb->cb_write=nal_write;
-    b->nal_cb->cb_malloc=nal_malloc;
-    b->nal_cb->cb_free=nal_free;
-    b->nal_cb->cb_map=NULL;
-    b->nal_cb->cb_unmap=NULL;
-    b->nal_cb->cb_printf=nal_printf;
-    b->nal_cb->cb_cli=nal_cli;
-    b->nal_cb->cb_sti=nal_sti;
-    b->nal_cb->cb_dist=nal_dist;
-
-    pid_request = args->nia_requested_pid;
-    desired = *args->nia_limits;
-    nal_type = args->nia_nal_type;
+    b->lib_nal=(lib_nal_t *)malloc(sizeof(lib_nal_t));
+    b->lib_nal->libnal_data=b;
+    b->lib_nal->libnal_map=NULL;
+    b->lib_nal->libnal_unmap=NULL;
+    b->lib_nal->libnal_dist=nal_dist;
 
-    actual = desired;
-    LIMIT(desired.max_match_entries,actual.max_match_entries,MAX_MES);
-    LIMIT(desired.max_mem_descriptors,actual.max_mem_descriptors,MAX_MDS);
-    LIMIT(desired.max_event_queues,actual.max_event_queues,MAX_EQS);
-    LIMIT(desired.max_atable_index,actual.max_atable_index,MAX_ACLS);
-    LIMIT(desired.max_ptable_index,actual.max_ptable_index,MAX_PTLS);
+    nal_type = args->nia_nal_type;
 
-    set_address(b,pid_request);
+    /* Wierd, but this sets b->lib_nal->libnal_ni.ni_pid.{nid,pid}, which
+     * lib_init() is about to do from the process_id passed to it...*/
+    set_address(b,args->nia_requested_pid);
 
+    process_id = b->lib_nal->libnal_ni.ni_pid;
+    
     if (nal_table[nal_type]) rc=(*nal_table[nal_type])(b);
     /* initialize the generic 'library' level code */
 
-    rc = lib_init(b->nal_cb, 
-                  b->nal_cb->ni.nid,
-                  b->nal_cb->ni.pid,
-                 10,
-                 actual.max_ptable_index,
-                 actual.max_atable_index);
+    rc = lib_init(b->lib_nal, args->nia_apinal, 
+                  process_id, 
+                  args->nia_requested_limits, 
+                  args->nia_actual_limits);
 
     /*
      * Whatever the initialization returned is passed back to the
@@ -207,18 +121,17 @@ void *nal_thread(void *z)
      */
     /* this should perform error checking */
     pthread_mutex_lock(&p->mutex);
-    p->nal_flags |= rc ? NAL_FLAG_STOPPED : NAL_FLAG_RUNNING;
+    p->nal_flags |= (rc != PTL_OK) ? NAL_FLAG_STOPPED : NAL_FLAG_RUNNING;
     pthread_cond_broadcast(&p->cond);
     pthread_mutex_unlock(&p->mutex);
 
-    if (!rc) {
+    if (rc == PTL_OK) {
         /* the thunk function is called each time the timer loop
            performs an operation and returns to blocking mode. we
            overload this function to inform the api side that
            it may be interested in looking at the event queue */
-        register_thunk(wakeup_topside,b);
+        register_thunk(check_stopping,b);
         timer_loop();
     }
     return(0);
 }
-#undef LIMIT
index c4ccae1..09e1542 100644 (file)
 #include <sys/time.h>
 #include <sys/types.h>
 #include <stdlib.h>
+#include <syscall.h>
+#include <pthread.h>
+#include <errno.h>
 #include <pqtimer.h>
 #include <dispatch.h>
+#include <procbridge.h>
 
 
 static struct timeval beginning_of_epoch;
@@ -95,40 +99,22 @@ void remove_io_handler (io_handler i)
     i->disabled=1;
 }
 
-static void set_flag(io_handler n,fd_set *fds)
+static void set_flag(io_handler n,fd_set *r, fd_set *w, fd_set *e)
 {
-    if (n->type & READ_HANDLER) FD_SET(n->fd, &fds[0]);
-    if (n->type & WRITE_HANDLER) FD_SET(n->fd,&fds[1]);
-    if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, &fds[2]);
+    if (n->type & READ_HANDLER) FD_SET(n->fd, r);
+    if (n->type & WRITE_HANDLER) FD_SET(n->fd, w);
+    if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, e);
 }
 
-
-/* Function: select_timer_block
- * Arguments: until: an absolute time when the select should return
- * 
- *   This function dispatches the various file descriptors' handler
- *   functions, if the kernel indicates there is io available.
- */
-void select_timer_block(when until)
+static int prepare_fd_sets(fd_set *r, fd_set *w, fd_set *e)
 {
-    fd_set fds[3];
-    struct timeval timeout;
-    struct timeval *timeout_pointer;
-    int result;
     io_handler j;
     io_handler *k;
+    int max = 0;
 
-    /* TODO: loop until the entire interval is expired*/
-    if (until){
-       when interval=until-now();
-        timeout.tv_sec=(interval>>32);
-        timeout.tv_usec=((interval<<32)/1000000)>>32;
-        timeout_pointer=&timeout;
-    } else timeout_pointer=0;
-
-    FD_ZERO(&fds[0]);
-    FD_ZERO(&fds[1]);
-    FD_ZERO(&fds[2]);
+    FD_ZERO(r);
+    FD_ZERO(w);
+    FD_ZERO(e);
     for (k=&io_handlers;*k;){
         if ((*k)->disabled){
             j=*k;
@@ -136,24 +122,291 @@ void select_timer_block(when until)
             free(j);
         }
         if (*k) {
-           set_flag(*k,fds);
+           set_flag(*k,r,w,e);
+            if ((*k)->fd > max)
+                max = (*k)->fd;
            k=&(*k)->next;
        }
     }
+    return max + 1;
+}
+
+static int execute_callbacks(fd_set *r, fd_set *w, fd_set *e)
+{
+    io_handler j;
+    int n = 0, t;
+
+    for (j = io_handlers; j; j = j->next) {
+        if (j->disabled)
+            continue;
+
+        t = 0;
+        if (FD_ISSET(j->fd, r) && (j->type & READ_HANDLER)) {
+            FD_CLR(j->fd, r);
+            t++;
+        }
+        if (FD_ISSET(j->fd, w) && (j->type & WRITE_HANDLER)) {
+            FD_CLR(j->fd, w);
+            t++;
+        }
+        if (FD_ISSET(j->fd, e) && (j->type & EXCEPTION_HANDLER)) {
+            FD_CLR(j->fd, e);
+            t++;
+        }
+        if (t == 0)
+            continue;
+
+        if (!(*j->function)(j->argument))
+            j->disabled = 1;
+
+        n += t;
+    }
+
+    return n;
+}
 
-    result=select(FD_SETSIZE, &fds[0], &fds[1], &fds[2], timeout_pointer);
+#ifdef ENABLE_SELECT_DISPATCH
 
-    if (result > 0)
-        for (j=io_handlers;j;j=j->next){
-            if (!(j->disabled) && 
-                ((FD_ISSET(j->fd, &fds[0]) && (j->type & READ_HANDLER)) ||
-                 (FD_ISSET(j->fd, &fds[1]) && (j->type & WRITE_HANDLER)) ||
-                 (FD_ISSET(j->fd, &fds[2]) && (j->type & EXCEPTION_HANDLER)))){
-                if (!(*j->function)(j->argument))
-                    j->disabled=1;
+static struct {
+    pthread_mutex_t mutex;
+    pthread_cond_t  cond;
+    int             submitted;
+    int             nready;
+    int             maxfd;
+    fd_set         *rset;
+    fd_set         *wset;
+    fd_set         *eset;
+    struct timeval *timeout;
+    struct timeval  submit_time;
+} fd_extra = {
+    PTHREAD_MUTEX_INITIALIZER,
+    PTHREAD_COND_INITIALIZER,
+    0, 0, 0,
+    NULL, NULL, NULL, NULL,
+};
+
+extern int liblustre_wait_event(int timeout);
+extern procbridge __global_procbridge;
+
+/*
+ * this will intercept syscall select() of user apps
+ * such as MPI libs.
+ */
+int select(int n, fd_set *rset, fd_set *wset, fd_set *eset,
+           struct timeval *timeout)
+{
+    LASSERT(fd_extra.submitted == 0);
+
+    fd_extra.nready = 0;
+    fd_extra.maxfd = n;
+    fd_extra.rset = rset;
+    fd_extra.wset = wset;
+    fd_extra.eset = eset;
+    fd_extra.timeout = timeout;
+
+    liblustre_wait_event(0);
+    pthread_mutex_lock(&fd_extra.mutex);
+    gettimeofday(&fd_extra.submit_time, NULL);
+    fd_extra.submitted = 1;
+    LASSERT(__global_procbridge);
+    procbridge_wakeup_nal(__global_procbridge);
+
+again:
+    if (fd_extra.submitted)
+        pthread_cond_wait(&fd_extra.cond, &fd_extra.mutex);
+    pthread_mutex_unlock(&fd_extra.mutex);
+
+    liblustre_wait_event(0);
+
+    pthread_mutex_lock(&fd_extra.mutex);
+    if (fd_extra.submitted)
+        goto again;
+    pthread_mutex_unlock(&fd_extra.mutex);
+
+    LASSERT(fd_extra.nready >= 0);
+    LASSERT(fd_extra.submitted == 0);
+    return fd_extra.nready;
+}
+
+static int merge_fds(int max, fd_set *rset, fd_set *wset, fd_set *eset)
+{
+    int i;
+
+    LASSERT(rset);
+    LASSERT(wset);
+    LASSERT(eset);
+
+    for (i = 0; i < __FD_SETSIZE/__NFDBITS; i++) {
+        LASSERT(!fd_extra.rset ||
+                !(__FDS_BITS(rset)[i] & __FDS_BITS(fd_extra.rset)[i]));
+        LASSERT(!fd_extra.wset ||
+                !(__FDS_BITS(wset)[i] & __FDS_BITS(fd_extra.wset)[i]));
+        LASSERT(!fd_extra.eset ||
+                !(__FDS_BITS(eset)[i] & __FDS_BITS(fd_extra.eset)[i]));
+
+        if (fd_extra.rset && __FDS_BITS(fd_extra.rset)[i])
+            __FDS_BITS(rset)[i] |= __FDS_BITS(fd_extra.rset)[i];
+        if (fd_extra.wset && __FDS_BITS(fd_extra.wset)[i])
+            __FDS_BITS(wset)[i] |= __FDS_BITS(fd_extra.wset)[i];
+        if (fd_extra.eset && __FDS_BITS(fd_extra.eset)[i])
+            __FDS_BITS(eset)[i] |= __FDS_BITS(fd_extra.eset)[i];
+    }
+
+    return (fd_extra.maxfd > max ? fd_extra.maxfd : max);
+}
+
+static inline
+int timeval_ge(struct timeval *tv1, struct timeval *tv2)
+{
+    LASSERT(tv1 && tv2);
+    return ((tv1->tv_sec - tv2->tv_sec) * 1000000 +
+            (tv1->tv_usec - tv2->tv_usec) >= 0);
+}
+
+/*
+ * choose the most recent timeout value
+ */
+static struct timeval *choose_timeout(struct timeval *tv1,
+                                      struct timeval *tv2)
+{
+    if (!tv1)
+        return tv2;
+    else if (!tv2)
+        return tv1;
+
+    if (timeval_ge(tv1, tv2))
+        return tv2;
+    else
+        return tv1;
+}
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time when the select should return
+ * 
+ *   This function dispatches the various file descriptors' handler
+ *   functions, if the kernel indicates there is io available.
+ */
+void select_timer_block(when until)
+{
+    fd_set fds[3];
+    struct timeval timeout;
+    struct timeval *timeout_pointer, *select_timeout;
+    int max, nready, nexec;
+    int fd_handling;
+
+again:
+    if (until) {
+        when interval;
+
+        interval = until - now();
+        timeout.tv_sec = (interval >> 32);
+        timeout.tv_usec = ((interval << 32) / 1000000) >> 32;
+        timeout_pointer = &timeout;
+    } else
+        timeout_pointer = NULL;
+
+    fd_handling = 0;
+    max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]);
+    select_timeout = timeout_pointer;
+
+    pthread_mutex_lock(&fd_extra.mutex);
+    fd_handling = fd_extra.submitted;
+    pthread_mutex_unlock(&fd_extra.mutex);
+    if (fd_handling) {
+        max = merge_fds(max, &fds[0], &fds[1], &fds[2]);
+        select_timeout = choose_timeout(timeout_pointer, fd_extra.timeout);
+    }
+
+    /* XXX only compile for linux */
+#if __WORDSIZE == 64
+    nready = syscall(SYS_select, max, &fds[0], &fds[1], &fds[2],
+                     select_timeout);
+#else
+    nready = syscall(SYS__newselect, max, &fds[0], &fds[1], &fds[2],
+                     select_timeout);
+#endif
+    if (nready < 0) {
+        CERROR("select return err %d, errno %d\n", nready, errno);
+        return;
+    }
+
+    if (nready) {
+        nexec = execute_callbacks(&fds[0], &fds[1], &fds[2]);
+        nready -= nexec;
+    } else
+        nexec = 0;
+
+    /* even both nready & nexec are 0, we still need try to wakeup
+     * upper thread since it may have timed out
+     */
+    if (fd_handling) {
+        LASSERT(nready >= 0);
+
+        pthread_mutex_lock(&fd_extra.mutex);
+        if (nready) {
+            if (fd_extra.rset)
+                *fd_extra.rset = fds[0];
+            if (fd_extra.wset)
+                *fd_extra.wset = fds[1];
+            if (fd_extra.eset)
+                *fd_extra.eset = fds[2];
+            fd_extra.nready = nready;
+            fd_extra.submitted = 0;
+        } else {
+            struct timeval t;
+
+            fd_extra.nready = 0;
+            if (fd_extra.timeout) {
+                gettimeofday(&t, NULL);
+                if (timeval_ge(&t, &fd_extra.submit_time))
+                    fd_extra.submitted = 0;
             }
         }
+
+        pthread_cond_signal(&fd_extra.cond);
+        pthread_mutex_unlock(&fd_extra.mutex);
+    }
+
+    /* haven't found portals event, go back to loop if time
+     * is not expired */
+    if (!nexec) {
+        if (timeout_pointer == NULL || now() >= until)
+            goto again;
+    }
+}
+
+#else /* !ENABLE_SELECT_DISPATCH */
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time when the select should return
+ * 
+ *   This function dispatches the various file descriptors' handler
+ *   functions, if the kernel indicates there is io available.
+ */
+void select_timer_block(when until)
+{
+    fd_set fds[3];
+    struct timeval timeout;
+    struct timeval *timeout_pointer;
+    int max, nready;
+
+again:
+    if (until) {
+        when interval;
+        interval = until - now();
+        timeout.tv_sec = (interval >> 32);
+        timeout.tv_usec = ((interval << 32) / 1000000) >> 32;
+        timeout_pointer = &timeout;
+    } else
+        timeout_pointer = NULL;
+
+    max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]);
+
+    nready = select(max, &fds[0], &fds[1], &fds[2], timeout_pointer);
+    if (nready > 0)
+        execute_callbacks(&fds[0], &fds[1], &fds[2]);
 }
+#endif /* ENABLE_SELECT_DISPATCH */
 
 /* Function: init_unix_timer()
  *   is called to initialize the library 
index 0c47f42..abb6d01 100644 (file)
@@ -55,7 +55,7 @@
  *
  * sends a packet to the peer, after insuring that a connection exists
  */
-ptl_err_t tcpnal_send(nal_cb_t *n,
+ptl_err_t tcpnal_send(lib_nal_t *n,
                       void *private,
                       lib_msg_t *cookie,
                       ptl_hdr_t *hdr,
@@ -68,7 +68,7 @@ ptl_err_t tcpnal_send(nal_cb_t *n,
                       size_t len)
 {
     connection c;
-    bridge b=(bridge)n->nal_data;
+    bridge b=(bridge)n->libnal_data;
     struct iovec tiov[257];
     static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER;
     ptl_err_t rc = PTL_OK;
@@ -142,7 +142,7 @@ ptl_err_t tcpnal_send(nal_cb_t *n,
 
 
 /* Function:  tcpnal_recv
- * Arguments: nal_cb_t *nal:     pointer to my nal control block
+ * Arguments: lib_nal_t *nal:    pointer to my nal control block
  *            void *private:     connection pointer passed through
  *                               lib_parse()
  *            lib_msg_t *cookie: passed back to portals library
@@ -154,7 +154,7 @@ ptl_err_t tcpnal_send(nal_cb_t *n,
  * blocking read of the requested data. must drain out the
  * difference of mainpulated and requested lengths from the network
  */
-ptl_err_t tcpnal_recv(nal_cb_t *n,
+ptl_err_t tcpnal_recv(lib_nal_t *n,
                       void *private,
                       lib_msg_t *cookie,
                       unsigned int niov,
@@ -217,7 +217,8 @@ static int from_connection(void *a, void *d)
     ptl_hdr_t hdr;
 
     if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))){
-        lib_parse(b->nal_cb, &hdr, c);
+        lib_parse(b->lib_nal, &hdr, c);
+        /*TODO: check error status*/
         return(1);
     }
     return(0);
@@ -239,19 +240,17 @@ int tcpnal_init(bridge b)
 {
     manager m;
         
-    b->nal_cb->cb_send=tcpnal_send;
-    b->nal_cb->cb_recv=tcpnal_recv;
+    b->lib_nal->libnal_send=tcpnal_send;
+    b->lib_nal->libnal_recv=tcpnal_recv;
     b->shutdown=tcpnal_shutdown;
     
-    if (!(m=init_connections(PNAL_PORT(b->nal_cb->ni.nid,
-                                       b->nal_cb->ni.pid),
+    if (!(m=init_connections(PNAL_PORT(b->lib_nal->libnal_ni.ni_pid.nid,
+                                       b->lib_nal->libnal_ni.ni_pid.pid),
                              from_connection,b))){
         /* TODO: this needs to shut down the
            newly created junk */
         return(PTL_NAL_FAILED);
     }
-    /* XXX cfs hack */
-    b->nal_cb->ni.pid=0;
     b->lower=m;
     return(PTL_OK);
 }
index 051bcd9..1d9f905 100644 (file)
@@ -9,14 +9,22 @@
 
 if LIBLUSTRE
 noinst_LIBRARIES = libuptlctl.a
-libuptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h
+endif
+
+libuptlctl_a_SOURCES = portals.c debug.c l_ioctl.c
 libuptlctl_a_CPPFLAGS = $(LLCPPFLAGS)
 libuptlctl_a_CFLAGS = $(LLCFLAGS)
-endif
 
-if UTILS
-sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck gmnalnid
+sbin_PROGRAMS = debugctl
+
 lib_LIBRARIES = libptlctl.a
+
+libptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h
+
+if UTILS
+if !CRAY_PORTALS
+sbin_PROGRAMS += acceptor ptlctl routerstat wirecheck gmnalnid
+endif
 endif
 
 acceptor_SOURCES = acceptor.c
@@ -24,16 +32,15 @@ acceptor_LDADD = $(LIBWRAP)
 
 wirecheck_SOURCES = wirecheck.c
 
-libptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h
-
 gmnalnid_SOURCES = gmnalnid.c
 
 ptlctl_SOURCES = ptlctl.c
 ptlctl_LDADD =  -L. -lptlctl $(LIBREADLINE) $(LIBEFENCE)
 ptlctl_DEPENDENCIES = libptlctl.a
 
+routerstat_SOURCES = routerstat.c
+
 debugctl_SOURCES = debugctl.c
 debugctl_LDADD = -L. -lptlctl $(LIBREADLINE) $(LIBEFENCE)
 debugctl_DEPENDENCIES = libptlctl.a
 
-routerstat_SOURCES = routerstat.c
index cff2235..524d128 100644 (file)
@@ -69,83 +69,31 @@ int pidfile_exists(char *name, int port)
         return (0);
 }
 
-int
-parse_size (int *sizep, char *str)
-{
-        int             size;
-        char            mod[32];
-
-        switch (sscanf (str, "%d%1[gGmMkK]", &size, mod))
-        {
-        default:
-                return (-1);
-
-        case 1:
-                *sizep = size;
-                return (0);
-
-        case 2:
-                switch (*mod)
-                {
-                case 'g':
-                case 'G':
-                        *sizep = size << 30;
-                        return (0);
-
-                case 'm':
-                case 'M':
-                        *sizep = size << 20;
-                        return (0);
-
-                case 'k':
-                case 'K':
-                        *sizep = size << 10;
-                        return (0);
-
-                default:
-                        *sizep = size;
-                        return (0);
-                }
-        }
-}
-
 void
 show_connection (int fd, __u32 net_ip)
 {
         struct hostent *h = gethostbyaddr ((char *)&net_ip, sizeof net_ip, AF_INET);
         __u32 host_ip = ntohl (net_ip);
-        int  rxmem = 0;
-        int  txmem = 0;
-        int  nonagle = 0;
         int  len;
         char host[1024];
         
-        len = sizeof (txmem);
-        if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &len) != 0)
-                perror ("Cannot get write buffer size");
-        
-        len = sizeof (rxmem);
-        if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &len) != 0)
-                perror ("Cannot get read buffer size");
-        
-        len = sizeof (nonagle);
-        if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &len) != 0)
-                perror ("Cannot get nagle");
-
         if (h == NULL)
                 snprintf (host, sizeof(host), "%d.%d.%d.%d", (host_ip >> 24) & 0xff,
                                     (host_ip >> 16) & 0xff, (host_ip >> 8) & 0xff, host_ip & 0xff);
         else
                 snprintf (host, sizeof(host), "%s", h->h_name);
                 
-        syslog (LOG_INFO, "Accepted host: %s snd: %d rcv %d nagle: %s\n", 
-                host, txmem, rxmem, nonagle ? "disabled" : "enabled");
+        syslog (LOG_INFO, "Accepted host: %s\n", host);
 }
 
 void
 usage (char *myname)
 {
-        fprintf (stderr, "Usage: %s [-r recv_mem] [-s send_mem] [-n] [-p] [-N nal_id] port\n", myname);
+        fprintf (stderr, 
+                 "Usage: %s [-N nal_id] [-p] [-l] port\n\n"
+                 " -l\tKeep stdin/stdout open\n"
+                 " -p\tAllow connections from non-privileged ports\n",
+                 myname);
         exit (1);
 }
 
@@ -154,52 +102,29 @@ int main(int argc, char **argv)
         int o, fd, rc, port, pfd;
         struct sockaddr_in srvaddr;
         int c;
-        int rxmem = 0;
-        int txmem = 0;
         int noclose = 0;
-        int nonagle = 1;
         int nal = SOCKNAL;
-        int bind_irq = 0; 
         int rport;
         int require_privports = 1;
         
-        while ((c = getopt (argc, argv, "N:pr:s:nli")) != -1)
-                switch (c)
-                {
-                case 'r':
-                        if (parse_size (&rxmem, optarg) != 0 || rxmem < 0)
-                                usage (argv[0]);
-                        break;
-                        
-                case 's':
-                        if (parse_size (&txmem, optarg) != 0 || txmem < 0)
-                                usage (argv[0]);
-                        break;
-
-                case 'n':
-                        nonagle = 0;
+        while ((c = getopt (argc, argv, "N:lp")) != -1) {
+                switch (c) {
+                case 'N':
+                        if (sscanf(optarg, "%d", &nal) != 1 ||
+                            nal < 0 || nal > NAL_MAX_NR)
+                                usage(argv[0]);
                         break;
-
                 case 'l':
                         noclose = 1;
                         break;
-
-                case 'i':
-                        bind_irq = 1;
-                        break;
                 case 'p':
                         require_privports = 0;
                         break;
-                case 'N':
-                        if (parse_size(&nal, optarg) != 0 || 
-                            nal < 0 || nal > NAL_MAX_NR)
-                                usage(argv[0]);
-                        break;
-                        
                 default:
                         usage (argv[0]);
                         break;
                 }
+        }
 
         if (optind >= argc)
                 usage (argv[0]);
@@ -226,37 +151,6 @@ int main(int argc, char **argv)
                 exit(1);
         }
 
-        if (nonagle)
-        {
-                o = 1;
-                rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o));
-                if (rc != 0) 
-                { 
-                        perror ("Cannot disable nagle");
-                        exit (1);
-                }
-        }
-
-        if (txmem != 0)
-        {
-                rc = setsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, sizeof (txmem));
-                if (rc != 0)
-                {
-                        perror ("Cannot set write buffer size");
-                        exit (1);
-                }
-        }
-        
-        if (rxmem != 0)
-        {
-                rc = setsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, sizeof (rxmem));
-                if (rc != 0)
-                {
-                        perror ("Cannot set read buffer size");
-                        exit (1);
-               }
-        }
-                
         rc = bind(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
         if ( rc == -1 ) {
                 perror("bind: ");
@@ -291,12 +185,11 @@ int main(int argc, char **argv)
                 int cfd;
                 struct portal_ioctl_data data;
                 struct portals_cfg pcfg;
-                int    privileged = 0;
-                char addrstr[INET_ADDRSTRLEN];
 #ifdef HAVE_LIBWRAP
                 struct request_info request;
 #endif
-
+                char addrstr[INET_ADDRSTRLEN];
+               
                 cfd = accept(fd, (struct sockaddr *)&clntaddr, &len);
                 if ( cfd < 0 ) {
                         perror("accept");
@@ -304,7 +197,6 @@ int main(int argc, char **argv)
                         continue;
                 }
 
-                rport = ntohs(clntaddr.sin_port);
 #ifdef HAVE_LIBWRAP
                 /* libwrap access control */
                 request_init(&request, RQ_DAEMON, "lustre", RQ_FILE, cfd, 0);
@@ -313,18 +205,20 @@ int main(int argc, char **argv)
                         inet_ntop(AF_INET, &clntaddr.sin_addr,
                                   addrstr, INET_ADDRSTRLEN);
                         syslog(LOG_WARNING, "Unauthorized access from %s:%hd\n",
-                               addrstr, rport);
+                               addrstr, ntohs(clntaddr.sin_port));
                         close (cfd);
                         continue;
                 }
 #endif
 
-                if (require_privports && rport >= IPPORT_RESERVED) {
+                if (require_privports && ntohs(clntaddr.sin_port) >= IPPORT_RESERVED) {
                         inet_ntop(AF_INET, &clntaddr.sin_addr,
                                   addrstr, INET_ADDRSTRLEN);
-                        syslog(LOG_ERR,  "Closing non-privileged connection from %s:%d\n",
-                               addrstr, rport);
-                        close(cfd);
+                        syslog(LOG_ERR, "Closing non-privileged connection from %s:%d\n",
+                               addrstr, ntohs(clntaddr.sin_port));
+                        rc = close(cfd);
+                        if (rc)
+                                perror ("close un-privileged client failed");
                         continue;
                 }
 
@@ -333,13 +227,12 @@ int main(int argc, char **argv)
                 PCFG_INIT(pcfg, NAL_CMD_REGISTER_PEER_FD);
                 pcfg.pcfg_nal = nal;
                 pcfg.pcfg_fd = cfd;
-                pcfg.pcfg_flags = bind_irq;
                 pcfg.pcfg_misc = SOCKNAL_CONN_NONE; /* == incoming connection */
-
+                
                 PORTAL_IOC_INIT(data);
                 data.ioc_pbuf1 = (char*)&pcfg;
                 data.ioc_plen1 = sizeof(pcfg);
-
+                
                 if (ioctl(pfd, IOC_PORTAL_NAL_CMD, &data) < 0) {
                         perror("ioctl failed");
                 } else {
index 538af44..afbf1cb 100644 (file)
  */
 
 #define __USE_FILE_OFFSET64
+#define  _GNU_SOURCE
 
 #include <portals/list.h>
 
 #include <stdio.h>
+#ifdef HAVE_NETDB_H
 #include <netdb.h>
+#endif
 #include <stdlib.h>
 #include <string.h>
+#include "ioctl.h"
 #include <fcntl.h>
 #include <errno.h>
 #include <unistd.h>
-#include <time.h>
 #ifndef __CYGWIN__
 # include <syscall.h>
 #endif
 #include <sys/stat.h>
 #include <sys/mman.h>
 
+#ifdef HAVE_LINUX_VERSION_H
 #include <linux/version.h>
 
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 #define BUG()                            /* workaround for module.h includes */
 #include <linux/module.h>
 #endif
+#endif /* !HAVE_LINUX_VERSION_H */
+
+#include <sys/utsname.h>
 
 #include <portals/api-support.h>
 #include <portals/ptlctl.h>
 #include "parser.h"
 
+#include <time.h>
+
 static char rawbuf[8192];
 static char *buf = rawbuf;
 static int max = 8192;
-//static int g_pfd = -1;
+/*static int g_pfd = -1;*/
 static int subsystem_mask = ~0;
 static int debug_mask = ~0;
 
 #define MAX_MARK_SIZE 100
 
 static const char *portal_debug_subsystems[] =
-        {"undefined", "mdc", "mds", "osc", "ost", "class", "log", "llite",
-         "rpc", "mgmt", "portals", "socknal", "qswnal", "pinger", "filter",
-         "ptlbd", "echo", "ldlm", "lov", "gmnal", "router", "cobd", "ibnal",
-         NULL};
+        {"undefined", "mdc", "mds", "osc", 
+         "ost", "class", "log", "llite",
+         "rpc", "mgmt", "portals", "socknal", 
+         "qswnal", "pinger", "filter", "ptlbd", 
+         "echo", "ldlm", "lov", "gmnal",
+         "router", "cobd", "ibnal", "sm",
+         "asobd", "confobd", NULL};
 static const char *portal_debug_masks[] =
-        {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl",
-         "blocks", "net", "warning", "buffs", "other", "dentry", "portals",
-         "page", "dlmtrace", "error", "emerg", "ha", "rpctrace", "vfstrace",
-         "reada", NULL};
+        {"trace", "inode", "super", "ext2", 
+         "malloc", "cache", "info", "ioctl",
+         "blocks", "net", "warning", "buffs", 
+         "other", "dentry", "portals", "page", 
+         "dlmtrace", "error", "emerg", "ha", 
+         "rpctrace", "vfstrace", "reada", NULL};
 
 struct debug_daemon_cmd {
         char *cmd;
@@ -183,9 +197,6 @@ static int applymask(char* procpath, int value)
         return 0;
 }
 
-extern char *dump_filename;
-extern int dump(int dev_id, int opc, void *buf);
-
 static void applymask_all(unsigned int subs_mask, unsigned int debug_mask)
 {
         if (!dump_filename) {
@@ -243,7 +254,7 @@ struct dbg_line {
 static void list_add_ordered(struct dbg_line *new, struct list_head *head)
 {
         struct list_head *pos;
-        struct dbg_line *curr, *next;
+        struct dbg_line *curr;
 
         list_for_each(pos, head) {
                 curr = list_entry(pos, struct dbg_line, chain);
@@ -289,7 +300,7 @@ static int parse_buffer(FILE *in, FILE *out)
         char buf[4097], *p;
         int rc;
         unsigned long dropped = 0, kept = 0;
-        struct list_head chunk_list, *pos;
+        struct list_head chunk_list;
 
         INIT_LIST_HEAD(&chunk_list);
 
@@ -371,15 +382,24 @@ int jt_dbg_debug_kernel(int argc, char **argv)
                 fprintf(stderr, "usage: %s [file] [raw]\n", argv[0]);
                 return 0;
         }
-        sprintf(filename, "%s.%lu.%u", argc > 1 ? argv[1] : "/tmp/lustre-log",
-                time(NULL), getpid());
 
-        if (argc > 2)
+        if (argc > 2) {
                 raw = atoi(argv[2]);
+        } else if (argc > 1 && (argv[1][0] == '0' || argv[1][0] == '1')) {
+                raw = atoi(argv[1]);
+                argc--;
+        } else {
+                sprintf(filename, "%s.%lu.%u", argc > 1 ? argv[1] :
+                        "/tmp/lustre-log", time(NULL), getpid());
+        }
+
         unlink(filename);
 
         fd = open("/proc/sys/portals/dump_kernel", O_WRONLY);
         if (fd < 0) {
+                if (errno == ENOENT) /* no dump file created */
+                        return 0;
+
                 fprintf(stderr, "open(dump_kernel) failed: %s\n",
                         strerror(errno));
                 return 1;
@@ -411,11 +431,15 @@ int jt_dbg_debug_kernel(int argc, char **argv)
                 if (out == NULL) {
                         fprintf(stderr, "fopen(%s) failed: %s\n", argv[1],
                                 strerror(errno));
+                        fclose(in);
                         return 1;
                 }
         }
 
         rc = parse_buffer(in, out);
+        fclose(in);
+        if (argc > 1)
+                fclose(out);
         if (rc) {
                 fprintf(stderr, "parse_buffer failed; leaving tmp file %s "
                         "behind.\n", filename);
@@ -431,23 +455,40 @@ int jt_dbg_debug_kernel(int argc, char **argv)
 
 int jt_dbg_debug_file(int argc, char **argv)
 {
+        int fdin,fdout;
         FILE *in, *out = stdout;
         if (argc > 3 || argc < 2) {
                 fprintf(stderr, "usage: %s <input> [output]\n", argv[0]);
                 return 0;
         }
 
-        in = fopen(argv[1], "r");
+        fdin = open(argv[1], O_RDONLY | O_LARGEFILE);
+        if (fdin == -1) {
+                fprintf(stderr, "open(%s) failed: %s\n", argv[1],
+                        strerror(errno));
+                return 1;
+        }
+        in = fdopen(fdin, "r");
         if (in == NULL) {
                 fprintf(stderr, "fopen(%s) failed: %s\n", argv[1],
                         strerror(errno));
+                close(fdin);
                 return 1;
         }
         if (argc > 2) {
-                out = fopen(argv[2], "w");
+                fdout = open(argv[2], O_CREAT | O_WRONLY | O_LARGEFILE);
+                if (fdout == -1) {
+                        fprintf(stderr, "open(%s) failed: %s\n", argv[2],
+                                strerror(errno));
+                        fclose(in);
+                        return 1;
+                }
+                out = fdopen(fdout, "w");
                 if (out == NULL) {
                         fprintf(stderr, "fopen(%s) failed: %s\n", argv[2],
                                 strerror(errno));
+                        fclose(in);
+                        close(fdout);
                         return 1;
                 }
         }
@@ -489,7 +530,8 @@ int jt_dbg_debug_daemon(int argc, char **argv)
                                 strncat(size, argv[3], sizeof(size) - 6);
                                 rc = write(fd, size, strlen(size));
                                 if (rc != strlen(size)) {
-                                        fprintf(stderr, "set %s failed: %s\n",                                                 size, strerror(errno));
+                                        fprintf(stderr, "set %s failed: %s\n",
+                                                size, strerror(errno));
                                 }
                         }
                 }
@@ -590,7 +632,8 @@ int jt_dbg_mark_debug_buf(int argc, char **argv)
 static struct mod_paths {
         char *name, *path;
 } mod_paths[] = {
-        {"portals", "lustre/portals/libcfs"},
+        {"libcfs", "lustre/portals/libcfs"},
+        {"portals", "lustre/portals/portals"},
         {"ksocknal", "lustre/portals/knals/socknal"},
         {"kptlrouter", "lustre/portals/router"},
         {"lvfs", "lustre/lvfs"},
@@ -603,6 +646,7 @@ static struct mod_paths {
         {"mds", "lustre/mds"},
         {"mdc", "lustre/mdc"},
         {"llite", "lustre/llite"},
+        {"smfs", "lustre/smfs"},
         {"obdecho", "lustre/obdecho"},
         {"ldlm", "lustre/ldlm"},
         {"obdfilter", "lustre/obdfilter"},
@@ -611,18 +655,22 @@ static struct mod_paths {
         {"fsfilt_ext3", "lustre/lvfs"},
         {"fsfilt_extN", "lustre/lvfs"},
         {"fsfilt_reiserfs", "lustre/lvfs"},
+        {"fsfilt_smfs", "lustre/lvfs"},
+        {"fsfilt_ldiskfs", "lustre/lvfs"},
         {"mds_ext2", "lustre/mds"},
         {"mds_ext3", "lustre/mds"},
         {"mds_extN", "lustre/mds"},
         {"ptlbd", "lustre/ptlbd"},
         {"mgmt_svc", "lustre/mgmt"},
         {"mgmt_cli", "lustre/mgmt"},
+        {"conf_obd", "lustre/obdclass"},
         {NULL, NULL}
 };
 
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-int jt_dbg_modules(int argc, char **argv)
+static int jt_dbg_modules_2_4(int argc, char **argv)
 {
+#ifdef HAVE_LINUX_VERSION_H
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         struct mod_paths *mp;
         char *path = "..";
         char *kernel = "linux";
@@ -657,9 +705,12 @@ int jt_dbg_modules(int argc, char **argv)
         }
 
         return 0;
+#endif /* Headers are 2.6-only */
+#endif /* !HAVE_LINUX_VERSION_H */
+        return -EINVAL;
 }
-#else
-int jt_dbg_modules(int argc, char **argv)
+
+static int jt_dbg_modules_2_5(int argc, char **argv)
 {
         struct mod_paths *mp;
         char *path = "..";
@@ -699,7 +750,26 @@ int jt_dbg_modules(int argc, char **argv)
 
         return 0;
 }
-#endif /* linux 2.5 */
+
+int jt_dbg_modules(int argc, char **argv)
+{
+        int rc = 0;
+        struct utsname sysinfo;
+
+        rc = uname(&sysinfo);
+        if (rc) {
+                printf("uname() failed: %s\n", strerror(errno));
+                return 0;
+        }
+
+        if (sysinfo.release[2] > '4') {
+                return jt_dbg_modules_2_5(argc, argv);
+        } else {
+                return jt_dbg_modules_2_4(argc, argv);
+        }
+
+        return 0;
+}
 
 int jt_dbg_panic(int argc, char **argv)
 {
index 1adcc8e..0671c24 100644 (file)
@@ -56,7 +56,7 @@ static struct ioc_dev ioc_dev_list[10];
 struct dump_hdr {
        int magic;
        int dev_id;
-       int opc;
+        unsigned int opc;
 };
 
 char *dump_filename;
@@ -101,7 +101,7 @@ open_ioc_dev(int dev_id)
 
 
 static int 
-do_ioctl(int dev_id, int opc, void *buf)
+do_ioctl(int dev_id, unsigned int opc, void *buf)
 {
        int fd, rc;
        
@@ -131,7 +131,7 @@ get_dump_file()
  * used, but for now it will assumed whatever app reads the file will
  * know what to do. */
 int 
-dump(int dev_id, int opc, void *buf)
+dump(int dev_id, unsigned int opc, void *buf)
 {
        FILE *fp;
        struct dump_hdr dump_hdr;
@@ -212,7 +212,7 @@ set_ioctl_dump(char * file)
 }
 
 int
-l_ioctl(int dev_id, int opc, void *buf)
+l_ioctl(int dev_id, unsigned int opc, void *buf)
 {
         return current_ioc_handler(dev_id, opc, buf);
 }
@@ -226,7 +226,7 @@ l_ioctl(int dev_id, int opc, void *buf)
  * each device used in the dump.
  */
 int 
-parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *))
+parse_dump(char * dump_file, ioc_handler_t ioc_func)
 {
        int line =0;
        struct stat st;
index 82b4022..b91295b 100644 (file)
@@ -642,68 +642,6 @@ int Parser_arg2int(const char *inp, long *result, int base)
                 return 1;
 }
 
-/* Convert human readable size string to and int; "1k" -> 1000 */
-int Parser_size (int *sizep, char *str) {
-        int size;
-        char mod[32];
-
-        switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) {
-        default:
-                return (-1);
-
-        case 1:
-                *sizep = size;
-                return (0);
-
-        case 2:
-                switch (*mod) {
-                case 'g':
-                case 'G':
-                        *sizep = size << 30;
-                        return (0);
-
-                case 'm':
-                case 'M':
-                        *sizep = size << 20;
-                        return (0);
-
-                case 'k':
-                case 'K':
-                        *sizep = size << 10;
-                        return (0);
-
-                default:
-                        *sizep = size;
-                        return (0);
-                }
-        }
-}
-
-/* Convert a string boolean to an int; "enable" -> 1 */
-int Parser_bool (int *b, char *str) {
-        if (!strcasecmp (str, "no") ||
-            !strcasecmp (str, "n") ||
-            !strcasecmp (str, "off") ||
-            !strcasecmp (str, "down") ||
-            !strcasecmp (str, "disable"))
-        {
-                *b = 0;
-                return (0);
-        }
-        
-        if (!strcasecmp (str, "yes") ||
-            !strcasecmp (str, "y") ||
-            !strcasecmp (str, "on") ||
-            !strcasecmp (str, "up") ||
-            !strcasecmp (str, "enable"))
-        {
-                *b = 1;
-                return (0);
-        }
-        
-        return (-1);
-}
-
 int Parser_quit(int argc, char **argv)
 {
         argc = argc;
index 44e8f2a..9e7e95a 100644 (file)
@@ -64,10 +64,4 @@ char *Parser_strarg(char *inp, const char *prompt, const char *deft,
 /* Extracts an integer from a string  with a base */
 int Parser_arg2int(const char *inp, long *result, int base);
 
-/* Convert human readable size string to and int; "1k" -> 1000 */
-int Parser_size(int *sizep, char *str);
-
-/* Convert a string boolean to an int; "enable" -> 1 */
-int Parser_bool(int *b, char *str);
-
 #endif
index 6025ee6..d5d29dc 100644 (file)
 
 #include <stdio.h>
 #include <sys/types.h>
+#ifdef HAVE_NETDB_H
 #include <netdb.h>
+#endif
 #include <sys/socket.h>
+#ifdef HAVE_NETINET_TCP_H
 #include <netinet/tcp.h>
-#include <netdb.h>
+#endif
 #include <stdlib.h>
 #include <string.h>
 #include <fcntl.h>
+#include "ioctl.h"
 #include <sys/ioctl.h>
 #include <errno.h>
 #include <unistd.h>
 
 #include <netinet/in.h>
 
-#warning assuming little endian
-
-#define __cpu_to_le64(x) ((__u64)(x))
-#define __le64_to_cpu(x) ((__u64)(x))
-#define __cpu_to_le32(x) ((__u32)(x))
-#define __le32_to_cpu(x) ((__u32)(x))
-#define __cpu_to_le16(x) ((__u16)(x))
-#define __le16_to_cpu(x) ((__u16)(x))
-
 #endif /* __CYGWIN__ */
  
 #include <portals/api-support.h>
 
 unsigned int portal_debug;
 unsigned int portal_printk;
-unsigned int portal_stack;
 
 static unsigned int g_nal = 0;
 
-static int g_socket_txmem = 0;
-static int g_socket_rxmem = 0;
-static int g_socket_nonagle = 1;
-
 typedef struct
 {
         char *name;
@@ -79,13 +69,75 @@ static name2num_t nalnames[] = {
         {"tcp",                SOCKNAL},
         {"elan",       QSWNAL},
         {"gm",         GMNAL},
-        {"ib",         IBNAL},
-        {"scimac",      SCIMACNAL},
+        {"openib",      OPENIBNAL},
+        {"iib",         IIBNAL},
         {NULL,         -1}
 };
 
 static cfg_record_cb_t g_record_cb;
 
+/* Convert a string boolean to an int; "enable" -> 1 */
+int ptl_parse_bool (int *b, char *str) {
+        if (!strcasecmp (str, "no") ||
+            !strcasecmp (str, "n") ||
+            !strcasecmp (str, "off") ||
+            !strcasecmp (str, "down") ||
+            !strcasecmp (str, "disable"))
+        {
+                *b = 0;
+                return (0);
+        }
+        
+        if (!strcasecmp (str, "yes") ||
+            !strcasecmp (str, "y") ||
+            !strcasecmp (str, "on") ||
+            !strcasecmp (str, "up") ||
+            !strcasecmp (str, "enable"))
+        {
+                *b = 1;
+                return (0);
+        }
+        
+        return (-1);
+}
+
+/* Convert human readable size string to and int; "1k" -> 1000 */
+int ptl_parse_size (int *sizep, char *str) {
+        int size;
+        char mod[32];
+
+        switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) {
+        default:
+                return (-1);
+
+        case 1:
+                *sizep = size;
+                return (0);
+
+        case 2:
+                switch (*mod) {
+                case 'g':
+                case 'G':
+                        *sizep = size << 30;
+                        return (0);
+
+                case 'm':
+                case 'M':
+                        *sizep = size << 20;
+                        return (0);
+
+                case 'k':
+                case 'K':
+                        *sizep = size << 10;
+                        return (0);
+
+                default:
+                        *sizep = size;
+                        return (0);
+                }
+        }
+}
+
 int 
 ptl_set_cfg_record_cb(cfg_record_cb_t cb)
 {
@@ -158,6 +210,7 @@ nal2name (int nal)
         return ((e == NULL) ? "???" : e->name);
 }
 
+#ifdef HAVE_GETHOSTBYNAME
 static struct hostent *
 ptl_gethostbyname(char * hname) {
         struct hostent *he;
@@ -178,6 +231,7 @@ ptl_gethostbyname(char * hname) {
         }
         return he;
 }
+#endif
 
 int
 ptl_parse_port (int *port, char *str)
@@ -223,20 +277,13 @@ ptl_parse_time (time_t *t, char *str)
 }
 
 int
-ptl_parse_ipaddr (__u32 *ipaddrp, char *str)
+ptl_parse_ipquad (__u32 *ipaddrp, char *str)
 {
-        struct hostent *he;
         int             a;
         int             b;
         int             c;
         int             d;
 
-        if (!strcmp (str, "_all_")) 
-        {
-                *ipaddrp = 0;
-                return (0);
-        }
-
         if (sscanf (str, "%d.%d.%d.%d", &a, &b, &c, &d) == 4 &&
             (a & ~0xff) == 0 && (b & ~0xff) == 0 &&
             (c & ~0xff) == 0 && (d & ~0xff) == 0)
@@ -244,7 +291,27 @@ ptl_parse_ipaddr (__u32 *ipaddrp, char *str)
                 *ipaddrp = (a<<24)|(b<<16)|(c<<8)|d;
                 return (0);
         }
-        
+
+        return (-1);
+}
+
+int
+ptl_parse_ipaddr (__u32 *ipaddrp, char *str)
+{
+#ifdef HAVE_GETHOSTBYNAME
+        struct hostent *he;
+#endif
+
+        if (!strcmp (str, "_all_")) 
+        {
+                *ipaddrp = 0;
+                return (0);
+        }
+
+        if (ptl_parse_ipquad(ipaddrp, str) == 0)
+                return (0);
+
+#if HAVE_GETHOSTBYNAME        
         if ((('a' <= str[0] && str[0] <= 'z') ||
              ('A' <= str[0] && str[0] <= 'Z')) &&
              (he = ptl_gethostbyname (str)) != NULL)
@@ -254,21 +321,28 @@ ptl_parse_ipaddr (__u32 *ipaddrp, char *str)
                 *ipaddrp = ntohl(addr);         /* HOST byte order */
                 return (0);
         }
+#endif
 
         return (-1);
 }
 
 char *
-ptl_ipaddr_2_str (__u32 ipaddr, char *str)
+ptl_ipaddr_2_str (__u32 ipaddr, char *str, int lookup)
 {
+#ifdef HAVE_GETHOSTBYNAME
         __u32           net_ip;
         struct hostent *he;
-        
-        net_ip = htonl (ipaddr);
-        he = gethostbyaddr (&net_ip, sizeof (net_ip), AF_INET);
-        if (he != NULL)
-                return (he->h_name);
-        
+
+        if (lookup) {
+                net_ip = htonl (ipaddr);
+                he = gethostbyaddr (&net_ip, sizeof (net_ip), AF_INET);
+                if (he != NULL) {
+                        strcpy(str, he->h_name);
+                        return (str);
+                }
+        }
+#endif
+
         sprintf (str, "%d.%d.%d.%d",
                  (ipaddr >> 24) & 0xff, (ipaddr >> 16) & 0xff,
                  (ipaddr >> 8) & 0xff, ipaddr & 0xff);
@@ -302,22 +376,42 @@ ptl_parse_nid (ptl_nid_t *nidp, char *str)
         return (-1);
 }
 
+__u64 ptl_nid2u64(ptl_nid_t nid)
+{
+        switch (sizeof (nid)) {
+        case 8:
+                return (nid);
+        case 4:
+                return ((__u32)nid);
+        default:
+                fprintf(stderr, "Unexpected sizeof(ptl_nid_t) == %u\n", sizeof(nid));
+                abort();
+                /* notreached */
+                return (-1);
+        }
+}
+
 char *
 ptl_nid2str (char *buffer, ptl_nid_t nid)
 {
-        struct hostent *he = NULL;
+        __u64           nid64 = ptl_nid2u64(nid);
+#ifdef HAVE_GETHOSTBYNAME
+        struct hostent *he = 0;
 
         /* Don't try to resolve NIDs that are e.g. Elan host IDs.  Assume
          * TCP addresses in the 0.x.x.x subnet are not in use.  This can
          * happen on routers and slows things down a _lot_.  Bug 3442. */
         if (nid & 0xff000000) {
                 __u32 addr = htonl((__u32)nid); /* back to NETWORK byte order */
-                he = gethostbyaddr((const char *)&addr, sizeof(addr), AF_INET);
+
+                he = gethostbyaddr ((const char *)&addr, sizeof (addr), AF_INET);
         }
+
         if (he != NULL)
-                sprintf(buffer, "%#x:%s", (int)(nid >> 32), he->h_name);
+                sprintf(buffer, "%#x:%s", (int)(nid64 >> 32), he->h_name);
         else
-                sprintf(buffer, LPX64, nid);
+#endif /* HAVE_GETHOSTBYNAME */
+                sprintf(buffer, LPX64, nid64);
 
         return (buffer);
 }
@@ -441,11 +535,11 @@ int jt_ptl_network(int argc, char **argv)
         return (-1);
 }
 
-int 
-jt_ptl_print_autoconnects (int argc, char **argv)
+int
+jt_ptl_print_interfaces (int argc, char **argv)
 {
-        struct portals_cfg        pcfg;
-        char                     buffer[64];
+        struct portals_cfg       pcfg;
+        char                     buffer[3][64];
         int                      index;
         int                      rc;
 
@@ -453,99 +547,193 @@ jt_ptl_print_autoconnects (int argc, char **argv)
                 return -1;
 
         for (index = 0;;index++) {
-                PCFG_INIT (pcfg, NAL_CMD_GET_AUTOCONN);
-                pcfg.pcfg_count   = index;
+                PCFG_INIT (pcfg, NAL_CMD_GET_INTERFACE);
+                pcfg.pcfg_count = index;
 
                 rc = pcfg_ioctl (&pcfg);
                 if (rc != 0)
                         break;
 
-                printf (LPX64"@%s:%d #%d buffer %d "
-                        "nonagle %s affinity %s eager %s share %d\n",
-                        pcfg.pcfg_nid, ptl_ipaddr_2_str (pcfg.pcfg_id, buffer),
-                        pcfg.pcfg_misc, pcfg.pcfg_count, pcfg.pcfg_size, 
-                        (pcfg.pcfg_flags & 1) ? "on" : "off",
-                        (pcfg.pcfg_flags & 2) ? "on" : "off",
-                        (pcfg.pcfg_flags & 4) ? "on" : "off",
-                        pcfg.pcfg_wait);
+                printf ("%s: (%s/%s) npeer %d nroute %d\n",
+                        ptl_ipaddr_2_str(pcfg.pcfg_id, buffer[2], 1),
+                        ptl_ipaddr_2_str(pcfg.pcfg_id, buffer[0], 0),
+                        ptl_ipaddr_2_str(pcfg.pcfg_misc, buffer[1], 0),
+                        pcfg.pcfg_fd, pcfg.pcfg_count);
         }
 
         if (index == 0)
-                printf ("<no autoconnect routes>\n");
+                printf ("<no interfaces>\n");
         return 0;
 }
 
-int 
-jt_ptl_add_autoconnect (int argc, char **argv)
+int
+jt_ptl_add_interface (int argc, char **argv)
 {
-        struct portals_cfg        pcfg;
-        ptl_nid_t                nid;
-        __u32                    ip;
-        int                      port;
-        int                      irq_affinity = 0;
-        int                      share = 0;
-        int                      eager = 0;
+        struct portals_cfg       pcfg;
+        __u32                    ipaddr;
         int                      rc;
+        __u32                    netmask = 0xffffff00;
+        int                      i;
+        int                      count;
+        char                    *end;
 
-        if (argc < 4 || argc > 5) {
-                fprintf (stderr, "usage: %s nid ipaddr port [ise]\n", argv[0]);
+        if (argc < 2 || argc > 3) {
+                fprintf (stderr, "usage: %s ipaddr [netmask]\n", argv[0]);
                 return 0;
         }
 
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, 0))
+        if (!g_nal_is_compatible(argv[0], SOCKNAL, 0))
                 return -1;
 
-        if (ptl_parse_nid (&nid, argv[1]) != 0 ||
-                nid == PTL_NID_ANY) {
-                fprintf (stderr, "Can't parse NID: %s\n", argv[1]);
+        if (ptl_parse_ipaddr(&ipaddr, argv[1]) != 0) {
+                fprintf (stderr, "Can't parse ip: %s\n", argv[1]);
                 return -1;
         }
 
-        if (ptl_parse_ipaddr (&ip, argv[2]) != 0) {
-                fprintf (stderr, "Can't parse ip addr: %s\n", argv[2]);
+        if (argc > 2 ) {
+                count = strtol(argv[2], &end, 0);
+                if (count > 0 && count < 32 && *end == 0) {
+                        netmask = 0;
+                        for (i = count; i > 0; i--)
+                                netmask = netmask|(1<<(32-i));
+                } else if (ptl_parse_ipquad(&netmask, argv[2]) != 0) {
+                        fprintf (stderr, "Can't parse netmask: %s\n", argv[2]);
+                        return -1;
+                }
+        }
+
+        PCFG_INIT(pcfg, NAL_CMD_ADD_INTERFACE);
+        pcfg.pcfg_id     = ipaddr;
+        pcfg.pcfg_misc   = netmask;
+
+        rc = pcfg_ioctl (&pcfg);
+        if (rc != 0) {
+                fprintf (stderr, "failed to add interface: %s\n",
+                         strerror (errno));
                 return -1;
         }
 
-        if (ptl_parse_port (&port, argv[3]) != 0) {
-                fprintf (stderr, "Can't parse port: %s\n", argv[3]);
+        return 0;
+}
+
+int
+jt_ptl_del_interface (int argc, char **argv)
+{
+        struct portals_cfg       pcfg;
+        int                      rc;
+        __u32                    ipaddr = 0;
+
+        if (argc > 2) {
+                fprintf (stderr, "usage: %s [ipaddr]\n", argv[0]);
+                return 0;
+        }
+
+        if (!g_nal_is_compatible(argv[0], SOCKNAL, 0))
+                return -1;
+
+        if (argc == 2 &&
+            ptl_parse_ipaddr(&ipaddr, argv[1]) != 0) {
+                fprintf (stderr, "Can't parse ip: %s\n", argv[1]);
                 return -1;
         }
+        
+        PCFG_INIT(pcfg, NAL_CMD_DEL_INTERFACE);
+        pcfg.pcfg_id = ipaddr;
 
-        if (argc > 4) {
-                char *opts = argv[4];
-                
-                while (*opts != 0)
-                        switch (*opts++) {
-                        case 'i':
-                                irq_affinity = 1;
-                                break;
-                        case 's':
-                                share = 1;
-                                break;
-                        case 'e':
-                                eager = 1;
-                                break;
-                        default:
-                                fprintf (stderr, "Can't parse options: %s\n",
-                                         argv[4]);
-                                return -1;
-                        }
+        rc = pcfg_ioctl (&pcfg);
+        if (rc != 0) {
+                fprintf (stderr, "failed to delete interface: %s\n",
+                         strerror (errno));
+                return -1;
         }
 
-        PCFG_INIT(pcfg, NAL_CMD_ADD_AUTOCONN);
+        return 0;
+}
+
+int
+jt_ptl_print_peers (int argc, char **argv)
+{
+        struct portals_cfg       pcfg;
+        char                     buffer[2][64];
+        int                      index;
+        int                      rc;
+
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0))
+                return -1;
+
+        for (index = 0;;index++) {
+                PCFG_INIT (pcfg, NAL_CMD_GET_PEER);
+                pcfg.pcfg_count   = index;
+
+                rc = pcfg_ioctl (&pcfg);
+                if (rc != 0)
+                        break;
+
+                if (g_nal_is_compatible(NULL, SOCKNAL, 0))
+                        printf (LPX64"[%d]%s@%s:%d #%d\n",
+                                pcfg.pcfg_nid, pcfg.pcfg_wait,
+                                ptl_ipaddr_2_str (pcfg.pcfg_size, buffer[0], 1),
+                                ptl_ipaddr_2_str (pcfg.pcfg_id, buffer[1], 1),
+                                pcfg.pcfg_misc, pcfg.pcfg_count);
+                else
+                        printf (LPX64"[%d]\n",
+                                pcfg.pcfg_nid, pcfg.pcfg_wait);
+        }
+
+        if (index == 0)
+                printf ("<no peers>\n");
+        return 0;
+}
+
+int 
+jt_ptl_add_peer (int argc, char **argv)
+{
+        struct portals_cfg       pcfg;
+        ptl_nid_t                nid;
+        __u32                    ip = 0;
+        int                      port = 0;
+        int                      rc;
+
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0))
+                return -1;
+
+        if (g_nal_is_compatible(NULL, SOCKNAL, 0)) {
+                if (argc != 4) {
+                        fprintf (stderr, "usage(tcp): %s nid ipaddr port\n", 
+                                 argv[0]);
+                        return 0;
+                }
+        } else if (argc != 2) {
+                fprintf (stderr, "usage(openib,iib): %s nid\n", argv[0]);
+                return 0;
+        }
+
+        if (ptl_parse_nid (&nid, argv[1]) != 0 ||
+                nid == PTL_NID_ANY) {
+                fprintf (stderr, "Can't parse NID: %s\n", argv[1]);
+                return -1;
+        }
+
+        if (g_nal_is_compatible (NULL, SOCKNAL, 0)) {
+                if (ptl_parse_ipaddr (&ip, argv[2]) != 0) {
+                        fprintf (stderr, "Can't parse ip addr: %s\n", argv[2]);
+                        return -1;
+                }
+
+                if (ptl_parse_port (&port, argv[3]) != 0) {
+                        fprintf (stderr, "Can't parse port: %s\n", argv[3]);
+                        return -1;
+                }
+        }
+
+        PCFG_INIT(pcfg, NAL_CMD_ADD_PEER);
         pcfg.pcfg_nid     = nid;
         pcfg.pcfg_id      = ip;
         pcfg.pcfg_misc    = port;
-        /* only passing one buffer size! */
-        pcfg.pcfg_size    = MAX (g_socket_rxmem, g_socket_txmem);
-        pcfg.pcfg_flags   = (g_socket_nonagle ? 0x01 : 0) |
-                            (irq_affinity     ? 0x02 : 0) |
-                            (share            ? 0x04 : 0) |
-                            (eager            ? 0x08 : 0);
 
         rc = pcfg_ioctl (&pcfg);
         if (rc != 0) {
-                fprintf (stderr, "failed to enable autoconnect: %s\n",
+                fprintf (stderr, "failed to add peer: %s\n",
                          strerror (errno));
                 return -1;
         }
@@ -554,63 +742,63 @@ jt_ptl_add_autoconnect (int argc, char **argv)
 }
 
 int 
-jt_ptl_del_autoconnect (int argc, char **argv)
+jt_ptl_del_peer (int argc, char **argv)
 {
         struct portals_cfg       pcfg;
         ptl_nid_t                nid = PTL_NID_ANY;
-        __u32                    ip  = 0;
-        int                      share = 0;
-        int                      keep_conn = 0;
+        __u32                    ip = 0;
+        int                      single_share = 0;
+        int                      argidx;
         int                      rc;
 
-        if (argc > 4) {
-                fprintf (stderr, "usage: %s [nid] [ipaddr] [sk]\n",
-                         argv[0]);
-                return 0;
-        }
-
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, 0))
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0))
                 return -1;
 
+        if (g_nal_is_compatible(NULL, SOCKNAL, 0)) {
+                if (argc > 4) {
+                        fprintf (stderr, "usage: %s [nid] [ipaddr] [single_share]\n",
+                                 argv[0]);
+                        return 0;
+                }
+        } else if (argc > 3) {
+                fprintf (stderr, "usage: %s [nid] [single_share]\n", argv[0]);
+                return 0;
+        }
+                
         if (argc > 1 &&
             ptl_parse_nid (&nid, argv[1]) != 0) {
                 fprintf (stderr, "Can't parse nid: %s\n", argv[1]);
                 return -1;
         }
 
-        if (argc > 2 &&
-            ptl_parse_ipaddr (&ip, argv[2]) != 0) {
-                fprintf (stderr, "Can't parse ip addr: %s\n", argv[2]);
-                return -1;
+        argidx = 2;
+        if (g_nal_is_compatible(NULL, SOCKNAL, 0)) {
+                if (argc > argidx &&
+                    ptl_parse_ipaddr (&ip, argv[argidx]) != 0) {
+                        fprintf (stderr, "Can't parse ip addr: %s\n",
+                                 argv[argidx]);
+                        return -1;
+                }
+                argidx++;
         }
-
-        if (argc > 3) {
-                char *opts = argv[3];
-                
-                while (*opts != 0)
-                        switch (*opts++) {
-                        case 's':
-                                share = 1;
-                                break;
-                        case 'k':
-                                keep_conn = 1;
-                                break;
-                        default:
-                                fprintf (stderr, "Can't parse flags: %s\n", 
-                                         argv[3]);
-                                return -1;
-                        }
+        
+        if (argc > argidx) {
+                if (!strcmp (argv[argidx], "single_share")) {
+                        single_share = 1;
+                } else {
+                        fprintf (stderr, "Unrecognised arg %s'\n", argv[3]);
+                        return -1;
+                }
         }
 
-        PCFG_INIT(pcfg, NAL_CMD_DEL_AUTOCONN);
-        pcfg.pcfg_nid     = nid;
-        pcfg.pcfg_id      = ip;
-        pcfg.pcfg_flags   = (share     ? 1 : 0) |
-                           (keep_conn ? 2 : 0);
+        PCFG_INIT(pcfg, NAL_CMD_DEL_PEER);
+        pcfg.pcfg_nid = nid;
+        pcfg.pcfg_id = ip;
+        pcfg.pcfg_flags = single_share;
 
         rc = pcfg_ioctl (&pcfg);
         if (rc != 0) {
-                fprintf (stderr, "failed to remove autoconnect route: %s\n",
+                fprintf (stderr, "failed to remove peer: %s\n",
                          strerror (errno));
                 return -1;
         }
@@ -622,11 +810,11 @@ int
 jt_ptl_print_connections (int argc, char **argv)
 {
         struct portals_cfg       pcfg;
-        char                     buffer[64];
+        char                     buffer[2][64];
         int                      index;
         int                      rc;
 
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, 0))
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0))
                 return -1;
 
         for (index = 0;;index++) {
@@ -637,14 +825,23 @@ jt_ptl_print_connections (int argc, char **argv)
                 if (rc != 0)
                         break;
 
-                printf (LPX64"@%s:%d:%s\n",
-                        pcfg.pcfg_nid, 
-                        ptl_ipaddr_2_str (pcfg.pcfg_id, buffer),
-                        pcfg.pcfg_misc,
-                        (pcfg.pcfg_flags == SOCKNAL_CONN_ANY) ? "A" :
-                        (pcfg.pcfg_flags == SOCKNAL_CONN_CONTROL) ? "C" :
-                        (pcfg.pcfg_flags == SOCKNAL_CONN_BULK_IN) ? "I" :
-                        (pcfg.pcfg_flags == SOCKNAL_CONN_BULK_OUT) ? "O" : "?");
+                if (g_nal_is_compatible (NULL, SOCKNAL, 0))
+                        printf ("[%d]%s:"LPX64"@%s:%d:%s %d/%d %s\n",
+                                pcfg.pcfg_gw_nal,       /* scheduler */
+                                ptl_ipaddr_2_str (pcfg.pcfg_fd, buffer[0], 1), /* local IP addr */
+                                pcfg.pcfg_nid, 
+                                ptl_ipaddr_2_str (pcfg.pcfg_id, buffer[1], 1), /* remote IP addr */
+                                pcfg.pcfg_misc,         /* remote port */
+                                (pcfg.pcfg_flags == SOCKNAL_CONN_ANY) ? "A" :
+                                (pcfg.pcfg_flags == SOCKNAL_CONN_CONTROL) ? "C" :
+                                (pcfg.pcfg_flags == SOCKNAL_CONN_BULK_IN) ? "I" :
+                                (pcfg.pcfg_flags == SOCKNAL_CONN_BULK_OUT) ? "O" : "?",
+                                pcfg.pcfg_count,        /* tx buffer size */
+                                pcfg.pcfg_size,         /* rx buffer size */
+                                pcfg.pcfg_wait ? "nagle" : "nonagle");
+                else
+                        printf (LPX64"\n",
+                                pcfg.pcfg_nid);
         }
 
         if (index == 0)
@@ -654,23 +851,22 @@ jt_ptl_print_connections (int argc, char **argv)
 
 int jt_ptl_connect(int argc, char **argv)
 {
+#ifndef HAVE_CONNECT
+        /* no connect() support */
+        return -1;
+#else /* HAVE_CONNECT */
         struct portals_cfg pcfg;
         struct sockaddr_in srvaddr;
         struct sockaddr_in locaddr;
         __u32 ipaddr;
         char *flag;
         int fd, rc;
-        int nonagle = 0;
-        int rxmem = 0;
-        int txmem = 0;
-        int bind_irq = 0;
         int type = SOCKNAL_CONN_ANY;
         int port, rport;
         int o;
-        int olen;
 
         if (argc < 3) {
-                fprintf(stderr, "usage: %s ip port [xibctr]\n", argv[0]);
+                fprintf(stderr, "usage: %s ip port [type]\n", argv[0]);
                 return 0;
         }
 
@@ -692,10 +888,6 @@ int jt_ptl_connect(int argc, char **argv)
                 for (flag = argv[3]; *flag != 0; flag++)
                         switch (*flag)
                         {
-                        case 'i':
-                                bind_irq = 1;
-                                break;
-                                
                         case 'I':
                                 if (type != SOCKNAL_CONN_ANY) {
                                         fprintf(stderr, "Can't flag type twice\n");
@@ -726,8 +918,8 @@ int jt_ptl_connect(int argc, char **argv)
                                 return (-1);
                         }
 
-        memset(&locaddr, 0, sizeof(locaddr));
-        locaddr.sin_family = AF_INET;
+        memset(&locaddr, 0, sizeof(locaddr)); 
+        locaddr.sin_family = AF_INET; 
         locaddr.sin_addr.s_addr = INADDR_ANY;
 
         memset(&srvaddr, 0, sizeof(srvaddr));
@@ -735,6 +927,7 @@ int jt_ptl_connect(int argc, char **argv)
         srvaddr.sin_port = htons(port);
         srvaddr.sin_addr.s_addr = htonl(ipaddr);
 
+
         for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) {
                 fd = socket(PF_INET, SOCK_STREAM, 0); 
                 if ( fd < 0 ) { 
@@ -745,35 +938,6 @@ int jt_ptl_connect(int argc, char **argv)
                 o = 1;
                 rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, 
                                 &o, sizeof(o));
-
-                if (g_socket_nonagle) {
-                        o = 1;
-                        rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o));
-                        if (rc != 0) {
-                                fprintf(stderr, "cannot disable nagle: %s\n",
-                                        strerror(errno));
-                                return (-1);
-                        }
-                }
-                
-                if (g_socket_rxmem != 0) {
-                        o = g_socket_rxmem;
-                        rc = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &o, sizeof (o));
-                        if (rc != 0) {
-                                fprintf(stderr, "cannot set receive buffer size: %s\n",
-                                        strerror(errno));
-                                return (-1);
-                        }
-                }
-                
-                if (g_socket_txmem != 0) {
-                        o = g_socket_txmem;
-                        rc = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &o, sizeof (o));
-                        if (rc != 0) {
-                                fprintf(stderr, "cannot set send buffer size: %s\n", strerror(errno));
-                                return (-1);
-                        }
-                }
                 
                 locaddr.sin_port = htons(rport);
                 rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr)); 
@@ -799,18 +963,8 @@ int jt_ptl_connect(int argc, char **argv)
                 return -1;
         }
 
-        olen = sizeof (txmem);
-        if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &olen) != 0)
-                fprintf (stderr, "Can't get send buffer size: %s\n", strerror (errno));
-        olen = sizeof (rxmem);
-        if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &olen) != 0)
-                fprintf (stderr, "Can't get receive buffer size: %s\n", strerror (errno));
-        olen = sizeof (nonagle);
-        if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &olen) != 0)
-                fprintf (stderr, "Can't get nagle: %s\n", strerror (errno));
-
-        printf("Connected host: %s snd: %d rcv: %d nagle: %s type: %s\n", 
-               argv[1], txmem, rxmem, nonagle ? "Disabled" : "Enabled",
+        printf("Connected host: %s type: %s\n", 
+               argv[1],
                (type == SOCKNAL_CONN_ANY) ? "A" :
                (type == SOCKNAL_CONN_CONTROL) ? "C" :
                (type == SOCKNAL_CONN_BULK_IN) ? "I" :
@@ -819,7 +973,6 @@ int jt_ptl_connect(int argc, char **argv)
         PCFG_INIT(pcfg, NAL_CMD_REGISTER_PEER_FD);
         pcfg.pcfg_nal = g_nal;
         pcfg.pcfg_fd = fd;
-        pcfg.pcfg_flags = bind_irq;
         pcfg.pcfg_misc = type;
         
         rc = pcfg_ioctl(&pcfg);
@@ -837,11 +990,12 @@ int jt_ptl_connect(int argc, char **argv)
                 fprintf(stderr, "close failed: %d\n", rc);
 
         return 0;
+#endif /* HAVE_CONNECT */
 }
 
 int jt_ptl_disconnect(int argc, char **argv)
 {
-        struct portals_cfg        pcfg;
+        struct portals_cfg       pcfg;
         ptl_nid_t                nid = PTL_NID_ANY;
         __u32                    ipaddr = 0;
         int                      rc;
@@ -851,7 +1005,7 @@ int jt_ptl_disconnect(int argc, char **argv)
                 return 0;
         }
 
-        if (!g_nal_is_compatible (NULL, SOCKNAL, 0))
+        if (!g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, IIBNAL, 0))
                 return 0;
 
         if (argc >= 2 &&
@@ -860,7 +1014,8 @@ int jt_ptl_disconnect(int argc, char **argv)
                 return -1;
         }
 
-        if (argc >= 3 &&
+        if (g_nal_is_compatible (NULL, SOCKNAL, 0) &&
+            argc >= 3 &&
             ptl_parse_ipaddr (&ipaddr, argv[2]) != 0) {
                 fprintf (stderr, "Can't parse ip addr %s\n", argv[2]);
                 return -1;
@@ -882,7 +1037,7 @@ int jt_ptl_disconnect(int argc, char **argv)
 
 int jt_ptl_push_connection (int argc, char **argv)
 {
-        struct portals_cfg        pcfg;
+        struct portals_cfg       pcfg;
         int                      rc;
         ptl_nid_t                nid = PTL_NID_ANY;
         __u32                    ipaddr = 0;
@@ -923,7 +1078,7 @@ int jt_ptl_push_connection (int argc, char **argv)
 int 
 jt_ptl_print_active_txs (int argc, char **argv)
 {
-        struct portals_cfg        pcfg;
+        struct portals_cfg       pcfg;
         int                      index;
         int                      rc;
 
@@ -1045,7 +1200,7 @@ int jt_ptl_mynid(int argc, char **argv)
         char *nidstr;
         struct portals_cfg pcfg;
         ptl_nid_t mynid;
-        
+
         if (argc > 2) {
                 fprintf(stderr, "usage: %s [NID]\n", argv[0]);
                 fprintf(stderr, "NID defaults to the primary IP address of the machine.\n");
@@ -1079,7 +1234,8 @@ int jt_ptl_mynid(int argc, char **argv)
                 fprintf(stderr, "setting my NID failed: %s\n",
                        strerror(errno));
         else
-                printf("registered my nid "LPX64" (%s)\n", mynid, hostname);
+                printf("registered my nid "LPX64" (%s)\n", 
+                       ptl_nid2u64(mynid), hostname);
         return 0;
 }
 
@@ -1131,61 +1287,6 @@ jt_ptl_fail_nid (int argc, char **argv)
 }
 
 int
-jt_ptl_rxmem (int argc, char **argv)
-{
-        int   size;
-        
-        if (argc > 1)
-        {
-                if (Parser_size (&size, argv[1]) != 0 || size < 0)
-                {
-                        fprintf (stderr, "Can't parse size %s\n", argv[1]);
-                        return (0);
-                }
-
-                g_socket_rxmem = size;
-        }
-        printf ("Socket rmem = %d\n", g_socket_rxmem);        
-        return (0);
-}
-
-int
-jt_ptl_txmem (int argc, char **argv)
-{
-        int   size;
-        
-        if (argc > 1)
-        {
-                if (Parser_size (&size, argv[1]) != 0 || size < 0)
-                {
-                        fprintf (stderr, "Can't parse size %s\n", argv[1]);
-                        return (0);
-                }
-                g_socket_txmem = size;
-        }
-        printf ("Socket txmem = %d\n", g_socket_txmem);
-        return (0);
-}
-
-int
-jt_ptl_nagle (int argc, char **argv)
-{
-        int enable;
-
-        if (argc > 1)
-        {
-                if (Parser_bool (&enable, argv[1]) != 0)
-                {
-                        fprintf (stderr, "Can't parse boolean %s\n", argv[1]);
-                        return (-1);
-                }
-                g_socket_nonagle = !enable;
-        }
-        printf ("Nagle %s\n", g_socket_nonagle ? "disabled" : "enabled");
-        return (0);
-}
-
-int
 jt_ptl_add_route (int argc, char **argv)
 {
         struct portals_cfg       pcfg;
@@ -1297,7 +1398,8 @@ jt_ptl_del_route (int argc, char **argv)
         rc = pcfg_ioctl(&pcfg);
         if (rc != 0) 
         {
-                fprintf (stderr, "NAL_CMD_DEL_ROUTE ("LPX64") failed: %s\n", nid, strerror (errno));
+                fprintf (stderr, "NAL_CMD_DEL_ROUTE ("LPX64") failed: %s\n", 
+                         ptl_nid2u64(nid), strerror (errno));
                 return (-1);
         }
         
@@ -1327,7 +1429,7 @@ jt_ptl_notify_router (int argc, char **argv)
                 return (-1);
         }
 
-        if (Parser_bool (&enable, argv[2]) != 0) {
+        if (ptl_parse_bool (&enable, argv[2]) != 0) {
                 fprintf (stderr, "Can't parse boolean %s\n", argv[2]);
                 return (-1);
         }
@@ -1359,7 +1461,7 @@ jt_ptl_notify_router (int argc, char **argv)
         if (rc != 0) 
         {
                 fprintf (stderr, "NAL_CMD_NOTIFY_ROUTER ("LPX64") failed: %s\n",
-                         nid, strerror (errno));
+                         ptl_nid2u64(nid), strerror (errno));
                 return (-1);
         }
         
@@ -1442,9 +1544,19 @@ lwt_snapshot(cycles_t *now, int *ncpu, int *totalsize,
                 return (-1);
         }
 
+        /* crappy overloads */
+        if (data.ioc_nid2 != sizeof(lwt_event_t) ||
+            data.ioc_nid3 != offsetof(lwt_event_t, lwte_where)) {
+                fprintf(stderr,"kernel/user LWT event mismatch %d(%d),%d(%d)\n",
+                        (int)data.ioc_nid2, sizeof(lwt_event_t),
+                        (int)data.ioc_nid3,
+                        (int)offsetof(lwt_event_t, lwte_where));
+                return (-1);
+        }
+
         LASSERT (data.ioc_count != 0);
         LASSERT (data.ioc_misc != 0);
-        
+
         if (now != NULL)
                 *now = data.ioc_nid;
 
@@ -1515,15 +1627,21 @@ lwt_put_string(char *ustr)
 static int
 lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t *e)
 {
-        char            whenstr[32];
+#ifndef __WORDSIZE
+# error "__WORDSIZE not defined"
+#elif __WORDSIZE == 32
+# define XFMT "%#010lx"
+#elif __WORDSIZE== 64
+# define XFMT "%#018lx"
+#else
+# error "Unexpected __WORDSIZE"
+#endif
         char           *where = lwt_get_string(e->lwte_where);
 
         if (where == NULL)
                 return (-1);
 
-        sprintf(whenstr, LPD64, e->lwte_when - t0);
-
-        fprintf(f, "%#010lx %#010lx %#010lx %#010lx: %#010lx %1d %10.6f %10.2f %s\n",
+        fprintf(f, XFMT" "XFMT" "XFMT" "XFMT": "XFMT" %2d %10.6f %10.2f %s\n",
                 e->lwte_p1, e->lwte_p2, e->lwte_p3, e->lwte_p4,
                 (long)e->lwte_task, cpu, (e->lwte_when - t0) / (mhz * 1000000.0),
                 (t0 == e->lwte_when) ? 0.0 : (e->lwte_when - tlast) / mhz,
@@ -1532,6 +1650,7 @@ lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t
         lwt_put_string(where);
 
         return (0);
+#undef XFMT
 }
 
 double
@@ -1557,13 +1676,14 @@ get_cycles_per_usec ()
 int
 jt_ptl_lwt(int argc, char **argv)
 {
+        const int       lwt_max_cpus = 32;
         int             ncpus;
         int             totalspace;
         int             nevents_per_cpu;
         lwt_event_t    *events;
-        lwt_event_t    *cpu_event[LWT_MAX_CPUS + 1];
-        lwt_event_t    *next_event[LWT_MAX_CPUS];
-        lwt_event_t    *first_event[LWT_MAX_CPUS];
+        lwt_event_t    *cpu_event[lwt_max_cpus + 1];
+        lwt_event_t    *next_event[lwt_max_cpus];
+        lwt_event_t    *first_event[lwt_max_cpus];
         int             cpu;
         lwt_event_t    *e;
         int             rc;
@@ -1574,6 +1694,7 @@ jt_ptl_lwt(int argc, char **argv)
         cycles_t        tnow;
         struct timeval  tvnow;
         int             printed_date = 0;
+        int             nlines = 0;
         FILE           *f = stdout;
 
         if (argc < 2 ||
@@ -1604,9 +1725,9 @@ jt_ptl_lwt(int argc, char **argv)
         if (lwt_snapshot(NULL, &ncpus, &totalspace, NULL, 0) != 0)
                 return (-1);
 
-        if (ncpus > LWT_MAX_CPUS) {
+        if (ncpus > lwt_max_cpus) {
                 fprintf(stderr, "Too many cpus: %d (%d)\n", 
-                        ncpus, LWT_MAX_CPUS);
+                        ncpus, lwt_max_cpus);
                 return (-1);
         }
 
@@ -1723,6 +1844,12 @@ jt_ptl_lwt(int argc, char **argv)
                         rc = lwt_print(f, t0, tlast, mhz, cpu, next_event[cpu]);
                         if (rc != 0)
                                 break;
+
+                        if (++nlines % 10000 == 0 && f != stdout) {
+                                /* show some activity... */
+                                printf(".");
+                                fflush (stdout);
+                        }
                 }
 
                 tlast = next_event[cpu]->lwte_when;
@@ -1736,8 +1863,10 @@ jt_ptl_lwt(int argc, char **argv)
                         next_event[cpu] = NULL;
         }
 
-        if (f != stdout)
+        if (f != stdout) {
+                printf("\n");
                 fclose(f);
+        }
 
         free(events);
         return (0);
index c65ecb2..03cfe77 100644 (file)
 
 command_t list[] = {
         {"network", jt_ptl_network, 0,"setup the NAL (args: nal name)"},
-        {"print_autoconns", jt_ptl_print_autoconnects, 0, "print autoconnect entries (no args)"},
-        {"add_autoconn", jt_ptl_add_autoconnect, 0, "add autoconnect entry (args: nid host [ise])"},
-        {"del_autoconn", jt_ptl_del_autoconnect, 0, "delete autoconnect entry (args: [nid] [host] [ks])"},
+        {"print_interfaces", jt_ptl_print_interfaces, 0, "print interface entries (no args)"},
+        {"add_interface", jt_ptl_add_interface, 0, "add interface entry (args: ip [netmask])"},
+        {"del_interface", jt_ptl_del_interface, 0, "delete interface entries (args: [ip])"},
+        {"print_peers", jt_ptl_print_peers, 0, "print peer entries (no args)"},
+        {"add_peer", jt_ptl_add_peer, 0, "add peer entry (args: nid host port)"},
+        {"del_peer", jt_ptl_del_peer, 0, "delete peer entry (args: [nid] [host])"},
         {"print_conns", jt_ptl_print_connections, 0, "print connections (no args)"},
         {"connect", jt_ptl_connect, 0, "connect to a remote nid (args: host port [iIOC])"},
         {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid (args: [nid] [host]"},
@@ -48,9 +51,6 @@ command_t list[] = {
         {"set_route", jt_ptl_notify_router, 0, 
          "enable/disable a route in the routing table (args: gatewayNID up/down [time]"},
         {"print_routes", jt_ptl_print_routes, 0, "print the routing table (args: none)"},
-        {"recv_mem", jt_ptl_rxmem, 0, "Set socket receive buffer size (args: [size])"},
-        {"send_mem", jt_ptl_txmem, 0, "Set socket send buffer size (args: [size])"},
-        {"nagle", jt_ptl_nagle, 0, "Enable/Disable Nagle (args: [on/off])"},
         {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"},
         {"fail", jt_ptl_fail_nid, 0, "usage: fail nid|_all_ [count]"},
         {"help", Parser_help, 0, "help"},
index a73a521..6316290 100644 (file)
@@ -34,7 +34,7 @@ do {                                                    \
 
 #define CHECK_MEMBER_OFFSET(s,m)                \
 do {                                            \
-        CHECK_VALUE(offsetof(s, m));            \
+        CHECK_VALUE((int)offsetof(s, m));       \
 } while (0)
 
 #define CHECK_MEMBER_SIZEOF(s,m)                \
index abc403b..946aa0c 100644 (file)
@@ -12,7 +12,7 @@ ldlm_objs += $(LDLM)ldlm_flock.o
 ptlrpc_objs := client.o recover.o connection.o niobuf.o pack_generic.o
 ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o recov_thread.o
 ptlrpc_objs += llog_net.o llog_client.o llog_server.o import.o ptlrpcd.o
-ptlrpc_objs += lproc_ptlrpc.o
+ptlrpc_objs += pers.o lproc_ptlrpc.o
 
 ptlrpc-objs := $(ldlm_objs) $(ptlrpc_objs)
 default: all
index 8321e73..b82c5ce 100644 (file)
@@ -92,9 +92,9 @@ static inline struct ptlrpc_bulk_desc *new_bulk(int npages, int type, int portal
 
         spin_lock_init(&desc->bd_lock);
         init_waitqueue_head(&desc->bd_waitq);
-        desc->bd_max_pages = npages;
-        desc->bd_page_count = 0;
-        desc->bd_md_h = PTL_HANDLE_NONE;
+        desc->bd_max_iov = npages;
+        desc->bd_iov_count = 0;
+        desc->bd_md_h = PTL_INVALID_HANDLE;
         desc->bd_portal = portal;
         desc->bd_type = type;
         
@@ -152,27 +152,15 @@ struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp (struct ptlrpc_request *req,
 void ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
                            struct page *page, int pageoffset, int len)
 {
-#ifdef __KERNEL__
-        ptl_kiov_t *kiov = &desc->bd_iov[desc->bd_page_count];
-#else
-        struct iovec *iov = &desc->bd_iov[desc->bd_page_count];
-#endif
-        LASSERT(desc->bd_page_count < desc->bd_max_pages);
+        LASSERT(desc->bd_iov_count < desc->bd_max_iov);
         LASSERT(page != NULL);
         LASSERT(pageoffset >= 0);
         LASSERT(len > 0);
         LASSERT(pageoffset + len <= PAGE_SIZE);
 
-#ifdef __KERNEL__
-        kiov->kiov_page   = page;
-        kiov->kiov_offset = pageoffset;
-        kiov->kiov_len    = len;
-#else
-        iov->iov_base = page->addr + pageoffset;
-        iov->iov_len  = len;
-#endif
-        desc->bd_page_count++;
         desc->bd_nob += len;
+
+        ptlrpc_add_bulk_page(desc, page, pageoffset, len);
 }
 
 void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc)
@@ -180,7 +168,7 @@ void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc)
         ENTRY;
 
         LASSERT(desc != NULL);
-        LASSERT(desc->bd_page_count != LI_POISON); /* not freed already */
+        LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */
         LASSERT(!desc->bd_network_rw);         /* network hands off or */
         LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL));
         if (desc->bd_export)
@@ -188,8 +176,8 @@ void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc)
         else
                 class_import_put(desc->bd_import);
 
-        OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc,
-                                bd_iov[desc->bd_max_pages]));
+        OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc, 
+                                bd_iov[desc->bd_max_iov]));
         EXIT;
 }
 
@@ -535,6 +523,7 @@ static int after_reply(struct ptlrpc_request *req)
 
 static int ptlrpc_send_new_req(struct ptlrpc_request *req)
 {
+        char                   str[PTL_NALFMT_SIZE];
         struct obd_import     *imp;
         unsigned long          flags;
         int rc;
@@ -579,11 +568,11 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req)
 
         req->rq_reqmsg->status = current->pid;
         CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:ni:nid:opc"
-               " %s:%s:%d:"LPU64":%s:"LPX64":%d\n", current->comm,
+               " %s:%s:%d:"LPU64":%s:%s:%d\n", current->comm,
                imp->imp_obd->obd_uuid.uuid, req->rq_reqmsg->status,
                req->rq_xid,
                imp->imp_connection->c_peer.peer_ni->pni_name,
-               imp->imp_connection->c_peer.peer_nid,
+               ptlrpc_peernid2str(&imp->imp_connection->c_peer, str),
                req->rq_reqmsg->opc);
 
         rc = ptl_send_rpc(req);
@@ -597,6 +586,7 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req)
 
 int ptlrpc_check_set(struct ptlrpc_request_set *set)
 {
+        char str[PTL_NALFMT_SIZE];
         unsigned long flags;
         struct list_head *tmp;
         int force_timer_recalc = 0;
@@ -797,11 +787,11 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set)
                 }
 
                 CDEBUG(D_RPCTRACE, "Completed RPC pname:cluuid:pid:xid:ni:nid:"
-                       "opc %s:%s:%d:"LPU64":%s:"LPX64":%d\n", current->comm,
+                       "opc %s:%s:%d:"LPU64":%s:%s:%d\n", current->comm,
                        imp->imp_obd->obd_uuid.uuid, req->rq_reqmsg->status,
                        req->rq_xid,
                        imp->imp_connection->c_peer.peer_ni->pni_name,
-                       imp->imp_connection->c_peer.peer_nid,
+                       ptlrpc_peernid2str(&imp->imp_connection->c_peer, str),
                        req->rq_reqmsg->opc);
 
                 set->set_remaining--;
@@ -1123,13 +1113,10 @@ void ptlrpc_unregister_reply (struct ptlrpc_request *request)
         if (!ptlrpc_client_receiving_reply(request))
                 return;
 
-        rc = PtlMDUnlink (request->rq_reply_md_h);
-        if (rc == PTL_INV_MD) {
-                LASSERT (!ptlrpc_client_receiving_reply(request));
-                return;
-        }
-        
-        LASSERT (rc == PTL_OK);
+        PtlMDUnlink (request->rq_reply_md_h);
+
+        /* We have to l_wait_event() whatever the result, to give liblustre
+         * a chance to run reply_in_callback() */
 
         if (request->rq_set != NULL)
                 wq = &request->rq_set->set_waitq;
@@ -1320,6 +1307,7 @@ void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
 
 int ptlrpc_queue_wait(struct ptlrpc_request *req)
 {
+        char str[PTL_NALFMT_SIZE];
         int rc = 0;
         int brc;
         struct l_wait_info lwi;
@@ -1336,11 +1324,11 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
         req->rq_reqmsg->status = current->pid;
         LASSERT(imp->imp_obd != NULL);
         CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:ni:nid:opc "
-               "%s:%s:%d:"LPU64":%s:"LPX64":%d\n", current->comm,
+               "%s:%s:%d:"LPU64":%s:%s:%d\n", current->comm,
                imp->imp_obd->obd_uuid.uuid,
                req->rq_reqmsg->status, req->rq_xid,
                imp->imp_connection->c_peer.peer_ni->pni_name,
-               imp->imp_connection->c_peer.peer_nid,
+               ptlrpc_peernid2str(&imp->imp_connection->c_peer, str),
                req->rq_reqmsg->opc);
 
         /* Mark phase here for a little debug help */
@@ -1423,11 +1411,11 @@ restart:
         DEBUG_REQ(D_NET, req, "-- done sleeping");
 
         CDEBUG(D_RPCTRACE, "Completed RPC pname:cluuid:pid:xid:ni:nid:opc "
-               "%s:%s:%d:"LPU64":%s:"LPX64":%d\n", current->comm,
+               "%s:%s:%d:"LPU64":%s:%s:%d\n", current->comm,
                imp->imp_obd->obd_uuid.uuid,
                req->rq_reqmsg->status, req->rq_xid,
                imp->imp_connection->c_peer.peer_ni->pni_name,
-               imp->imp_connection->c_peer.peer_nid,
+               ptlrpc_peernid2str(&imp->imp_connection->c_peer, str),
                req->rq_reqmsg->opc);
 
         spin_lock_irqsave(&imp->imp_lock, flags);
index c6a4163..c2c5288 100644 (file)
@@ -37,15 +37,17 @@ static struct list_head conn_unused_list;
 
 void ptlrpc_dump_connections(void)
 {
+        char str[PTL_NALFMT_SIZE];
         struct list_head *tmp;
         struct ptlrpc_connection *c;
         ENTRY;
 
         list_for_each(tmp, &conn_list) {
                 c = list_entry(tmp, struct ptlrpc_connection, c_link);
-                CERROR("Connection %p/%s has refcount %d (nid="LPX64" on %s)\n",
+                CERROR("Connection %p/%s has refcount %d (nid=%s on %s)\n",
                        c, c->c_remote_uuid.uuid, atomic_read(&c->c_refcount),
-                       c->c_peer.peer_nid, c->c_peer.peer_ni->pni_name);
+                       ptlrpc_peernid2str(&c->c_peer, str),
+                       c->c_peer.peer_ni->pni_name);
         }
         EXIT;
 }
@@ -53,18 +55,19 @@ void ptlrpc_dump_connections(void)
 struct ptlrpc_connection *ptlrpc_get_connection(struct ptlrpc_peer *peer,
                                                 struct obd_uuid *uuid)
 {
+        char str[PTL_NALFMT_SIZE];
         struct list_head *tmp, *pos;
         struct ptlrpc_connection *c;
         ENTRY;
 
 
-        CDEBUG(D_INFO, "peer is "LPX64" on %s\n",
-               peer->peer_nid, peer->peer_ni->pni_name);
+        CDEBUG(D_INFO, "peer is %s on %s\n",
+               ptlrpc_id2str(peer, str), peer->peer_ni->pni_name);
 
         spin_lock(&conn_lock);
         list_for_each(tmp, &conn_list) {
                 c = list_entry(tmp, struct ptlrpc_connection, c_link);
-                if (peer->peer_nid == c->c_peer.peer_nid &&
+                if (memcmp(peer, &c->c_peer, sizeof(*peer)) == 0 &&
                     peer->peer_ni == c->c_peer.peer_ni) {
                         ptlrpc_connection_addref(c);
                         GOTO(out, c);
@@ -73,7 +76,7 @@ struct ptlrpc_connection *ptlrpc_get_connection(struct ptlrpc_peer *peer,
 
         list_for_each_safe(tmp, pos, &conn_unused_list) {
                 c = list_entry(tmp, struct ptlrpc_connection, c_link);
-                if (peer->peer_nid == c->c_peer.peer_nid &&
+                if (memcmp(peer, &c->c_peer, sizeof(*peer)) == 0 &&
                     peer->peer_ni == c->c_peer.peer_ni) {
                         ptlrpc_connection_addref(c);
                         list_del(&c->c_link);
@@ -106,6 +109,7 @@ struct ptlrpc_connection *ptlrpc_get_connection(struct ptlrpc_peer *peer,
 
 int ptlrpc_put_connection(struct ptlrpc_connection *c)
 {
+        char str[PTL_NALFMT_SIZE];
         int rc = 0;
         ENTRY;
 
@@ -114,8 +118,9 @@ int ptlrpc_put_connection(struct ptlrpc_connection *c)
                 RETURN(0);
         }
 
-        CDEBUG (D_INFO, "connection=%p refcount %d to "LPX64" on %s\n",
-                c, atomic_read(&c->c_refcount) - 1, c->c_peer.peer_nid,
+        CDEBUG (D_INFO, "connection=%p refcount %d to %s on %s\n",
+                c, atomic_read(&c->c_refcount) - 1, 
+                ptlrpc_peernid2str(&c->c_peer, str),
                 c->c_peer.peer_ni->pni_name);
 
         if (atomic_dec_and_test(&c->c_refcount)) {
@@ -134,10 +139,12 @@ int ptlrpc_put_connection(struct ptlrpc_connection *c)
 
 struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *c)
 {
+        char str[PTL_NALFMT_SIZE];
         ENTRY;
         atomic_inc(&c->c_refcount);
-        CDEBUG (D_INFO, "connection=%p refcount %d to "LPX64" on %s\n",
-                c, atomic_read(&c->c_refcount), c->c_peer.peer_nid,
+        CDEBUG (D_INFO, "connection=%p refcount %d to %s on %s\n",
+                c, atomic_read(&c->c_refcount),
+                ptlrpc_peernid2str(&c->c_peer, str),
                 c->c_peer.peer_ni->pni_name);
         RETURN(c);
 }
@@ -151,6 +158,7 @@ void ptlrpc_init_connection(void)
 
 void ptlrpc_cleanup_connection(void)
 {
+        char str[PTL_NALFMT_SIZE];
         struct list_head *tmp, *pos;
         struct ptlrpc_connection *c;
 
@@ -162,9 +170,10 @@ void ptlrpc_cleanup_connection(void)
         }
         list_for_each_safe(tmp, pos, &conn_list) {
                 c = list_entry(tmp, struct ptlrpc_connection, c_link);
-                CERROR("Connection %p/%s has refcount %d (nid="LPX64" on %s)\n",
+                CERROR("Connection %p/%s has refcount %d (nid=%s on %s)\n",
                        c, c->c_remote_uuid.uuid, atomic_read(&c->c_refcount),
-                       c->c_peer.peer_nid, c->c_peer.peer_ni->pni_name);
+                       ptlrpc_peernid2str(&c->c_peer, str),
+                       c->c_peer.peer_ni->pni_name);
                 list_del(&c->c_link);
                 OBD_FREE(c, sizeof(*c));
         }
index aab86ea..a2e5bc2 100644 (file)
 #endif
 #include <linux/obd_class.h>
 #include <linux/lustre_net.h>
+#include "ptlrpc_internal.h"
 
-struct ptlrpc_ni  ptlrpc_interfaces[NAL_MAX_NR];
+#if !defined(__KERNEL__) && CRAY_PORTALS
+/* forward ref in events.c */
+static void cray_portals_callback(ptl_event_t *ev);
+#endif
+
+
+struct ptlrpc_ni  ptlrpc_interfaces[8];
 int               ptlrpc_ninterfaces;
 
 /*  
@@ -38,20 +45,20 @@ int               ptlrpc_ninterfaces;
  */
 void request_out_callback(ptl_event_t *ev)
 {
-        struct ptlrpc_cb_id   *cbid = ev->mem_desc.user_ptr;
+        struct ptlrpc_cb_id   *cbid = ev->md.user_ptr;
         struct ptlrpc_request *req = cbid->cbid_arg;
         unsigned long          flags;
         ENTRY;
 
-        LASSERT (ev->type == PTL_EVENT_SENT ||
+        LASSERT (ev->type == PTL_EVENT_SEND_END ||
                  ev->type == PTL_EVENT_UNLINK);
         LASSERT (ev->unlinked);
 
-        DEBUG_REQ((ev->status == PTL_OK) ? D_NET : D_ERROR, req,
-                  "type %d, status %d", ev->type, ev->status);
+        DEBUG_REQ((ev->ni_fail_type == PTL_NI_OK) ? D_NET : D_ERROR, req,
+                  "type %d, status %d", ev->type, ev->ni_fail_type);
 
         if (ev->type == PTL_EVENT_UNLINK ||
-            ev->status != PTL_OK) {
+            ev->ni_fail_type != PTL_NI_OK) {
 
                 /* Failed send: make it seem like the reply timed out, just
                  * like failing sends in client.c does currently...  */
@@ -73,28 +80,28 @@ void request_out_callback(ptl_event_t *ev)
  */
 void reply_in_callback(ptl_event_t *ev)
 {
-        struct ptlrpc_cb_id   *cbid = ev->mem_desc.user_ptr;
+        struct ptlrpc_cb_id   *cbid = ev->md.user_ptr;
         struct ptlrpc_request *req = cbid->cbid_arg;
         unsigned long flags;
         ENTRY;
 
-        LASSERT (ev->type == PTL_EVENT_PUT ||
+        LASSERT (ev->type == PTL_EVENT_PUT_END ||
                  ev->type == PTL_EVENT_UNLINK);
         LASSERT (ev->unlinked);
-        LASSERT (ev->mem_desc.start == req->rq_repmsg);
+        LASSERT (ev->md.start == req->rq_repmsg);
         LASSERT (ev->offset == 0);
         LASSERT (ev->mlength <= req->rq_replen);
         
-        DEBUG_REQ((ev->status == PTL_OK) ? D_NET : D_ERROR, req,
-                  "type %d, status %d", ev->type, ev->status);
+        DEBUG_REQ((ev->ni_fail_type == PTL_NI_OK) ? D_NET : D_ERROR, req,
+                  "type %d, status %d", ev->type, ev->ni_fail_type);
 
         spin_lock_irqsave (&req->rq_lock, flags);
 
         LASSERT (req->rq_receiving_reply);
         req->rq_receiving_reply = 0;
 
-        if (ev->type == PTL_EVENT_PUT &&
-            ev->status == PTL_OK) {
+        if (ev->type == PTL_EVENT_PUT_END &&
+            ev->ni_fail_type == PTL_NI_OK) {
                 req->rq_replied = 1;
                 req->rq_nob_received = ev->mlength;
         }
@@ -112,21 +119,21 @@ void reply_in_callback(ptl_event_t *ev)
  */
 void client_bulk_callback (ptl_event_t *ev)
 {
-        struct ptlrpc_cb_id     *cbid = ev->mem_desc.user_ptr;
+        struct ptlrpc_cb_id     *cbid = ev->md.user_ptr;
         struct ptlrpc_bulk_desc *desc = cbid->cbid_arg;
         unsigned long            flags;
         ENTRY;
 
         LASSERT ((desc->bd_type == BULK_PUT_SINK && 
-                  ev->type == PTL_EVENT_PUT) ||
+                  ev->type == PTL_EVENT_PUT_END) ||
                  (desc->bd_type == BULK_GET_SOURCE &&
-                  ev->type == PTL_EVENT_GET) ||
+                  ev->type == PTL_EVENT_GET_END) ||
                  ev->type == PTL_EVENT_UNLINK);
         LASSERT (ev->unlinked);
 
-        CDEBUG((ev->status == PTL_OK) ? D_NET : D_ERROR,
+        CDEBUG((ev->ni_fail_type == PTL_NI_OK) ? D_NET : D_ERROR,
                "event type %d, status %d, desc %p\n", 
-               ev->type, ev->status, desc);
+               ev->type, ev->ni_fail_type, desc);
 
         spin_lock_irqsave (&desc->bd_lock, flags);
 
@@ -134,7 +141,7 @@ void client_bulk_callback (ptl_event_t *ev)
         desc->bd_network_rw = 0;
 
         if (ev->type != PTL_EVENT_UNLINK &&
-            ev->status == PTL_OK) {
+            ev->ni_fail_type == PTL_NI_OK) {
                 desc->bd_success = 1;
                 desc->bd_nob_transferred = ev->mlength;
         }
@@ -152,23 +159,24 @@ void client_bulk_callback (ptl_event_t *ev)
  */
 void request_in_callback(ptl_event_t *ev)
 {
-        struct ptlrpc_cb_id               *cbid = ev->mem_desc.user_ptr;
+        struct ptlrpc_cb_id               *cbid = ev->md.user_ptr;
         struct ptlrpc_request_buffer_desc *rqbd = cbid->cbid_arg;
         struct ptlrpc_srv_ni              *srv_ni = rqbd->rqbd_srv_ni;
         struct ptlrpc_service             *service = srv_ni->sni_service;
         struct ptlrpc_request             *req;
+        char                              str[PTL_NALFMT_SIZE];
         unsigned long                     flags;
         ENTRY;
 
-        LASSERT (ev->type == PTL_EVENT_PUT ||
+        LASSERT (ev->type == PTL_EVENT_PUT_END ||
                  ev->type == PTL_EVENT_UNLINK);
-        LASSERT ((char *)ev->mem_desc.start >= rqbd->rqbd_buffer);
-        LASSERT ((char *)ev->mem_desc.start + ev->offset + ev->mlength <=
+        LASSERT ((char *)ev->md.start >= rqbd->rqbd_buffer);
+        LASSERT ((char *)ev->md.start + ev->offset + ev->mlength <=
                  rqbd->rqbd_buffer + service->srv_buf_size);
 
-        CDEBUG((ev->status == PTL_OK) ? D_NET : D_ERROR,
+        CDEBUG((ev->ni_fail_type == PTL_OK) ? D_NET : D_ERROR,
                "event type %d, status %d, service %s\n", 
-               ev->type, ev->status, service->srv_name);
+               ev->type, ev->ni_fail_type, service->srv_name);
 
         if (ev->unlinked) {
                 /* If this is the last request message to fit in the
@@ -179,16 +187,18 @@ void request_in_callback(ptl_event_t *ev)
                 req = &rqbd->rqbd_req;
                 memset(req, 0, sizeof (*req));
         } else {
-                LASSERT (ev->type == PTL_EVENT_PUT);
-                if (ev->status != PTL_OK) {
+                LASSERT (ev->type == PTL_EVENT_PUT_END);
+                if (ev->ni_fail_type != PTL_NI_OK) {
                         /* We moaned above already... */
                         return;
                 }
                 OBD_ALLOC_GFP(req, sizeof(*req), GFP_ATOMIC);
                 if (req == NULL) {
                         CERROR("Can't allocate incoming request descriptor: "
-                               "Dropping %s RPC from "LPX64"\n",
-                               service->srv_name, ev->initiator.nid);
+                               "Dropping %s RPC from %s\n",
+                               service->srv_name, 
+                               portals_id2str(srv_ni->sni_ni->pni_number,
+                                              ev->initiator, str));
                         return;
                 }
         }
@@ -197,15 +207,16 @@ void request_in_callback(ptl_event_t *ev)
          * flags are reset and scalars are zero.  We only set the message
          * size to non-zero if this was a successful receive. */
         req->rq_xid = ev->match_bits;
-        req->rq_reqmsg = ev->mem_desc.start + ev->offset;
-        if (ev->type == PTL_EVENT_PUT &&
-            ev->status == PTL_OK)
+        req->rq_reqmsg = ev->md.start + ev->offset;
+        if (ev->type == PTL_EVENT_PUT_END &&
+            ev->ni_fail_type == PTL_NI_OK)
                 req->rq_reqlen = ev->mlength;
-        req->rq_arrival_time = ev->arrival_time;
-        req->rq_peer.peer_nid = ev->initiator.nid;
+        do_gettimeofday(&req->rq_arrival_time);
+        req->rq_peer.peer_id = ev->initiator;
         req->rq_peer.peer_ni = rqbd->rqbd_srv_ni->sni_ni;
+        ptlrpc_id2str(&req->rq_peer, req->rq_peerstr);
         req->rq_rqbd = rqbd;
-
+        
         spin_lock_irqsave (&service->srv_lock, flags);
 
         if (ev->unlinked) {
@@ -242,14 +253,14 @@ void request_in_callback(ptl_event_t *ev)
  */
 void reply_out_callback(ptl_event_t *ev)
 {
-        struct ptlrpc_cb_id       *cbid = ev->mem_desc.user_ptr;
+        struct ptlrpc_cb_id       *cbid = ev->md.user_ptr;
         struct ptlrpc_reply_state *rs = cbid->cbid_arg;
         struct ptlrpc_srv_ni      *sni = rs->rs_srv_ni;
         struct ptlrpc_service     *svc = sni->sni_service;
         unsigned long              flags;
         ENTRY;
 
-        LASSERT (ev->type == PTL_EVENT_SENT ||
+        LASSERT (ev->type == PTL_EVENT_SEND_END ||
                  ev->type == PTL_EVENT_ACK ||
                  ev->type == PTL_EVENT_UNLINK);
 
@@ -280,27 +291,27 @@ void reply_out_callback(ptl_event_t *ev)
  */
 void server_bulk_callback (ptl_event_t *ev)
 {
-        struct ptlrpc_cb_id     *cbid = ev->mem_desc.user_ptr;
+        struct ptlrpc_cb_id     *cbid = ev->md.user_ptr;
         struct ptlrpc_bulk_desc *desc = cbid->cbid_arg;
         unsigned long            flags;
         ENTRY;
 
-        LASSERT (ev->type == PTL_EVENT_SENT ||
+        LASSERT (ev->type == PTL_EVENT_SEND_END ||
                  ev->type == PTL_EVENT_UNLINK ||
                  (desc->bd_type == BULK_PUT_SOURCE &&
                   ev->type == PTL_EVENT_ACK) ||
                  (desc->bd_type == BULK_GET_SINK &&
-                  ev->type == PTL_EVENT_REPLY));
+                  ev->type == PTL_EVENT_REPLY_END));
 
-        CDEBUG((ev->status == PTL_OK) ? D_NET : D_ERROR,
+        CDEBUG((ev->ni_fail_type == PTL_NI_OK) ? D_NET : D_ERROR,
                "event type %d, status %d, desc %p\n", 
-               ev->type, ev->status, desc);
+               ev->type, ev->ni_fail_type, desc);
 
         spin_lock_irqsave (&desc->bd_lock, flags);
         
         if ((ev->type == PTL_EVENT_ACK ||
-             ev->type == PTL_EVENT_REPLY) &&
-            ev->status == PTL_OK) {
+             ev->type == PTL_EVENT_REPLY_END) &&
+            ev->ni_fail_type == PTL_NI_OK) {
                 /* We heard back from the peer, so even if we get this
                  * before the SENT event (oh yes we can), we know we
                  * read/wrote the peer buffer and how much... */
@@ -318,9 +329,9 @@ void server_bulk_callback (ptl_event_t *ev)
         EXIT;
 }
 
-static int ptlrpc_master_callback(ptl_event_t *ev)
+static void ptlrpc_master_callback(ptl_event_t *ev)
 {
-        struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr;
+        struct ptlrpc_cb_id *cbid = ev->md.user_ptr;
         void (*callback)(ptl_event_t *ev) = cbid->cbid_fn;
 
         /* Honestly, it's best to find out early. */
@@ -333,32 +344,33 @@ static int ptlrpc_master_callback(ptl_event_t *ev)
                  callback == server_bulk_callback);
         
         callback (ev);
-        return (0);
 }
 
 int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, struct ptlrpc_peer *peer)
 {
         struct ptlrpc_ni   *pni;
-        struct lustre_peer  lpeer;
+        __u32               peer_nal;
+        ptl_nid_t           peer_nid;
         int                 i;
-        int                 rc = lustre_uuid_to_peer (uuid->uuid, &lpeer);
-
+        char                str[PTL_NALFMT_SIZE];
+        int                 rc = lustre_uuid_to_peer(uuid->uuid, 
+                                                     &peer_nal, &peer_nid);
         if (rc != 0)
                 RETURN (rc);
 
         for (i = 0; i < ptlrpc_ninterfaces; i++) {
                 pni = &ptlrpc_interfaces[i];
 
-                if (!memcmp(&lpeer.peer_ni, &pni->pni_ni_h,
-                            sizeof (lpeer.peer_ni))) {
-                        peer->peer_nid = lpeer.peer_nid;
+                if (pni->pni_number == peer_nal) {
+                        peer->peer_id.nid = peer_nid;
+                        peer->peer_id.pid = LUSTRE_SRV_PTL_PID;
                         peer->peer_ni = pni;
                         return (0);
                 }
         }
 
-        CERROR("Can't find ptlrpc interface for "LPX64" ni handle %08lx."LPX64"\n",
-               lpeer.peer_nid, lpeer.peer_ni.nal_idx, lpeer.peer_ni.cookie);
+        CERROR("Can't find ptlrpc interface for NAL %d, NID %s\n",
+               peer_nal, portals_nid2str(peer_nal, peer_nid, str));
         return (-ENOENT);
 }
 
@@ -381,10 +393,10 @@ void ptlrpc_ni_fini(struct ptlrpc_ni *pni)
                         LBUG();
 
                 case PTL_OK:
-                        kportal_put_ni (pni->pni_number);
+                        PtlNIFini(pni->pni_ni_h);
                         return;
                         
-                case PTL_EQ_INUSE:
+                case PTL_EQ_IN_USE:
                         if (retries != 0)
                                 CWARN("Event queue for %s still busy\n",
                                       pni->pni_name);
@@ -399,33 +411,68 @@ void ptlrpc_ni_fini(struct ptlrpc_ni *pni)
         /* notreached */
 }
 
+ptl_pid_t ptl_get_pid(void)
+{
+        ptl_pid_t        pid;
+
+#ifndef  __KERNEL__
+        pid = getpid();
+#else
+        pid = LUSTRE_SRV_PTL_PID;
+#endif
+        return pid;
+}
+        
 int ptlrpc_ni_init(int number, char *name, struct ptlrpc_ni *pni)
 {
         int              rc;
-        ptl_handle_ni_t *nip = kportal_get_ni (number);
-
-        if (nip == NULL) {
-                CDEBUG (D_NET, "Network interface %s not loaded\n", name);
+        char             str[20];
+        ptl_handle_ni_t  nih;
+        ptl_pid_t        pid;
+        
+        pid = ptl_get_pid();
+        
+        /* We're not passing any limits yet... */
+        rc = PtlNIInit(number, pid, NULL, NULL, &nih);
+        if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
+                CDEBUG (D_NET, "Can't init network interface %s: %d\n", 
+                        name, rc);
                 return (-ENOENT);
         }
 
-        CDEBUG (D_NET, "init %d %s: nal_idx %ld\n", number, name, nip->nal_idx);
+        CDEBUG(D_NET, "My pid is: %x\n", ptl_get_pid());
+        
+        PtlSnprintHandle(str, sizeof(str), nih);
+        CDEBUG (D_NET, "init %d %s: %s\n", number, name, str);
 
         pni->pni_name = name;
         pni->pni_number = number;
-        pni->pni_ni_h = *nip;
+        pni->pni_ni_h = nih;
 
-        pni->pni_eq_h = PTL_HANDLE_NONE;
+        pni->pni_eq_h = PTL_INVALID_HANDLE;
 
+        /* CAVEAT EMPTOR: how we process portals events is _radically_
+         * different depending on... */
 #ifdef __KERNEL__
-        /* kernel: portals calls the callback when the event is added to the
-         * queue, so we don't care if we lose events */
+        /* kernel portals calls our master callback when events are added to
+         * the event queue.  In fact lustre never pulls events off this queue,
+         * so it's only sized for some debug history. */
         rc = PtlEQAlloc(pni->pni_ni_h, 1024, ptlrpc_master_callback,
                         &pni->pni_eq_h);
 #else
-        /* liblustre: no asynchronous callback and allocate a nice big event
-         * queue so we don't drop any events... */
-        rc = PtlEQAlloc(pni->pni_ni_h, 10240, NULL, &pni->pni_eq_h);
+        /* liblustre calls the master callback when it removes events from the
+         * event queue.  The event queue has to be big enough not to drop
+         * anything */
+# if CRAY_PORTALS
+        /* cray portals implements a non-standard callback to notify us there
+         * are buffered events even when the app is not doing a filesystem
+         * call. */
+        rc = PtlEQAlloc(pni->pni_ni_h, 10240, cray_portals_callback,
+                        &pni->pni_eq_h);
+# else
+        rc = PtlEQAlloc(pni->pni_ni_h, 10240, PTL_EQ_HANDLER_NONE,
+                        &pni->pni_eq_h);
+# endif
 #endif
         if (rc != PTL_OK)
                 GOTO (fail, rc = -ENOMEM);
@@ -473,19 +520,16 @@ liblustre_check_events (int timeout)
 {
         ptl_event_t ev;
         int         rc;
+        int         i;
         ENTRY;
 
-        if (timeout) {
-                rc = PtlEQWait_timeout(ptlrpc_interfaces[0].pni_eq_h, &ev, timeout);
-        } else {
-                rc = PtlEQGet (ptlrpc_interfaces[0].pni_eq_h, &ev);
-        }
+        rc = PtlEQPoll(&ptlrpc_interfaces[0].pni_eq_h, 1, timeout * 1000,
+                       &ev, &i);
         if (rc == PTL_EQ_EMPTY)
                 RETURN(0);
         
         LASSERT (rc == PTL_EQ_DROPPED || rc == PTL_OK);
         
-#ifndef __KERNEL__
         /* liblustre: no asynch callback so we can't affort to miss any
          * events... */
         if (rc == PTL_EQ_DROPPED) {
@@ -494,10 +538,11 @@ liblustre_check_events (int timeout)
         }
         
         ptlrpc_master_callback (&ev);
-#endif
         RETURN(1);
 }
 
+int liblustre_waiting = 0;
+
 int
 liblustre_wait_event (int timeout)
 {
@@ -505,40 +550,63 @@ liblustre_wait_event (int timeout)
         struct liblustre_wait_callback *llwc;
         int                             found_something = 0;
 
-        /* First check for any new events */
-        if (liblustre_check_events(0))
-                found_something = 1;
+        /* single threaded recursion check... */
+        liblustre_waiting = 1;
 
-        /* Now give all registered callbacks a bite at the cherry */
-        list_for_each(tmp, &liblustre_wait_callbacks) {
-                llwc = list_entry(tmp, struct liblustre_wait_callback, 
-                                  llwc_list);
-                
-                if (llwc->llwc_fn(llwc->llwc_arg))
+        for (;;) {
+                /* Deal with all pending events */
+                while (liblustre_check_events(0))
                         found_something = 1;
-        }
 
-        /* return to caller if something happened */
-        if (found_something)
-                return 1;
-        
-        /* block for an event, returning immediately on timeout */
-        if (!liblustre_check_events(timeout))
-                return 0;
-
-        /* an event occurred; let all registered callbacks progress... */
-        list_for_each(tmp, &liblustre_wait_callbacks) {
-                llwc = list_entry(tmp, struct liblustre_wait_callback, 
-                                  llwc_list);
+                /* Give all registered callbacks a bite at the cherry */
+                list_for_each(tmp, &liblustre_wait_callbacks) {
+                        llwc = list_entry(tmp, struct liblustre_wait_callback, 
+                                          llwc_list);
                 
-                if (llwc->llwc_fn(llwc->llwc_arg))
-                        found_something = 1;
+                        if (llwc->llwc_fn(llwc->llwc_arg))
+                                found_something = 1;
+                }
+
+                if (found_something || timeout == 0)
+                        break;
+
+                /* Nothing so far, but I'm allowed to block... */
+                found_something = liblustre_check_events(timeout);
+                if (!found_something)           /* still nothing */
+                        break;                  /* I timed out */
         }
 
-        /* ...and tell caller something happened */
-        return 1;
+        liblustre_waiting = 0;
+
+        return found_something;
+}
+
+#if CRAY_PORTALS
+static void cray_portals_callback(ptl_event_t *ev)
+{
+        /* We get a callback from the client Cray portals implementation
+         * whenever anyone calls PtlEQPoll(), and an event queue with a
+         * callback handler has outstanding events.
+         *
+         * If it's not liblustre calling PtlEQPoll(), this lets us know we
+         * have outstanding events which we handle with
+         * liblustre_wait_event().
+         *
+         * Otherwise, we're already eagerly consuming events and we'd
+         * handle events out of order if we recursed. */
+        if (!liblustre_waiting)
+                liblustre_wait_event(0);
 }
 #endif
+#endif /* __KERNEL__ */
+
+int ptlrpc_default_nal(void)
+{
+        if (ptlrpc_ninterfaces == 0)
+                return (-ENOENT);
+
+        return (ptlrpc_interfaces[0].pni_number);
+}
 
 int ptlrpc_init_portals(void)
 {
@@ -548,11 +616,17 @@ int ptlrpc_init_portals(void)
                 int   number;
                 char *name;
         } ptl_nis[] = {
-                {QSWNAL,  "qswnal"},
-                {SOCKNAL, "socknal"},
-                {GMNAL,   "gmnal"},
-                {IBNAL,   "ibnal"},
-                {TCPNAL,  "tcpnal"}};
+#if !CRAY_PORTALS
+                {QSWNAL,    "qswnal"},
+                {SOCKNAL,   "socknal"},
+                {GMNAL,     "gmnal"},
+                {OPENIBNAL, "openibnal"},
+                {IIBNAL,    "iibnal"},
+                {TCPNAL,    "tcpnal"},
+#else
+                {CRAY_KB_ERNAL, "cray_kb_ernal"},
+#endif
+        };
         int   rc;
         int   i;
 
index f6affa8..1171fb5 100644 (file)
@@ -23,7 +23,6 @@
 #define DEBUG_SUBSYSTEM S_RPC
 #ifndef __KERNEL__
 #include <liblustre.h>
-#include <portals/lib-types.h>
 #endif
 #include <linux/obd_support.h>
 #include <linux/lustre_net.h>
@@ -35,7 +34,6 @@ static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len,
                          ptl_ack_req_t ack, struct ptlrpc_cb_id *cbid,
                          struct ptlrpc_connection *conn, int portal, __u64 xid)
 {
-        ptl_process_id_t remote_id;
         int              rc;
         ptl_md_t         md;
         char str[PTL_NALFMT_SIZE];
@@ -43,22 +41,16 @@ static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len,
 
         LASSERT (portal != 0);
         LASSERT (conn != NULL);
-        CDEBUG (D_INFO, "conn=%p ni %s nid "LPX64" (%s) on %s\n",
+        CDEBUG (D_INFO, "conn=%p ni %s id %s on %s\n",
                 conn, conn->c_peer.peer_ni->pni_name,
-                conn->c_peer.peer_nid,
-                portals_nid2str(conn->c_peer.peer_ni->pni_number,
-                                conn->c_peer.peer_nid, str),
+                ptlrpc_id2str(&conn->c_peer, str),
                 conn->c_peer.peer_ni->pni_name);
-
-        remote_id.nid = conn->c_peer.peer_nid,
-        remote_id.pid = 0;
-
         md.start     = base;
         md.length    = len;
         md.threshold = (ack == PTL_ACK_REQ) ? 2 : 1;
-        md.options   = 0;
+        md.options   = PTLRPC_MD_OPTIONS;
         md.user_ptr  = cbid;
-        md.eventq    = conn->c_peer.peer_ni->pni_eq_h;
+        md.eq_handle = conn->c_peer.peer_ni->pni_eq_h;
 
         if (ack == PTL_ACK_REQ &&
             OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_ACK | OBD_FAIL_ONCE)) {
@@ -67,24 +59,26 @@ static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len,
                 obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED;
         }
 
-        rc = PtlMDBind (conn->c_peer.peer_ni->pni_ni_h, md, mdh);
+        rc = PtlMDBind (conn->c_peer.peer_ni->pni_ni_h, md, 
+                        PTL_UNLINK, mdh);
         if (rc != PTL_OK) {
                 CERROR ("PtlMDBind failed: %d\n", rc);
-                LASSERT (rc == PTL_NOSPACE);
+                LASSERT (rc == PTL_NO_SPACE);
                 RETURN (-ENOMEM);
         }
 
         CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64"\n",
                len, portal, xid);
 
-        rc = PtlPut (*mdh, ack, remote_id, portal, 0, xid, 0, 0);
+        rc = PtlPut (*mdh, ack, conn->c_peer.peer_id, portal, 0, xid, 0, 0);
         if (rc != PTL_OK) {
                 int rc2;
                 /* We're going to get an UNLINK event when I unlink below,
                  * which will complete just like any other failed send, so
                  * I fall through and return success here! */
-                CERROR("PtlPut("LPU64", %d, "LPD64") failed: %d\n",
-                       remote_id.nid, portal, xid, rc);
+                CERROR("PtlPut(%s, %d, "LPD64") failed: %d\n",
+                       ptlrpc_id2str(&conn->c_peer, str),
+                       portal, xid, rc);
                 rc2 = PtlMDUnlink(*mdh);
                 LASSERTF(rc2 == PTL_OK, "rc2 = %d\n", rc2);
         }
@@ -97,9 +91,9 @@ int ptlrpc_start_bulk_transfer (struct ptlrpc_bulk_desc *desc)
         int                 rc;
         int                 rc2;
         struct ptlrpc_peer *peer;
-        ptl_process_id_t    remote_id;
         ptl_md_t            md;
         __u64               xid;
+        char                str[PTL_NALFMT_SIZE];
         ENTRY;
 
         if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_PTLRPC_BULK_PUT_NET)) 
@@ -112,56 +106,50 @@ int ptlrpc_start_bulk_transfer (struct ptlrpc_bulk_desc *desc)
         desc->bd_success = 0;
         peer = &desc->bd_export->exp_connection->c_peer;
 
-        md.start = &desc->bd_iov[0];
-        md.niov = desc->bd_page_count;
-        md.length = desc->bd_nob;
-        md.eventq = peer->peer_ni->pni_eq_h;
-        md.threshold = 2; /* SENT and ACK/REPLY */
-#ifdef __KERNEL__
-        md.options = PTL_MD_KIOV;
-#else
-        md.options = PTL_MD_IOV;
-#endif
         md.user_ptr = &desc->bd_cbid;
+        md.eq_handle = peer->peer_ni->pni_eq_h;
+        md.threshold = 2; /* SENT and ACK/REPLY */
+        md.options = PTLRPC_MD_OPTIONS;
+        ptlrpc_fill_bulk_md(&md, desc);
+
         LASSERT (desc->bd_cbid.cbid_fn == server_bulk_callback);
         LASSERT (desc->bd_cbid.cbid_arg == desc);
 
         /* NB total length may be 0 for a read past EOF, so we send a 0
          * length bulk, since the client expects a bulk event. */
 
-        rc = PtlMDBind(peer->peer_ni->pni_ni_h, md, &desc->bd_md_h);
+        rc = PtlMDBind(peer->peer_ni->pni_ni_h, md,
+                       PTL_UNLINK, &desc->bd_md_h);
         if (rc != PTL_OK) {
                 CERROR("PtlMDBind failed: %d\n", rc);
-                LASSERT (rc == PTL_NOSPACE);
+                LASSERT (rc == PTL_NO_SPACE);
                 RETURN(-ENOMEM);
         }
 
         /* Client's bulk and reply matchbits are the same */
         xid = desc->bd_req->rq_xid;
-        remote_id.nid = peer->peer_nid;
-        remote_id.pid = 0;
-
         CDEBUG(D_NET, "Transferring %u pages %u bytes via portal %d on %s "
-               "nid "LPX64" pid %d xid "LPX64"\n", 
-               md.niov, md.length, desc->bd_portal, peer->peer_ni->pni_name,
-               remote_id.nid, remote_id.pid, xid);
+               "nid %s pid %d xid "LPX64"\n", desc->bd_iov_count,
+               desc->bd_nob, desc->bd_portal, peer->peer_ni->pni_name,
+               ptlrpc_id2str(peer, str), peer->peer_id.pid, xid);
 
         /* Network is about to get at the memory */
         desc->bd_network_rw = 1;
 
         if (desc->bd_type == BULK_PUT_SOURCE)
-                rc = PtlPut (desc->bd_md_h, PTL_ACK_REQ, remote_id,
+                rc = PtlPut (desc->bd_md_h, PTL_ACK_REQ, peer->peer_id,
                              desc->bd_portal, 0, xid, 0, 0);
         else
-                rc = PtlGet (desc->bd_md_h, remote_id,
+                rc = PtlGet (desc->bd_md_h, peer->peer_id,
                              desc->bd_portal, 0, xid, 0);
-        
+
         if (rc != PTL_OK) {
                 /* Can't send, so we unlink the MD bound above.  The UNLINK
                  * event this creates will signal completion with failure,
                  * so we return SUCCESS here! */
-                CERROR("Transfer("LPU64", %d, "LPX64") failed: %d\n",
-                       remote_id.nid, desc->bd_portal, xid, rc);
+                CERROR("Transfer(%s, %d, "LPX64") failed: %d\n",
+                       ptlrpc_id2str(peer, str),
+                       desc->bd_portal, xid, rc);
                 rc2 = PtlMDUnlink(desc->bd_md_h);
                 LASSERT (rc2 == PTL_OK);
         }
@@ -182,16 +170,11 @@ void ptlrpc_abort_bulk (struct ptlrpc_bulk_desc *desc)
                 return;                         /* never started */
         
         /* The unlink ensures the callback happens ASAP and is the last
-         * one.  If it fails, it must be because completion just
-         * happened. */
+         * one.  If it fails, it must be because completion just happened,
+         * but we must still l_wait_event() in this case, to give liblustre
+         * a chance to run server_bulk_callback()*/
 
-        rc = PtlMDUnlink (desc->bd_md_h);
-        if (rc == PTL_INV_MD) {
-                LASSERT(!ptlrpc_bulk_active(desc));
-                return;
-        }
-        
-        LASSERT (rc == PTL_OK);
+        PtlMDUnlink (desc->bd_md_h);
 
         for (;;) {
                 /* Network access will complete in finite time but the HUGE
@@ -213,7 +196,6 @@ int ptlrpc_register_bulk (struct ptlrpc_request *req)
         struct ptlrpc_peer *peer;
         int rc;
         int rc2;
-        ptl_process_id_t source_id;
         ptl_handle_me_t  me_h;
         ptl_md_t         md;
         ENTRY;
@@ -224,7 +206,7 @@ int ptlrpc_register_bulk (struct ptlrpc_request *req)
         /* NB no locking required until desc is on the network */
         LASSERT (desc->bd_nob > 0);
         LASSERT (!desc->bd_network_rw);
-        LASSERT (desc->bd_page_count <= PTLRPC_MAX_BRW_PAGES);
+        LASSERT (desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES);
         LASSERT (desc->bd_req != NULL);
         LASSERT (desc->bd_type == BULK_PUT_SINK ||
                  desc->bd_type == BULK_GET_SOURCE);
@@ -233,19 +215,14 @@ int ptlrpc_register_bulk (struct ptlrpc_request *req)
 
         peer = &desc->bd_import->imp_connection->c_peer;
 
-        md.start = &desc->bd_iov[0];
-        md.niov = desc->bd_page_count;
-        md.length = desc->bd_nob;
-        md.eventq = peer->peer_ni->pni_eq_h;
-        md.threshold = 1;                       /* PUT or GET */
-        md.options = (desc->bd_type == BULK_GET_SOURCE) ? 
-                     PTL_MD_OP_GET : PTL_MD_OP_PUT;
-#ifdef __KERNEL__
-        md.options |= PTL_MD_KIOV;
-#else
-        md.options |= PTL_MD_IOV;
-#endif
         md.user_ptr = &desc->bd_cbid;
+        md.eq_handle = peer->peer_ni->pni_eq_h;
+        md.threshold = 1;                       /* PUT or GET */
+        md.options = PTLRPC_MD_OPTIONS | 
+                     ((desc->bd_type == BULK_GET_SOURCE) ? 
+                      PTL_MD_OP_GET : PTL_MD_OP_PUT);
+        ptlrpc_fill_bulk_md(&md, desc);
+
         LASSERT (desc->bd_cbid.cbid_fn == client_bulk_callback);
         LASSERT (desc->bd_cbid.cbid_arg == desc);
 
@@ -256,15 +233,12 @@ int ptlrpc_register_bulk (struct ptlrpc_request *req)
         desc->bd_registered = 1;
         desc->bd_last_xid = req->rq_xid;
 
-        source_id.nid = desc->bd_import->imp_connection->c_peer.peer_nid;
-        source_id.pid = PTL_PID_ANY;
-
-        rc = PtlMEAttach(peer->peer_ni->pni_ni_h,
-                         desc->bd_portal, source_id, req->rq_xid, 0,
-                         PTL_UNLINK, PTL_INS_AFTER, &me_h);
+        rc = PtlMEAttach(peer->peer_ni->pni_ni_h, desc->bd_portal,
+                         desc->bd_import->imp_connection->c_peer.peer_id, 
+                         req->rq_xid, 0, PTL_UNLINK, PTL_INS_AFTER, &me_h);
         if (rc != PTL_OK) {
                 CERROR("PtlMEAttach failed: %d\n", rc);
-                LASSERT (rc == PTL_NOSPACE);
+                LASSERT (rc == PTL_NO_SPACE);
                 RETURN (-ENOMEM);
         }
 
@@ -273,7 +247,7 @@ int ptlrpc_register_bulk (struct ptlrpc_request *req)
         rc = PtlMDAttach(me_h, md, PTL_UNLINK, &desc->bd_md_h);
         if (rc != PTL_OK) {
                 CERROR("PtlMDAttach failed: %d\n", rc);
-                LASSERT (rc == PTL_NOSPACE);
+                LASSERT (rc == PTL_NO_SPACE);
                 desc->bd_network_rw = 0;
                 rc2 = PtlMEUnlink (me_h);
                 LASSERT (rc2 == PTL_OK);
@@ -283,7 +257,7 @@ int ptlrpc_register_bulk (struct ptlrpc_request *req)
         CDEBUG(D_NET, "Setup bulk %s buffers: %u pages %u bytes, xid "LPX64", "
                "portal %u on %s\n",
                desc->bd_type == BULK_GET_SOURCE ? "get-source" : "put-sink",
-               md.niov, md.length,
+               desc->bd_iov_count, desc->bd_nob,
                req->rq_xid, desc->bd_portal, peer->peer_ni->pni_name);
         RETURN(0);
 }
@@ -305,17 +279,12 @@ void ptlrpc_unregister_bulk (struct ptlrpc_request *req)
         LASSERT (desc->bd_req == req);  /* bd_req NULL until registered */
 
         /* the unlink ensures the callback happens ASAP and is the last
-         * one.  If it fails, it must be because completion just
-         * happened. */
-
-        rc = PtlMDUnlink (desc->bd_md_h);
-        if (rc == PTL_INV_MD) {
-                LASSERT(!ptlrpc_bulk_active(desc));
-                return;
-        }
-
-        LASSERT (rc == PTL_OK);
+         * one.  If it fails, it must be because completion just happened,
+         * but we must still l_wait_event() in this case to give liblustre
+         * a chance to run client_bulk_callback() */
 
+        PtlMDUnlink (desc->bd_md_h);
+        
         if (req->rq_set != NULL)
                 wq = &req->rq_set->set_waitq;
         else
@@ -416,7 +385,6 @@ int ptl_send_rpc(struct ptlrpc_request *request)
         int rc2;
         struct ptlrpc_connection *connection;
         unsigned long flags;
-        ptl_process_id_t source_id;
         ptl_handle_me_t  reply_me_h;
         ptl_md_t         reply_md;
         ENTRY;
@@ -438,10 +406,7 @@ int ptl_send_rpc(struct ptlrpc_request *request)
         request->rq_reqmsg->handle = request->rq_import->imp_remote_handle;
         request->rq_reqmsg->type = PTL_RPC_MSG_REQUEST;
         request->rq_reqmsg->conn_cnt = request->rq_import->imp_conn_cnt;
-
-        source_id.nid = connection->c_peer.peer_nid;
-        source_id.pid = PTL_PID_ANY;
-
+                
         LASSERT (request->rq_replen != 0);
         if (request->rq_repmsg == NULL)
                 OBD_ALLOC(request->rq_repmsg, request->rq_replen);
@@ -450,11 +415,11 @@ int ptl_send_rpc(struct ptlrpc_request *request)
 
         rc = PtlMEAttach(connection->c_peer.peer_ni->pni_ni_h,
                          request->rq_reply_portal, /* XXX FIXME bug 249 */
-                         source_id, request->rq_xid, 0, PTL_UNLINK,
-                         PTL_INS_AFTER, &reply_me_h);
+                         connection->c_peer.peer_id, request->rq_xid, 0,
+                         PTL_UNLINK, PTL_INS_AFTER, &reply_me_h);
         if (rc != PTL_OK) {
                 CERROR("PtlMEAttach failed: %d\n", rc);
-                LASSERT (rc == PTL_NOSPACE);
+                LASSERT (rc == PTL_NO_SPACE);
                 GOTO(cleanup_repmsg, rc = -ENOMEM);
         }
 
@@ -473,15 +438,15 @@ int ptl_send_rpc(struct ptlrpc_request *request)
         reply_md.start     = request->rq_repmsg;
         reply_md.length    = request->rq_replen;
         reply_md.threshold = 1;
-        reply_md.options   = PTL_MD_OP_PUT;
+        reply_md.options   = PTLRPC_MD_OPTIONS | PTL_MD_OP_PUT;
         reply_md.user_ptr  = &request->rq_reply_cbid;
-        reply_md.eventq    = connection->c_peer.peer_ni->pni_eq_h;
+        reply_md.eq_handle = connection->c_peer.peer_ni->pni_eq_h;
 
         rc = PtlMDAttach(reply_me_h, reply_md, PTL_UNLINK, 
                          &request->rq_reply_md_h);
         if (rc != PTL_OK) {
                 CERROR("PtlMDAttach failed: %d\n", rc);
-                LASSERT (rc == PTL_NOSPACE);
+                LASSERT (rc == PTL_NO_SPACE);
                 GOTO(cleanup_me, rc -ENOMEM);
         }
 
@@ -537,10 +502,8 @@ int ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd)
         ptl_md_t                 md;
         ptl_handle_me_t          me_h;
 
-        CDEBUG(D_NET, "PtlMEAttach: portal %d on %s h %lx."LPX64"\n",
-               service->srv_req_portal, srv_ni->sni_ni->pni_name,
-               srv_ni->sni_ni->pni_ni_h.nal_idx,
-               srv_ni->sni_ni->pni_ni_h.cookie);
+        CDEBUG(D_NET, "PtlMEAttach: portal %d on %s\n",
+               service->srv_req_portal, srv_ni->sni_ni->pni_name);
 
         if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_PTLRPC_RQBD))
                 return (-ENOMEM);
@@ -555,20 +518,20 @@ int ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd)
         LASSERT(rqbd->rqbd_refcount == 0);
         rqbd->rqbd_refcount = 1;
 
-        md.start      = rqbd->rqbd_buffer;
-        md.length     = service->srv_buf_size;
-        md.max_size   = service->srv_max_req_size;
-        md.threshold  = PTL_MD_THRESH_INF;
-        md.options    = PTL_MD_OP_PUT | PTL_MD_MAX_SIZE | PTL_MD_AUTO_UNLINK;
-        md.user_ptr   = &rqbd->rqbd_cbid;
-        md.eventq     = srv_ni->sni_ni->pni_eq_h;
+        md.start     = rqbd->rqbd_buffer;
+        md.length    = service->srv_buf_size;
+        md.max_size  = service->srv_max_req_size;
+        md.threshold = PTL_MD_THRESH_INF;
+        md.options   = PTLRPC_MD_OPTIONS | PTL_MD_OP_PUT | PTL_MD_MAX_SIZE;
+        md.user_ptr  = &rqbd->rqbd_cbid;
+        md.eq_handle = srv_ni->sni_ni->pni_eq_h;
         
         rc = PtlMDAttach(me_h, md, PTL_UNLINK, &rqbd->rqbd_md_h);
         if (rc == PTL_OK)
                 return (0);
 
         CERROR("PtlMDAttach failed: %d; \n", rc);
-        LASSERT (rc == PTL_NOSPACE);
+        LASSERT (rc == PTL_NO_SPACE);
         rc = PtlMEUnlink (me_h);
         LASSERT (rc == PTL_OK);
         rqbd->rqbd_refcount = 0;
index bcbf095..6f5d086 100644 (file)
@@ -35,7 +35,8 @@
 #include "ptlrpc_internal.h"
 
 #ifdef __KERNEL__
-#ifndef CRAY_PORTALS
+#if !CRAY_PORTALS
+
 void ptlrpc_fill_bulk_md (ptl_md_t *md, struct ptlrpc_bulk_desc *desc)
 {
         LASSERT (desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES);
@@ -57,11 +58,16 @@ void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page,
 
         desc->bd_iov_count++;
 }
-#else
+
+#else  /* CRAY_PORTALS */
+#ifdef PTL_MD_KIOV
+#error "Conflicting compilation directives"
+#endif
+
 void ptlrpc_fill_bulk_md (ptl_md_t *md, struct ptlrpc_bulk_desc *desc)
 {
         LASSERT (desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES);
-        LASSERT (!(md->options & (PTL_MD_IOVEC | PTL_MD_KIOV | PTL_MD_PHYS)));
+        LASSERT (!(md->options & (PTL_MD_IOVEC | PTL_MD_PHYS)));
         
         md->options |= (PTL_MD_IOVEC | PTL_MD_PHYS);
         md->start = &desc->bd_iov[0];
@@ -79,22 +85,24 @@ void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page,
 
         desc->bd_iov_count++;
 }
-#endif
 
+#endif /* CRAY_PORTALS */
 #else /* !__KERNEL__ */
+
 void ptlrpc_fill_bulk_md(ptl_md_t *md, struct ptlrpc_bulk_desc *desc)
 {
+#if CRAY_PORTALS
+        LASSERT (!(md->options & (PTL_MD_IOVEC | PTL_MD_PHYS)));
+        LASSERT (desc->bd_iov_count == 1);
+#else
         LASSERT (!(md->options & (PTL_MD_IOVEC | PTL_MD_KIOV | PTL_MD_PHYS)));
-
+#endif
         if (desc->bd_iov_count == 1) {
                 md->start = desc->bd_iov[0].iov_base;
                 md->length = desc->bd_iov[0].iov_len;
                 return;
         }
         
-#if CRAY_PORTALS
-        LBUG();
-#endif
         md->options |= PTL_MD_IOVEC;
         md->start = &desc->bd_iov[0];
         md->length = desc->bd_iov_count;
@@ -104,14 +112,12 @@ static int can_merge_iovs(ptl_md_iovec_t *existing, ptl_md_iovec_t *candidate)
 {
         if (existing->iov_base + existing->iov_len == candidate->iov_base) 
                 return 1;
-        /* XXX it's good to have an warning here, but user-level echo_client
-         * will hit this. reenable it when we fixed echo_client.
-         */
 #if 0
+        /* Enable this section to provide earlier evidence of fragmented bulk */
         CERROR("Can't merge iovs %p for %x, %p for %x\n",
                existing->iov_base, existing->iov_len,
                candidate->iov_base, candidate->iov_len);
-#endif        
+#endif
         return 0;
 }
 
@@ -129,4 +135,5 @@ void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page,
                 desc->bd_iov_count++;
         }
 }
-#endif
+
+#endif /* !__KERNEL__ */
index 1db774e..e49b5f9 100644 (file)
@@ -107,6 +107,11 @@ enum {
 
 int ptlrpc_expire_one_request(struct ptlrpc_request *req);
 
+/* pers.c */
+void ptlrpc_fill_bulk_md(ptl_md_t *md, struct ptlrpc_bulk_desc *desc);
+void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page, 
+                          int pageoffset, int len);
+
 /* pinger.c */
 int ptlrpc_start_pinger(void);
 int ptlrpc_stop_pinger(void);
index f3caf6a..94eb45d 100644 (file)
@@ -48,17 +48,17 @@ ptlrpc_free_server_req (struct ptlrpc_request *req)
 
         OBD_FREE(req, sizeof(*req));
 }
-        
+
 static char *
 ptlrpc_alloc_request_buffer (int size)
 {
         char *ptr;
-        
+
         if (size > SVC_BUF_VMALLOC_THRESHOLD)
                 OBD_VMALLOC(ptr, size);
         else
                 OBD_ALLOC(ptr, size);
-        
+
         return (ptr);
 }
 
@@ -372,7 +372,7 @@ ptlrpc_server_free_request(struct ptlrpc_service *svc, struct ptlrpc_request *re
         ptlrpc_free_server_req(req);
 }
 
-static int 
+static int
 ptlrpc_server_handle_request (struct ptlrpc_service *svc)
 {
         struct ptlrpc_request *request;
@@ -419,17 +419,16 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc)
 #endif
         rc = lustre_unpack_msg (request->rq_reqmsg, request->rq_reqlen);
         if (rc != 0) {
-                CERROR ("error unpacking request: ptl %d from "LPX64
+                CERROR ("error unpacking request: ptl %d from %s"
                         " xid "LPU64"\n", svc->srv_req_portal,
-                       request->rq_peer.peer_nid, request->rq_xid);
+                        request->rq_peerstr, request->rq_xid);
                 goto out;
         }
 
         rc = -EINVAL;
         if (request->rq_reqmsg->type != PTL_RPC_MSG_REQUEST) {
-                CERROR("wrong packet type received (type=%u) from "
-                       LPX64"\n", request->rq_reqmsg->type,
-                       request->rq_peer.peer_nid);
+                CERROR("wrong packet type received (type=%u) from %s\n",
+                       request->rq_reqmsg->type, request->rq_peerstr);
                 goto out;
         }
 
@@ -439,9 +438,10 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc)
          * client's timeout is similar to mine, she'll be timing out this
          * REQ anyway (bug 1502) */
         if (timediff / 1000000 > (long)obd_timeout) {
-                CERROR("Dropping timed-out opc %d request from "LPX64
+                CERROR("Dropping timed-out opc %d request from %s"
                        ": %ld seconds old\n", request->rq_reqmsg->opc,
-                       request->rq_peer.peer_nid, timediff / 1000000);
+                       request->rq_peerstr,
+                       timediff / 1000000);
                 goto out;
         }
 
@@ -461,26 +461,27 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc)
         }
 
         CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:ni:nid:opc "
-               "%s:%s+%d:%d:"LPU64":%s:"LPX64":%d\n", current->comm,
+               "%s:%s+%d:%d:"LPU64":%s:%s:%d\n", current->comm,
                (request->rq_export ?
                 (char *)request->rq_export->exp_client_uuid.uuid : "0"),
                (request->rq_export ?
                 atomic_read(&request->rq_export->exp_refcount) : -99),
                request->rq_reqmsg->status, request->rq_xid,
                request->rq_peer.peer_ni->pni_name,
-               request->rq_peer.peer_nid,
+               request->rq_peerstr,
                request->rq_reqmsg->opc);
 
         rc = svc->srv_handler(request);
+
         CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:ni:nid:opc "
-               "%s:%s+%d:%d:"LPU64":%s:"LPX64":%d\n", current->comm,
+               "%s:%s+%d:%d:"LPU64":%s:%s:%d\n", current->comm,
                (request->rq_export ?
                 (char *)request->rq_export->exp_client_uuid.uuid : "0"),
                (request->rq_export ?
                 atomic_read(&request->rq_export->exp_refcount) : -99),
                request->rq_reqmsg->status, request->rq_xid,
                request->rq_peer.peer_ni->pni_name,
-               request->rq_peer.peer_nid,
+               request->rq_peerstr,
                request->rq_reqmsg->opc);
 
 put_conn:
@@ -493,9 +494,9 @@ put_conn:
         timediff = timeval_sub(&work_end, &work_start);
 
         CDEBUG((timediff / 1000000 > (long)obd_timeout) ? D_ERROR : D_HA,
-               "request "LPU64" opc %u from NID "LPX64" processed in %ldus "
+               "request "LPU64" opc %u from %s processed in %ldus "
                "(%ldus total)\n", request->rq_xid, request->rq_reqmsg->opc,
-               request->rq_peer.peer_nid,
+               request->rq_peerstr,
                timediff, timeval_sub(&work_end, &request->rq_arrival_time));
 
         if (svc->srv_stats != NULL) {
@@ -522,6 +523,7 @@ ptlrpc_server_handle_reply (struct ptlrpc_service *svc)
         struct obd_device         *obd;
         int                        nlocks;
         int                        been_handled;
+        char                       str[PTL_NALFMT_SIZE];
         ENTRY;
 
         spin_lock_irqsave (&svc->srv_lock, flags);
@@ -566,10 +568,11 @@ ptlrpc_server_handle_reply (struct ptlrpc_service *svc)
                 /* If we see this, we should already have seen the warning
                  * in mds_steal_ack_locks()  */
                 CWARN("All locks stolen from rs %p x"LPD64".t"LPD64
-                      " o%d NID"LPX64"\n",
+                      " o%d NID %s\n",
                       rs, 
                       rs->rs_xid, rs->rs_transno,
-                      rs->rs_msg.opc, exp->exp_connection->c_peer.peer_nid);
+                      rs->rs_msg.opc, 
+                      ptlrpc_peernid2str(&exp->exp_connection->c_peer, str));
         }
 
         if ((!been_handled && rs->rs_on_net) || 
@@ -662,7 +665,8 @@ static void
 ptlrpc_check_rqbd_pools(struct ptlrpc_service *svc)
 {
         struct ptlrpc_srv_ni  *sni;
-        int                    i, avail = 0;
+        int                    i;
+        int                    avail = 0;
         int                    low_water = svc->srv_nbuf_per_group/2;
 
         for (i = 0; i < ptlrpc_ninterfaces; i++) {
@@ -673,6 +677,7 @@ ptlrpc_check_rqbd_pools(struct ptlrpc_service *svc)
                 if (sni->sni_nrqbd_receiving <= low_water)
                         ptlrpc_grow_req_bufs(sni);
         }
+
         lprocfs_counter_add(svc->srv_stats, PTLRPC_REQBUF_AVAIL_CNTR, avail);
 }
 
@@ -897,7 +902,7 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service)
                                            rqbd_list);
 
                         rc = PtlMDUnlink(rqbd->rqbd_md_h);
-                        LASSERT (rc == PTL_OK || rc == PTL_INV_MD);
+                        LASSERT (rc == PTL_OK || rc == PTL_MD_INVALID);
                 }
 
                 /* Wait for the network to release any buffers it's
index eda5779..fb578fa 100644 (file)
@@ -269,6 +269,13 @@ class LustreDB_XML(LustreDB):
             ret.append((net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi))
         return ret
 
+    def get_hostaddr(self):
+        ret = []
+        list = self.dom_node.getElementsByTagName('hostaddr')
+        for node in list:
+            ret.append(node.firstChild.data)
+        return ret
+
     def _update_active(self, tgt, new):
         raise Lustre.LconfError("updates not implemented for XML")
 
index 0a1d5bc..aa7b1aa 100755 (executable)
@@ -250,21 +250,16 @@ class DaemonHandler:
             log(self.pidfile(), e)
             
 class AcceptorHandler(DaemonHandler):
-    def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
+    def __init__(self, port, net_type):
         DaemonHandler.__init__(self, "acceptor")
         self.port = port
         self.flags = ''
-        self.send_mem = send_mem
-        self.recv_mem = recv_mem
-
-        if irq_aff:
-            self.flags = self.flags + ' -i'
 
     def pidfile(self):
         return "/var/run/%s-%d.pid" % (self.command, self.port)
 
     def command_line(self):
-        return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)))
+        return string.join(map(str,(self.flags, self.port)))
     
 acceptors = {}
 
@@ -418,33 +413,51 @@ class LCTLInterface:
   quit """ % (net, nid)
         self.run(cmds)
 
+    # add an interface
+    def add_interface(self, net, ip, netmask = ""):
+        """ add an interface """
+        cmds = """
+  network %s
+  add_interface %s %s
+  quit """ % (net, ip, netmask)
+        self.run(cmds)
+
+    # delete an interface
+    def del_interface(self, net, ip):
+        """ delete an interface """
+        cmds = """
+  network %s
+  del_interface %s
+  quit """ % (net, ip)
+        self.run(cmds)
+
     # create a new connection
     def add_uuid(self, net_type, uuid, nid):
         cmds = "\n  add_uuid %s %s %s" %(uuid, nid, net_type)
         self.run(cmds)
 
-    def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
-                     port, flags):
+    def add_peer(self, net_type, nid, hostaddr, port):
         if net_type  in ('tcp',) and not config.lctl_dump:
             cmds =  """
   network %s
-  send_mem %d
-  recv_mem %d
-  add_autoconn %s %s %d %s
+  add_peer %s %s %d
   quit""" % (net_type,
-             send_mem,
-             recv_mem,
-             nid, hostaddr, port, flags )
+             nid, hostaddr, port )
+            self.run(cmds)
+        elif net_type in ('openib','iib',) and not config.lctl_dump:
+            cmds =  """
+  network %s
+  add_peer %s
+  quit""" % (net_type,
+             nid )
             self.run(cmds)
     
     def connect(self, srv):
         self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
-        if srv.net_type  in ('tcp',) and not config.lctl_dump:
-            flags = 'se'
-            if srv.irq_affinity:
-                flags = flags + 'i'
-            self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
-                 srv.nid, srv.hostaddr, srv.port, flags)
+        if srv.net_type  in ('tcp','openib','iib',) and not config.lctl_dump:
+            if srv.hostaddr[0]:
+                hostaddr = string.split(srv.hostaddr[0], '/')[0]
+            self.add_peer(srv.net_type, srv.nid, hostaddr, srv.port)
 
     # Recover a device
     def recover(self, dev_name, new_conn):
@@ -499,21 +512,31 @@ class LCTLInterface:
         self.run(cmds)
 
 
-    def del_autoconn(self, net_type, nid, hostaddr):
+    def del_peer(self, net_type, nid, hostaddr):
         if net_type  in ('tcp',) and not config.lctl_dump:
                 cmds =  """
   ignore_errors
   network %s
-  del_autoconn %s %s s
+  del_peer %s %s single_share
   quit""" % (net_type,
              nid, hostaddr)
                 self.run(cmds)
+        elif net_type  in ('openib','iib',) and not config.lctl_dump:
+                cmds =  """
+  ignore_errors
+  network %s
+  del_peer %s single_share
+  quit""" % (net_type,
+             nid)
+                self.run(cmds)
         
     # disconnect one connection
     def disconnect(self, srv):
         self.del_uuid(srv.nid_uuid)
-        if srv.net_type  in ('tcp',) and not config.lctl_dump:
-            self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)
+        if srv.net_type  in ('tcp','openib','iib',) and not config.lctl_dump:
+            if srv.hostaddr[0]:
+                hostaddr = string.split(srv.hostaddr[0], '/')[0]
+            self.del_peer(srv.net_type, srv.nid, hostaddr)
 
     def del_uuid(self, uuid):
         cmds =  """
@@ -928,7 +951,7 @@ def sys_get_local_nid(net_type, wildcard, cluster_id):
 def sys_get_local_address(net_type, wildcard, cluster_id):
     """Return the local address for the network type."""
     local = ""
-    if net_type in ('tcp',):
+    if net_type in ('tcp','openib','iib',):
         if  ':' in wildcard:
             iface, star = string.split(wildcard, ':')
             local = if2addr(iface)
@@ -1124,9 +1147,6 @@ class Network(Module):
         self.nid = self.db.get_val('nid', '*')
         self.cluster_id = self.db.get_val('clusterid', "0")
         self.port = self.db.get_val_int('port', 0)
-        self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
-        self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
-        self.irq_affinity = self.db.get_val_int('irqaffinity', 0)
 
         if '*' in self.nid:
             self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
@@ -1139,14 +1159,17 @@ class Network(Module):
 
         self.nid_uuid = self.nid_to_uuid(self.nid)
 
-        self.hostaddr = self.db.get_val('hostaddr', self.nid)
-        if '*' in self.hostaddr:
-            self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
-            if not self.hostaddr:
-                panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
-            debug("hostaddr:", self.hostaddr)
-
-        self.add_portals_module("libcfs", 'portals')
+        self.hostaddr = self.db.get_hostaddr()
+        if len(self.hostaddr) == 0:
+            self.hostaddr.append(self.nid)
+        if '*' in self.hostaddr[0]:
+            self.hostaddr[0] = sys_get_local_address(self.net_type, self.hostaddr[0], self.cluster_id)
+            if not self.hostaddr[0]:
+                panic("unable to set hostaddr for", self.net_type, self.hostaddr[0], self.cluster_id)
+            debug("hostaddr:", self.hostaddr[0])
+
+        self.add_portals_module("libcfs", 'libcfs')
+        self.add_portals_module("portals", 'portals')
         if node_needs_router():
             self.add_portals_module("router", 'kptlrouter')
         if self.net_type == 'tcp':
@@ -1155,6 +1178,10 @@ class Network(Module):
             self.add_portals_module("knals/qswnal", 'kqswnal')
         if self.net_type == 'gm':
             self.add_portals_module("knals/gmnal", 'kgmnal')
+        if self.net_type == 'openib':
+            self.add_portals_module("knals/openibnal", 'kopenibnal')
+        if self.net_type == 'iib':
+            self.add_portals_module("knals/iibnal", 'kiibnal')
 
     def nid_to_uuid(self, nid):
         return "NID_%s_UUID" %(nid,)
@@ -1167,6 +1194,13 @@ class Network(Module):
             lctl.network(self.net_type, self.nid)
         if self.net_type == 'tcp':
             sys_tweak_socknal()
+            for hostaddr in self.db.get_hostaddr():
+                ip = string.split(hostaddr, '/')[0]
+                if len(string.split(hostaddr, '/')) == 2:
+                    netmask = string.split(hostaddr, '/')[1]
+                else:
+                    netmask = ""
+                lctl.add_interface(self.net_type, ip, netmask)
         if self.net_type == 'elan':
             sys_optimize_elan()
         if self.port and  node_is_router():
@@ -1209,6 +1243,10 @@ class Network(Module):
             stop_acceptor(self.port)
         if  node_is_router():
             self.disconnect_peer_gateways()
+        if self.net_type == 'tcp':
+            for hostaddr in self.db.get_hostaddr():
+                ip = string.split(hostaddr, '/')[0]
+                lctl.del_interface(self.net_type, ip)
 
 class RouteTable(Module):
     def __init__(self,db):
@@ -1216,9 +1254,9 @@ class RouteTable(Module):
 
     def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
                          lo, hi):
-        # only setup connections for tcp NALs
+        # only setup connections for tcp, openib, and iib NALs
         srvdb = None
-        if not net_type in ('tcp',):
+        if not net_type in ('tcp','openib','iib',):
             return None
 
         # connect to target if route is to single node and this node is the gw
@@ -2104,9 +2142,7 @@ def find_local_clusters(node_db):
         if srv.port > 0:
             if acceptors.has_key(srv.port):
                 panic("duplicate port:", srv.port)
-            acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
-                                                  srv.send_mem, srv.recv_mem,
-                                                  srv.irq_affinity)
+            acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type)
 
 # This node is a gateway.
 is_router = 0
index 6603aca..fad8fb6 100644 (file)
@@ -27,7 +27,6 @@
 
 #include <stdlib.h>
 #include <stdio.h>
-#include <portals/api-support.h>
 #include <portals/ptlctl.h>
 #include "obdctl.h"
 #include "parser.h"
@@ -68,14 +67,20 @@ command_t cmdlist[] = {
          "usage: --net <tcp/elan/myrinet> <command>"},
         {"network", jt_ptl_network, 0, "commands that follow apply to net\n"
          "usage: network <tcp/elan/myrinet>"},
-        {"autoconn_list", jt_ptl_print_autoconnects, 0, "print autoconnect entries\n"
-         "usage: print_autoconns"},
-        {"add_autoconn", jt_ptl_add_autoconnect, 0, "add an autoconnect entry\n"
-         "usage: add_autoconn <nid> <host> <port> [ise]"},
-        {"del_autoconn",jt_ptl_del_autoconnect,0,"remove an autoconnect entry\n"
+        {"interface_list", jt_ptl_print_interfaces,0,"print interface entries\n"
+         "usage: interface_list"},
+        {"add_interface", jt_ptl_add_interface, 0, "add interface entry\n"
+         "usage: add_interface ip [netmask]"},
+        {"del_interface", jt_ptl_del_interface, 0, "del interface entry\n"
+         "usage: del_interface [ip]"},
+        {"peer_list", jt_ptl_print_peers, 0, "print peer entries\n"
+         "usage: peer_list"},
+        {"add_peer", jt_ptl_add_peer, 0, "add an peer entry\n"
+         "usage: add_peer <nid> <host> <port>"},
+        {"del_peer", jt_ptl_del_peer, 0, "remove an peer entry\n"
          "usage: del_autoconn [<nid>] [<host>] [ks]"},
-        {"conn_list", jt_ptl_print_connections, 0, "connect to a remote nid\n"
-         "usage: print_conns"},
+        {"conn_list", jt_ptl_print_connections, 0, "print all the connected remote nid\n"
+         "usage: conn_list"},
         {"connect", jt_ptl_connect, 0, "connect to a remote nid\n"
          "usage: connect <host> <port> [iIOC]"},
         {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid\n"
@@ -91,7 +96,7 @@ command_t cmdlist[] = {
         {"add_uuid", jt_lcfg_add_uuid, 0, "associate a UUID with a nid\n"
          "usage: add_uuid <uuid> <nid> <net_type>"},
         {"close_uuid", jt_obd_close_uuid, 0, "disconnect a UUID\n"
-         "usage: close_uuid <uuid> <net-type>)"},
+         "usage: close_uuid <uuid> <net_type>"},
         {"del_uuid", jt_lcfg_del_uuid, 0, "delete a UUID association\n"
          "usage: del_uuid <uuid>"},
         {"add_route", jt_ptl_add_route, 0,
@@ -109,15 +114,6 @@ command_t cmdlist[] = {
         {"show_route", jt_ptl_print_routes, 0,
          "print the portals routing table, same as route_list\n"
          "usage: show_route"},
-        {"recv_mem", jt_ptl_rxmem, 0, "set socket receive buffer size, "
-         "if size is omited the current size is reported.\n"
-         "usage: recv_mem [size]"},
-        {"send_mem", jt_ptl_txmem, 0, "set socket send buffer size, "
-         "if size is omited the current size is reported.\n"
-         "usage: send_mem [size]"},
-        {"nagle", jt_ptl_nagle, 0, "enable/disable nagle, omitting the "
-         "argument will cause the current nagle setting to be reported.\n"
-         "usage: nagle [on/off]"},
         {"fail", jt_ptl_fail_nid, 0, "fail/restore communications.\n"
          "Omitting the count means indefinitely, 0 means restore, "
          "otherwise fail 'count' messages.\n"
@@ -286,7 +282,7 @@ command_t cmdlist[] = {
          "usage: mark <text>"},
         {"filter", jt_dbg_filter, 0, "filter message type\n"
          "usage: filter <subsystem id/debug mask>"},
-        {"show", jt_dbg_show, 0, "show type of messages\n"
+        {"show", jt_dbg_show, 0, "Show specific type of messages\n"
          "usage: show <subsystem id/debug mask>"},
         {"debug_list", jt_dbg_list, 0, "list subsystem and debug types\n"
          "usage: debug_list <subs/types>"},
index 53985a7..33d6839 100755 (executable)
@@ -74,8 +74,8 @@ Object creation command summary:
   --node node_name
   --nid nid
   --cluster_id 
-  --nettype tcp|elan|gm
-  --hostaddr addr
+  --nettype tcp|elan|gm|openib|iib
+  --hostaddr ip[/netmask]
   --port port
   --tcpbuf size
   --irq_affinity 0|1
@@ -108,7 +108,7 @@ Object creation command summary:
   --fstype extN|ext3
   --journal_size size
   --inode_size size
-  --obdtype obdecho|obdfilter
+  --osdtype obdecho|obdfilter
   --ostuuid uuid
  
 --add mtpt  - Mountpoint
@@ -119,8 +119,10 @@ Object creation command summary:
 
 --add route
   --node nodename
+  --router
   --gw nid
-  --tgt nid
+  --gateway_cluster_id nid
+  --target_cluster_id nid
   --lo nid
   --hi nid
 
@@ -133,6 +135,7 @@ Object creation command summary:
 """
 
 PARAM = Lustre.Options.PARAM
+PARAMLIST = Lustre.Options.PARAMLIST
 lmc_options = [
     # lmc input/output options
     ('reference', "Print short reference for commands."), 
@@ -155,12 +158,10 @@ lmc_options = [
     ('subsystem', "Specify which Lustre subsystems have debug output recorded in the log",  PARAM),
 
     # network 
-    ('nettype', "Specify the network type. This can be tcp/elan/gm.", PARAM),
+    ('nettype', "Specify the network type. This can be tcp/elan/gm/openib/iib.", PARAM),
     ('nid', "Give the network ID, e.g ElanID/IP Address as used by portals.", PARAM),
-    ('tcpbuf', "Optional argument to specify the TCP buffer size.", PARAM, "0"),
     ('port', "Optional argument to specify the TCP port number.", PARAM, DEFAULT_PORT),
-    ('irq_affinity', "Optional argument.", PARAM, 0),
-    ('hostaddr', "", PARAM,""),
+    ('hostaddr', "Optional argument to specify the host address.", PARAMLIST),
     ('cluster_id', "Specify the cluster ID", PARAM, "0"),
 
     # routes
@@ -317,21 +318,16 @@ class GenConfig:
         return new
 
     def network(self, name, uuid, nid, cluster_id, net, hostaddr="",
-                port=0, tcpbuf=0, irq_aff=0):
+                port=0):
         """create <network> node"""
         network = self.newService("network", name, uuid)
         network.setAttribute("nettype", net);
         self.addElement(network, "nid", nid)
         self.addElement(network, "clusterid", cluster_id)
-        if hostaddr:
-            self.addElement(network, "hostaddr", hostaddr)
+        for host in  hostaddr:
+            self.addElement(network, "hostaddr", host)
         if port:
             self.addElement(network, "port", "%d" %(port))
-        if tcpbuf:
-            self.addElement(network, "sendmem", "%d" %(tcpbuf))
-            self.addElement(network, "recvmem", "%d" %(tcpbuf))
-        if irq_aff:
-            self.addElement(network, "irqaffinity", "%d" %(irq_aff))
             
         return network
 
@@ -637,12 +633,8 @@ def add_net(gen, lustre, options):
 
     if net_type in ('tcp',):
         port = get_option_int(options, 'port')
-        tcpbuf = get_option_int(options, 'tcpbuf')
-        irq_aff = get_option_int(options, 'irq_affinity')
-    elif net_type in ('elan', 'gm'):
+    elif net_type in ('elan', 'gm', 'openib','iib'):
         port = 0
-        tcpbuf = 0
-        irq_aff = 0
     else:
         print "Unknown net_type: ", net_type
         sys.exit(2)
@@ -657,7 +649,7 @@ def add_net(gen, lustre, options):
     net_name = new_name('NET_'+ node_name +'_'+ net_type)
     net_uuid = new_uuid(net_name)
     node.appendChild(gen.network(net_name, net_uuid, nid, cluster_id, net_type,
-                                 hostaddr, port, tcpbuf, irq_aff))
+                                 hostaddr, port))
     node_add_profile(gen, node, "network", net_uuid)
 
 
index 8fb5d2e..49f4bbe 100644 (file)
@@ -36,7 +36,7 @@ do {                                                            \
 
 #define CHECK_MEMBER_OFFSET(s,m)                                \
 do {                                                            \
-        CHECK_VALUE(offsetof(struct s, m));                     \
+        CHECK_VALUE((int)offsetof(struct s, m));                \
 } while(0)
 
 #define CHECK_MEMBER_SIZEOF(s,m)                                \