Whamcloud - gitweb
* landed unified portals (b_hd_cleanup_merge_singleportals) on HEAD
author eeb <eeb>
Fri, 29 Oct 2004 15:06:22 +0000 (15:06 +0000)
committer eeb <eeb>
Fri, 29 Oct 2004 15:06:22 +0000 (15:06 +0000)
95 files changed:
lnet/archdep.m4
lnet/build.m4
lnet/include/linux/.cvsignore [new file with mode: 0644]
lnet/include/linux/kp30.h
lnet/include/linux/libcfs.h
lnet/include/linux/portals_compat25.h
lnet/include/lnet/.cvsignore [new file with mode: 0644]
lnet/include/lnet/build_check.h
lnet/include/lnet/lnetctl.h
lnet/include/lnet/ptlctl.h
lnet/klnds/Makefile.in
lnet/klnds/autoMakefile.am
lnet/klnds/iiblnd/.cvsignore [new file with mode: 0644]
lnet/klnds/iiblnd/Makefile.in [new file with mode: 0644]
lnet/klnds/iiblnd/Makefile.mk [new file with mode: 0644]
lnet/klnds/iiblnd/autoMakefile.am [new file with mode: 0644]
lnet/klnds/iiblnd/iiblnd.c [new file with mode: 0644]
lnet/klnds/iiblnd/iiblnd.h [new file with mode: 0644]
lnet/klnds/iiblnd/iiblnd_cb.c [new file with mode: 0644]
lnet/klnds/openiblnd/.cvsignore [new file with mode: 0644]
lnet/klnds/openiblnd/openiblnd.c
lnet/klnds/openiblnd/openiblnd.h
lnet/klnds/openiblnd/openiblnd_cb.c
lnet/klnds/qswlnd/qswlnd.c
lnet/klnds/qswlnd/qswlnd.h
lnet/klnds/scimaclnd/scimacnal.c
lnet/klnds/socklnd/socklnd.c
lnet/klnds/socklnd/socklnd.h
lnet/klnds/socklnd/socklnd_cb.c
lnet/libcfs/debug.c
lnet/libcfs/module.c
lnet/libcfs/tracefile.c
lnet/lnet/lib-move.c
lnet/lnet/module.c
lnet/router/proc.c
lnet/ulnds/connection.c
lnet/ulnds/dispatch.h
lnet/ulnds/procapi.c
lnet/ulnds/select.c
lnet/ulnds/socklnd/connection.c
lnet/ulnds/socklnd/dispatch.h
lnet/ulnds/socklnd/procapi.c
lnet/ulnds/socklnd/select.c
lnet/ulnds/socklnd/tcplnd.c
lnet/ulnds/tcplnd.c
lnet/utils/acceptor.c
lnet/utils/debug.c
lnet/utils/portals.c
lustre/configure.in
lustre/kernel_patches/patches/kksymoops-2.4.24.vanilla.patch
lustre/portals/archdep.m4
lustre/portals/build.m4
lustre/portals/include/linux/.cvsignore [new file with mode: 0644]
lustre/portals/include/linux/kp30.h
lustre/portals/include/linux/libcfs.h
lustre/portals/include/linux/portals_compat25.h
lustre/portals/include/portals/.cvsignore [new file with mode: 0644]
lustre/portals/include/portals/build_check.h
lustre/portals/include/portals/ptlctl.h
lustre/portals/knals/Makefile.in
lustre/portals/knals/autoMakefile.am
lustre/portals/knals/iibnal/.cvsignore [new file with mode: 0644]
lustre/portals/knals/iibnal/Makefile.in [new file with mode: 0644]
lustre/portals/knals/iibnal/Makefile.mk [new file with mode: 0644]
lustre/portals/knals/iibnal/autoMakefile.am [new file with mode: 0644]
lustre/portals/knals/iibnal/iibnal.c [new file with mode: 0644]
lustre/portals/knals/iibnal/iibnal.h [new file with mode: 0644]
lustre/portals/knals/iibnal/iibnal_cb.c [new file with mode: 0644]
lustre/portals/knals/openibnal/.cvsignore [new file with mode: 0644]
lustre/portals/knals/openibnal/openibnal.c
lustre/portals/knals/openibnal/openibnal.h
lustre/portals/knals/openibnal/openibnal_cb.c
lustre/portals/knals/qswnal/qswnal.c
lustre/portals/knals/qswnal/qswnal.h
lustre/portals/knals/scimacnal/scimacnal.c
lustre/portals/knals/socknal/socknal.c
lustre/portals/knals/socknal/socknal.h
lustre/portals/knals/socknal/socknal_cb.c
lustre/portals/libcfs/debug.c
lustre/portals/libcfs/module.c
lustre/portals/libcfs/tracefile.c
lustre/portals/portals/lib-move.c
lustre/portals/portals/module.c
lustre/portals/router/proc.c
lustre/portals/unals/connection.c
lustre/portals/unals/dispatch.h
lustre/portals/unals/procapi.c
lustre/portals/unals/select.c
lustre/portals/unals/tcpnal.c
lustre/portals/utils/acceptor.c
lustre/portals/utils/debug.c
lustre/portals/utils/portals.c
lustre/utils/lconf
lustre/utils/lmc
lustre/utils/obd.c

index d2bd1a1..021fa68 100644 (file)
@@ -14,26 +14,107 @@ AC_MSG_RESULT([$enable_inkernel])
 AM_CONDITIONAL(INKERNEL, test x$enable_inkernel = xyes)
 
 # -------- are we building against an external portals? -------
-AC_MSG_CHECKING([if Cray portals should be used])
+AC_MSG_CHECKING([for Cray portals])
 AC_ARG_WITH([cray-portals],
        AC_HELP_STRING([--with-cray-portals=path],
                       [path to cray portals]),
        [
                if test "$with_cray_portals" != no; then
-                       if test -r $with_cray_portals/include/portals/api.h ; then
-                               CRAY_PORTALS_PATH=$with_cray_portals
-                               CRAY_PORTALS_INCLUDE="-I$with_cray_portals/include"
-                               AC_DEFINE(CRAY_PORTALS, 1, [Building with Cray Portals])
-                       else
-                               AC_MSG_ERROR([--with-cray-portals specified badly])
-                       fi
-               fi
+                       CRAY_PORTALS_PATH=$with_cray_portals
+                       CRAY_PORTALS_INCLUDES="$with_cray_portals/include"
+                       CRAY_PORTALS_LIBS="$with_cray_portals"
+                fi
        ],[with_cray_portals=no])
 AC_SUBST(CRAY_PORTALS_PATH)
-AC_MSG_RESULT([$with_cray_portals])
+AC_MSG_RESULT([$CRAY_PORTALS_PATH])
+
+AC_MSG_CHECKING([for Cray portals includes])
+AC_ARG_WITH([cray-portals-includes],
+       AC_HELP_STRING([--with-cray-portals-includes=path],
+                      [path to cray portals includes]),
+       [
+               if test "$with_cray_portals_includes" != no; then
+                       CRAY_PORTALS_INCLUDES="$with_cray_portals_includes"
+                fi
+       ])
+AC_SUBST(CRAY_PORTALS_INCLUDES)
+AC_MSG_RESULT([$CRAY_PORTALS_INCLUDES])
+
+AC_MSG_CHECKING([for Cray portals libs])
+AC_ARG_WITH([cray-portals-libs],
+       AC_HELP_STRING([--with-cray-portals-libs=path],
+                      [path to cray portals libs]),
+       [
+               if test "$with_cray_portals_libs" != no; then
+                       CRAY_PORTALS_LIBS="$with_cray_portals_libs"
+                fi
+       ])
+AC_SUBST(CRAY_PORTALS_LIBS)
+AC_MSG_RESULT([$CRAY_PORTALS_LIBS])
+
+if test x$CRAY_PORTALS_INCLUDES != x ; then
+       if test ! -r $CRAY_PORTALS_INCLUDES/portals/api.h ; then
+               AC_MSG_ERROR([Cray portals headers were not found in $CRAY_PORTALS_INCLUDES.  Please check the paths passed to --with-cray-portals or --with-cray-portals-includes.])
+       fi
+fi
+if test x$CRAY_PORTALS_LIBS != x ; then
+       if test ! -r $CRAY_PORTALS_LIBS/libportals.a ; then
+               AC_MSG_ERROR([Cray portals libraries were not found in $CRAY_PORTALS_LIBS.  Please check the paths passed to --with-cray-portals or --with-cray-portals-libs.])
+       fi
+fi
 
+AC_MSG_CHECKING([whether to use Cray portals])
+if test x$CRAY_PORTALS_INCLUDES != x -a x$CRAY_PORTALS_LIBS != x ; then
+       with_cray_portals=yes
+       AC_DEFINE(CRAY_PORTALS, 1, [Building with Cray Portals])
+       CRAY_PORTALS_INCLUDES="-I$CRAY_PORTALS_INCLUDES"
+else
+       with_cray_portals=no
+fi
+AC_MSG_RESULT([$with_cray_portals])
 AM_CONDITIONAL(CRAY_PORTALS, test x$with_cray_portals != xno)
 
+# ----------------------------------------
+# some tests for catamount-like systems
+# ----------------------------------------
+AC_ARG_ENABLE([sysio_init],
+       AC_HELP_STRING([--disable-sysio-init],
+               [do not call sysio init functions when initializing liblustre]),
+       [],[enable_sysio_init=yes])
+AC_MSG_CHECKING([whether to initialize libsysio])
+AC_MSG_RESULT([$enable_sysio_init])
+if test x$enable_sysio_init != xno ; then
+       AC_DEFINE([INIT_SYSIO], 1, [call sysio init functions])
+fi
+
+AC_ARG_ENABLE([urandom],
+       AC_HELP_STRING([--disable-urandom],
+               [disable use of /dev/urandom for liblustre]),
+       [],[enable_urandom=yes])
+AC_MSG_CHECKING([whether to use /dev/urandom for liblustre])
+AC_MSG_RESULT([$enable_urandom])
+if test x$enable_urandom != xno ; then
+       AC_DEFINE([LIBLUSTRE_USE_URANDOM], 1, [use /dev/urandom for random data])
+fi
+
+# -------- check for -lcap and -lpthread ----
+if test x$enable_liblustre = xyes ; then
+       AC_CHECK_LIB([cap], [cap_get_proc],
+               [
+                       CAP_LIBS="-lcap"
+                       AC_DEFINE([HAVE_LIBCAP], 1, [use libcap])
+               ],
+               [CAP_LIBS=""])
+       AC_SUBST(CAP_LIBS)
+       AC_CHECK_LIB([pthread], [pthread_create],
+               [
+                       PTHREAD_LIBS="-lpthread"
+                       AC_DEFINE([HAVE_LIBPTHREAD], 1, [use libpthread])
+               ],
+               [PTHREAD_LIBS=""])
+       AC_SUBST(PTHREAD_LIBS)
+fi
+
 # -------- enable tests and utils? -------
 if test x$enable_tests = xno ; then
        AC_MSG_NOTICE([disabling tests])
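
For reference, AC_CHECK_LIB([cap], [cap_get_proc]) decides HAVE_LIBCAP by
linking a trivial test program against -lcap; a minimal sketch of the kind
of conftest autoconf generates (illustrative, not the literal conftest):

        /* built roughly as: cc conftest.c -lcap */
        char cap_get_proc ();   /* dummy declaration; only the link matters */

        int main ()
        {
                return cap_get_proc ();
        }

The pthread check has the same shape, substituting pthread_create and
-lpthread, and drives HAVE_LIBPTHREAD.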
@@ -128,7 +209,7 @@ AM_CONDITIONAL(USE_QUILT, test x$QUILT != xno)
 
 # -------  Makeflags ------------------
 
-CPPFLAGS="$CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include"
+CPPFLAGS="$CPPFLAGS $CRAY_PORTALS_INCLUDES -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include"
 
 # liblustre are all the same
 LLCPPFLAGS="-D__arch_lib__ -D_LARGEFILE64_SOURCE=1"
@@ -146,7 +227,7 @@ if test x$enable_ldiskfs = xyes ; then
        AC_DEFINE(CONFIG_LDISKFS_FS_SECURITY, 1, [enable fs security])
 fi
 
-EXTRA_KCFLAGS="-g $CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I$PWD/portals/include -I$PWD/include"
+EXTRA_KCFLAGS="-g $CRAY_PORTALS_INCLUDES -I$PWD/portals/include -I$PWD/include"
 
 # these are like AC_TRY_COMPILE, but try to build modules against the
 # kernel, inside the kernel-tests directory
@@ -408,6 +489,35 @@ if test x$enable_modules != xno ; then
        AC_SUBST(OPENIBCPPFLAGS)
        AC_SUBST(OPENIBNAL)
 
+       #### Infinicon IB
+       AC_MSG_CHECKING([if Infinicon IB kernel headers are present])
+       # for now the only infinicon ib build has headers in /usr/include/iba
+       IIBCPPFLAGS="-I/usr/include -DIN_TREE_BUILD"
+       EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
+       EXTRA_KCFLAGS="$EXTRA_KCFLAGS $IIBCPPFLAGS"
+       LUSTRE_MODULE_TRY_COMPILE(
+               [
+                       #include <linux/iba/ibt.h>
+               ],[
+                       IBT_INTERFACE_UNION interfaces;
+                       FSTATUS             rc;
+
+                       rc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2,
+                                                     &interfaces);
+
+                       return rc == FSUCCESS ? 0 : 1;
+               ],[
+                       AC_MSG_RESULT([yes])
+                       IIBNAL="iibnal"
+               ],[
+                       AC_MSG_RESULT([no])
+                       IIBNAL=""
+                       IIBCPPFLAGS=""
+               ])
+       EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
+       AC_SUBST(IIBCPPFLAGS)
+       AC_SUBST(IIBNAL)
+
        # ---------- Red Hat 2.4.18 has iobuf->dovary --------------
        # But other kernels don't
 
@@ -667,15 +777,34 @@ fi
 AM_CONDITIONAL(BUILD_QSWNAL, test x$QSWNAL = "xqswnal")
 AM_CONDITIONAL(BUILD_GMNAL, test x$GMNAL = "xgmnal")
 AM_CONDITIONAL(BUILD_OPENIBNAL, test x$OPENIBNAL = "xopenibnal")
+AM_CONDITIONAL(BUILD_IIBNAL, test x$IIBNAL = "xiibnal")
+
+# portals/utils/portals.c
+AC_CHECK_HEADERS([netdb.h netinet/tcp.h asm/types.h])
+AC_CHECK_FUNCS([gethostbyname socket connect])
+
+# portals/utils/debug.c
+AC_CHECK_HEADERS([linux/version.h])
+
+# include/liblustre.h
+AC_CHECK_HEADERS([asm/page.h sys/user.h stdint.h])
+
+# liblustre/llite_lib.h
+AC_CHECK_HEADERS([xtio.h file.h])
+
+# liblustre/dir.c
+AC_CHECK_HEADERS([linux/types.h sys/types.h linux/unistd.h unistd.h])
+
+# liblustre/lutil.c
+AC_CHECK_HEADERS([netinet/in.h arpa/inet.h catamount/data.h])
+AC_CHECK_FUNCS([inet_ntoa])
 
 CPPFLAGS="-include \$(top_builddir)/include/config.h $CPPFLAGS"
 EXTRA_KCFLAGS="-include $PWD/include/config.h $EXTRA_KCFLAGS"
 AC_SUBST(EXTRA_KCFLAGS)
 
-#echo "KCPPFLAGS: $KCPPFLAGS"
-#echo "KCFLAGS: $KCFLAGS"
-#echo "LLCPPFLAGS: $LLCPPFLAGS"
-#echo "LLCFLAGS: $LLCFLAGS"
-#echo "MOD_LINK: $MOD_LINK"
-#echo "CFLAGS: $CFLAGS"
-#echo "CPPFLAGS: $CPPFLAGS"
+echo "CPPFLAGS: $CPPFLAGS"
+echo "LLCPPFLAGS: $LLCPPFLAGS"
+echo "CFLAGS: $CFLAGS"
+echo "EXTRA_KCFLAGS: $EXTRA_KCFLAGS"
+echo "LLCFLAGS: $LLCFLAGS"
index 861bb4a..f158396 100644 (file)
@@ -61,6 +61,13 @@ case "$CC_VERSION" in
        "gcc version 2.96 20000731 (Mandrake Linux 8.1 2.96-0.62mdk)")
                bad_cc
                ;;
+       # unpatched 'gcc' on rh9.  miscompiles a
+       #        struct = (type) { .member = value, };
+       # assignment in the iibnal where the struct is a mix
+       # of u64 and u32 bit-fields.
+       "gcc version 3.2.2 20030222 (Red Hat Linux 3.2.2-5)")
+               bad_cc
+               ;;
        *)
                AC_MSG_RESULT([no known problems])
                ;;
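
The pattern that trips this compiler is the compound-literal structure
assignment flagged in the comment above; an illustrative reduction with
invented names, assuming nothing beyond what the comment states:

        struct attrs {
                unsigned long long guid;        /* 64-bit member */
                unsigned int       pkey:16;     /* 32-bit bit-fields */
                unsigned int       rdma:1;
        };

        void init (struct attrs *a)
        {
                /* rh9's unpatched gcc 3.2.2 reportedly miscompiles this
                 * form when u64 members and u32 bit-fields are mixed */
                *a = (struct attrs) { .pkey = 1, .rdma = 1 };
        }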
@@ -116,3 +123,5 @@ else
        LIBWRAP=""
 fi
 AC_SUBST(LIBWRAP)
+
+AC_SUBST(LIBS)
diff --git a/lnet/include/linux/.cvsignore b/lnet/include/linux/.cvsignore
new file mode 100644 (file)
index 0000000..282522d
--- /dev/null
@@ -0,0 +1,2 @@
+Makefile
+Makefile.in
index db63a08..4e24c71 100644 (file)
@@ -294,7 +294,6 @@ extern void kportal_blockallsigs (void);
 # include <unistd.h>
 # include <time.h>
 # include <limits.h>
-# include <asm/types.h>
 # ifndef DEBUG_SUBSYSTEM
 #  define DEBUG_SUBSYSTEM S_UNDEFINED
 # endif
@@ -320,6 +319,11 @@ void portals_debug_dumplog(void);
     printf("%02x:%06x (@%lu %s:%s,l. %d %d %lu): " format,                    \
            (subsys), (mask), (long)time(0), file, fn, line,                   \
            getpid() , stack, ## a);
+
+#undef CWARN
+#undef CERROR
+#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a)
+#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a)
 #endif
 
 /* support decl needed both by kernel and liblustre */
@@ -338,6 +342,16 @@ char *portals_id2str(int nal, ptl_process_id_t nid, char *str);
 #define LWT_MEMORY   (16<<20)
 
 #if !KLWT_SUPPORT
+# if defined(__KERNEL__)
+#  if !defined(BITS_PER_LONG)
+#   error "BITS_PER_LONG not defined"
+#  endif
+# elif !defined(__WORDSIZE)
+#  error "__WORDSIZE not defined"
+# else
+#  define BITS_PER_LONG __WORDSIZE
+# endif
+
 /* kernel hasn't defined this? */
 typedef struct {
         long long   lwte_when;
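
In userland, BITS_PER_LONG now falls back to glibc's __WORDSIZE; a tiny
compile-time sanity check of that equivalence (a sketch, not part of the
commit):

        #include <stdint.h>     /* pulls in <bits/wordsize.h> on glibc */
        #include <limits.h>     /* CHAR_BIT */

        /* negative array size fails the build if the fallback is wrong */
        typedef char bits_per_long_check[
                (__WORDSIZE == sizeof (long) * CHAR_BIT) ? 1 : -1];

        int main (void) { return 0; }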
@@ -572,49 +586,42 @@ static inline int portal_ioctl_getdata(char *buf, char *end, void *arg)
         data = (struct portal_ioctl_data *)buf;
 
         err = copy_from_user(buf, (void *)arg, sizeof(*hdr));
-        if ( err ) {
-                EXIT;
-                return err;
-        }
+        if (err)
+                RETURN(err);
 
         if (hdr->ioc_version != PORTAL_IOCTL_VERSION) {
-                CERROR ("PORTALS: version mismatch kernel vs application\n");
-                return -EINVAL;
+                CERROR("PORTALS: version mismatch kernel vs application\n");
+                RETURN(-EINVAL);
         }
 
         if (hdr->ioc_len + buf >= end) {
-                CERROR ("PORTALS: user buffer exceeds kernel buffer\n");
-                return -EINVAL;
+                CERROR("PORTALS: user buffer exceeds kernel buffer\n");
+                RETURN(-EINVAL);
         }
 
 
         if (hdr->ioc_len < sizeof(struct portal_ioctl_data)) {
-                CERROR ("PORTALS: user buffer too small for ioctl\n");
-                return -EINVAL;
+                CERROR("PORTALS: user buffer too small for ioctl\n");
+                RETURN(-EINVAL);
         }
 
         err = copy_from_user(buf, (void *)arg, hdr->ioc_len);
-        if ( err ) {
-                EXIT;
-                return err;
-        }
+        if (err)
+                RETURN(err);
 
         if (portal_ioctl_is_invalid(data)) {
-                CERROR ("PORTALS: ioctl not correctly formatted\n");
-                return -EINVAL;
+                CERROR("PORTALS: ioctl not correctly formatted\n");
+                RETURN(-EINVAL);
         }
 
-        if (data->ioc_inllen1) {
+        if (data->ioc_inllen1)
                 data->ioc_inlbuf1 = &data->ioc_bulk[0];
-        }
 
-        if (data->ioc_inllen2) {
+        if (data->ioc_inllen2)
                 data->ioc_inlbuf2 = &data->ioc_bulk[0] +
                         size_round(data->ioc_inllen1);
-        }
 
-        EXIT;
-        return 0;
+        RETURN(0);
 }
 #endif
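
RETURN() replaces the old two-statement EXIT-then-return pairs throughout
this function. A toy rendering of the convention, assuming only that an
EXIT-style tracing macro exists (the portals version also logs the return
value):

        #include <stdio.h>

        #define EXIT        printf ("leaving %s\n", __FUNCTION__)
        #define RETURN(rc)  do { EXIT; return (rc); } while (0)

        int demo (int err)
        {
                if (err)
                        RETURN (err);   /* was: EXIT; return err; */
                RETURN (0);
        }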
 
@@ -645,10 +652,11 @@ enum {
         TCPNAL    = 5,
         ROUTER    = 6,
         OPENIBNAL = 7,
+        IIBNAL    = 8,
         NAL_ENUM_END_MARKER
 };
 
-#define PTL_NALFMT_SIZE              30 /* %u:%u.%u.%u.%u,%u (10+4+4+4+3+4+1) */
+#define PTL_NALFMT_SIZE             32 /* %u:%u.%u.%u.%u,%u (10+4+4+4+3+5+1) */
 
 #define NAL_MAX_NR (NAL_ENUM_END_MARKER - 1)
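
Appending IIBNAL just ahead of NAL_ENUM_END_MARKER keeps NAL_MAX_NR correct
automatically, since the marker always trails the last real NAL. The idiom,
with invented names:

        enum nal_example { NAL_A = 1, NAL_B, NAL_EXAMPLE_END_MARKER };
        #define NAL_EXAMPLE_MAX (NAL_EXAMPLE_END_MARKER - 1)    /* == NAL_B */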
 
index d1a5c44..8317f14 100644 (file)
@@ -4,7 +4,11 @@
 #ifndef _LIBCFS_H
 #define _LIBCFS_H
 
+#ifdef HAVE_ASM_TYPES_H
 #include <asm/types.h>
+#else
+#include "types.h"
+#endif
 
 #ifdef __KERNEL__
 # include <linux/time.h>
@@ -62,7 +66,6 @@ extern unsigned int portal_stack;
 extern unsigned int portal_debug;
 extern unsigned int portal_printk;
 
-#include <asm/types.h>
 struct ptldebug_header {
         __u32 ph_len;
         __u32 ph_flags;
@@ -102,7 +105,7 @@ struct ptldebug_header {
 #define S_GMNAL       0x00080000
 #define S_PTLROUTER   0x00100000
 #define S_COBD        0x00200000
-#define S_OPENIBNAL   0x00400000
+#define S_IBNAL       0x00400000 /* All IB NALs */
 #define S_SM          0x00800000
 #define S_ASOBD       0x01000000
 #define S_LMV         0x02000000
@@ -185,8 +188,40 @@ do {                                                                          \
                                   CDEBUG_STACK, format, ## a);                \
 } while (0)
 
-#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a)
-#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a)
+#define CDEBUG_MAX_LIMIT 600
+#define CDEBUG_LIMIT(cdebug_mask, cdebug_format, a...)                        \
+do {                                                                          \
+        static unsigned long cdebug_next;                                     \
+        static int cdebug_count, cdebug_delay = 1;                            \
+                                                                              \
+        CHECK_STACK(CDEBUG_STACK);                                            \
+        if (time_after(jiffies, cdebug_next)) {                               \
+                portals_debug_msg(DEBUG_SUBSYSTEM, cdebug_mask, __FILE__,     \
+                                  __FUNCTION__, __LINE__, CDEBUG_STACK,       \
+                                  cdebug_format, ## a);                       \
+                if (cdebug_count) {                                           \
+                        portals_debug_msg(DEBUG_SUBSYSTEM, cdebug_mask,       \
+                                          __FILE__, __FUNCTION__, __LINE__,   \
+                                          CDEBUG_STACK, cdebug_format, ## a); \
+                        cdebug_count = 0;                                     \
+                }                                                             \
+                if (time_after(jiffies, cdebug_next+(CDEBUG_MAX_LIMIT+10)*HZ))\
+                        cdebug_delay = cdebug_delay > 8 ? cdebug_delay/8 : 1; \
+                else                                                          \
+                        cdebug_delay = cdebug_delay*2 >= CDEBUG_MAX_LIMIT*HZ ?\
+                                        CDEBUG_MAX_LIMIT*HZ : cdebug_delay*2; \
+                cdebug_next = jiffies + cdebug_delay;                         \
+        } else {                                                              \
+                portals_debug_msg(DEBUG_SUBSYSTEM,                            \
+                                  portal_debug & ~(D_EMERG|D_ERROR|D_WARNING),\
+                                  __FILE__, __FUNCTION__, __LINE__,           \
+                                  CDEBUG_STACK, cdebug_format, ## a);         \
+                cdebug_count++;                                               \
+        }                                                                     \
+} while (0)
+
+#define CWARN(format, a...) CDEBUG_LIMIT(D_WARNING, format, ## a)
+#define CERROR(format, a...) CDEBUG_LIMIT(D_ERROR, format, ## a)
 #define CEMERG(format, a...) CDEBUG(D_EMERG, format, ## a)
 
 #define GOTO(label, rc)                                                 \
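
CDEBUG_LIMIT gives each callsite an adaptive backoff: while messages keep
arriving, the gap between console prints doubles up to CDEBUG_MAX_LIMIT
seconds; after a quiet spell longer than the cap plus ten seconds the delay
shrinks by a factor of eight. Suppressed hits are still logged at a
non-console mask and counted. A standalone userspace sketch of the same
backoff, in seconds rather than jiffies (illustrative only; it merely
counts what it suppresses):

        #include <stdio.h>
        #include <time.h>

        #define MAX_LIMIT 600                   /* cf. CDEBUG_MAX_LIMIT */

        static void rate_limited_msg (const char *msg)
        {
                static time_t next;
                static long   delay = 1, suppressed;
                time_t        now = time (NULL);

                if (now <= next) {              /* inside the backoff window */
                        suppressed++;
                        return;
                }

                printf ("%s", msg);
                if (suppressed != 0) {
                        printf ("(%ld similar messages suppressed)\n",
                                suppressed);
                        suppressed = 0;
                }

                if (now > next + MAX_LIMIT + 10)        /* quiet: relax */
                        delay = delay > 8 ? delay / 8 : 1;
                else                                    /* noisy: back off */
                        delay = delay * 2 >= MAX_LIMIT ? MAX_LIMIT : delay * 2;
                next = now + delay;
        }

        int main (void)
        {
                int i;

                for (i = 0; i < 10; i++)
                        rate_limited_msg ("something went wrong\n");
                return 0;
        }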
@@ -229,14 +264,13 @@ do {                                                                    \
 /* initial pid  */
 # if CRAY_PORTALS
 /* 
+ * 1) ptl_pid_t in cray portals is only 16 bits, not 32 bits, therefore this
+ *    is too big.
  *
- * 1) ptl_pid_t in cray portals is only 16 bits, not 32 bits, therefore this is too
- * big.
- *
- * 2) the implementation of ernal in cray portals further restricts the pid space
- * that may be used to 0 <= pid <= 255 (an 8 bit value).  Returns an error at nal
- * init time for any pid outside this range.  Other nals in cray portals don't have
- * this restriction.
+ * 2) the implementation of ernal in cray portals further restricts the pid
+ *    space that may be used to 0 <= pid <= 255 (an 8 bit value).  Returns
+ *    an error at nal init time for any pid outside this range.  Other nals
+ *    in cray portals don't have this restriction.
  * */
 #define LUSTRE_PTL_PID          9
 # else
index 7fe6dfc..5a43a45 100644 (file)
@@ -28,6 +28,8 @@
   call_usermodehelper(path, argv, envp, 1)
 # define RECALC_SIGPENDING         recalc_sigpending()
 # define CURRENT_SECONDS           get_seconds()
+# define smp_num_cpus              NR_CPUS
+
 
 #elif defined(CONFIG_RH_2_4_20) /* RH 2.4.x */
 
diff --git a/lnet/include/lnet/.cvsignore b/lnet/include/lnet/.cvsignore
new file mode 100644 (file)
index 0000000..282522d
--- /dev/null
@@ -0,0 +1,2 @@
+Makefile
+Makefile.in
index 5db1352..c219d2a 100644 (file)
@@ -1,7 +1,7 @@
 #ifndef _BUILD_CHECK_H
 #define _BUILD_CHECK_H
 
-#ifdef CRAY_PORTALS
+#if CRAY_PORTALS
 #error "an application got to me instead of cray's includes"
 #endif
 
index a81a371..cfddde2 100644 (file)
@@ -31,8 +31,6 @@
 #define PORTALS_DEV_PATH "/dev/portals"
 #define OBD_DEV_ID 1
 #define OBD_DEV_PATH "/dev/obd"
-#define SMFS_DEV_ID  2
-#define SMFS_DEV_PATH "/dev/snapdev"
 
 int ptl_name2nal(char *str);
 int ptl_parse_ipaddr (__u32 *ipaddrp, char *str);
@@ -41,9 +39,6 @@ char * ptl_nid2str (char *buffer, ptl_nid_t nid);
 
 int ptl_initialize(int argc, char **argv);
 int jt_ptl_network(int argc, char **argv);
-int jt_ptl_print_autoconnects (int argc, char **argv);
-int jt_ptl_add_autoconnect (int argc, char **argv);
-int jt_ptl_del_autoconnect (int argc, char **argv);
 int jt_ptl_print_interfaces(int argc, char **argv);
 int jt_ptl_add_interface(int argc, char **argv);
 int jt_ptl_del_interface(int argc, char **argv);
@@ -62,9 +57,6 @@ int jt_ptl_add_uuid(int argc, char **argv);
 int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility  */
 int jt_ptl_close_uuid(int argc, char **argv);
 int jt_ptl_del_uuid(int argc, char **argv);
-int jt_ptl_rxmem (int argc, char **argv);
-int jt_ptl_txmem (int argc, char **argv);
-int jt_ptl_nagle (int argc, char **argv);
 int jt_ptl_add_route (int argc, char **argv);
 int jt_ptl_del_route (int argc, char **argv);
 int jt_ptl_notify_router (int argc, char **argv);
index a81a371..cfddde2 100644 (file)
@@ -31,8 +31,6 @@
 #define PORTALS_DEV_PATH "/dev/portals"
 #define OBD_DEV_ID 1
 #define OBD_DEV_PATH "/dev/obd"
-#define SMFS_DEV_ID  2
-#define SMFS_DEV_PATH "/dev/snapdev"
 
 int ptl_name2nal(char *str);
 int ptl_parse_ipaddr (__u32 *ipaddrp, char *str);
@@ -41,9 +39,6 @@ char * ptl_nid2str (char *buffer, ptl_nid_t nid);
 
 int ptl_initialize(int argc, char **argv);
 int jt_ptl_network(int argc, char **argv);
-int jt_ptl_print_autoconnects (int argc, char **argv);
-int jt_ptl_add_autoconnect (int argc, char **argv);
-int jt_ptl_del_autoconnect (int argc, char **argv);
 int jt_ptl_print_interfaces(int argc, char **argv);
 int jt_ptl_add_interface(int argc, char **argv);
 int jt_ptl_del_interface(int argc, char **argv);
@@ -62,9 +57,6 @@ int jt_ptl_add_uuid(int argc, char **argv);
 int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility  */
 int jt_ptl_close_uuid(int argc, char **argv);
 int jt_ptl_del_uuid(int argc, char **argv);
-int jt_ptl_rxmem (int argc, char **argv);
-int jt_ptl_txmem (int argc, char **argv);
-int jt_ptl_nagle (int argc, char **argv);
 int jt_ptl_add_route (int argc, char **argv);
 int jt_ptl_del_route (int argc, char **argv);
 int jt_ptl_notify_router (int argc, char **argv);
index 2a01119..9763d14 100644 (file)
@@ -1,5 +1,6 @@
 @BUILD_GMNAL_TRUE@subdir-m += gmnal
 @BUILD_OPENIBNAL_TRUE@subdir-m += openibnal
+@BUILD_IIBNAL_TRUE@subdir-m += iibnal
 @BUILD_QSWNAL_TRUE@subdir-m += qswnal
 subdir-m += socknal
 
index 002c169..0090364 100644 (file)
@@ -3,4 +3,4 @@
 # This code is issued under the GNU General Public License.
 # See the file COPYING in this distribution
 
-SUBDIRS = gmnal openibnal qswnal socknal 
+SUBDIRS = gmnal iibnal openibnal qswnal socknal 
diff --git a/lnet/klnds/iiblnd/.cvsignore b/lnet/klnds/iiblnd/.cvsignore
new file mode 100644 (file)
index 0000000..5ed596b
--- /dev/null
@@ -0,0 +1,10 @@
+.deps
+Makefile
+.*.cmd
+autoMakefile.in
+autoMakefile
+*.ko
+*.mod.c
+.*.flags
+.tmp_versions
+.depend
diff --git a/lnet/klnds/iiblnd/Makefile.in b/lnet/klnds/iiblnd/Makefile.in
new file mode 100644 (file)
index 0000000..e7934e2
--- /dev/null
@@ -0,0 +1,6 @@
+MODULES := kiibnal
+kiibnal-objs := iibnal.o iibnal_cb.o
+
+EXTRA_POST_CFLAGS := @IIBCPPFLAGS@
+
+@INCLUDE_RULES@
diff --git a/lnet/klnds/iiblnd/Makefile.mk b/lnet/klnds/iiblnd/Makefile.mk
new file mode 100644 (file)
index 0000000..0459a20
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include $(src)/../../Kernelenv
+
+obj-y += kiibnal.o
+kiibnal-objs := iibnal.o iibnal_cb.o
+
diff --git a/lnet/klnds/iiblnd/autoMakefile.am b/lnet/klnds/iiblnd/autoMakefile.am
new file mode 100644 (file)
index 0000000..251df66
--- /dev/null
@@ -0,0 +1,15 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+if MODULES
+if !CRAY_PORTALS
+if BUILD_IIBNAL
+modulenet_DATA = kiibnal$(KMODEXT)
+endif
+endif
+endif
+
+MOSTLYCLEANFILES = *.o *.ko *.mod.c
+DIST_SOURCES = $(kiibnal-objs:%.o=%.c) iibnal.h
diff --git a/lnet/klnds/iiblnd/iiblnd.c b/lnet/klnds/iiblnd/iiblnd.c
new file mode 100644 (file)
index 0000000..09908c9
--- /dev/null
@@ -0,0 +1,1713 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "iibnal.h"
+
+nal_t                   kibnal_api;
+ptl_handle_ni_t         kibnal_ni;
+kib_tunables_t          kibnal_tunables;
+
+kib_data_t              kibnal_data = {
+        .kib_service_id = IBNAL_SERVICE_NUMBER,
+};
+
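+/* Tunables are exported via sysctl: the table below maps
+ * /proc/sys/iibnal/timeout onto kibnal_tunables.kib_io_timeout
+ * (mode 0644, handled by proc_dointvec). */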
+#ifdef CONFIG_SYSCTL
+#define IBNAL_SYSCTL             202
+
+#define IBNAL_SYSCTL_TIMEOUT     1
+
+static ctl_table kibnal_ctl_table[] = {
+        {IBNAL_SYSCTL_TIMEOUT, "timeout", 
+         &kibnal_tunables.kib_io_timeout, sizeof (int),
+         0644, NULL, &proc_dointvec},
+        { 0 }
+};
+
+static ctl_table kibnal_top_ctl_table[] = {
+        {IBNAL_SYSCTL, "iibnal", NULL, 0, 0555, kibnal_ctl_table},
+        { 0 }
+};
+#endif
+
+#ifdef unused
+void
+print_service(IB_SERVICE_RECORD *service, char *tag, int rc)
+{
+        char name[32];
+
+        if (service == NULL) 
+        {
+                CWARN("tag       : %s\n"
+                      "status    : %d (NULL)\n", tag, rc);
+                return;
+        }
+        strncpy (name, service->ServiceName, sizeof(name)-1);
+        name[sizeof(name)-1] = 0;
+        
+        CWARN("tag       : %s\n"
+              "status    : %d\n"
+              "service id: "LPX64"\n"
+              "name      : %s\n"
+              "NID       : "LPX64"\n", tag, rc,
+              service->RID.ServiceID, name,
+              *kibnal_service_nid_field(service));
+}
+#endif
+
+static void
+kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
+                              FSTATUS frc, uint32 madrc)
+{
+        *(FSTATUS *)arg = frc;
+        up (&kibnal_data.kib_nid_signal);
+}
+
+#if IBNAL_CHECK_ADVERT
+static void
+kibnal_service_query_done (void *arg, QUERY *qry, 
+                           QUERY_RESULT_VALUES *qry_result)
+{
+        FSTATUS frc = qry_result->Status;
+
+        if (frc != FSUCCESS &&
+            qry_result->ResultDataSize == 0)
+                frc = FERROR;
+        
+        *(FSTATUS *)arg = frc;
+        up (&kibnal_data.kib_nid_signal);
+}
+
+static void
+kibnal_check_advert (void)
+{
+        QUERY                  *qry;
+        IB_SERVICE_RECORD      *svc;
+        FSTATUS                 frc;
+        FSTATUS                 frc2;
+
+        PORTAL_ALLOC(qry, sizeof(*qry));
+        if (qry == NULL)
+                return;
+
+        memset (qry, 0, sizeof(*qry));
+        qry->InputType = InputTypeServiceRecord;
+        qry->OutputType = OutputTypeServiceRecord;
+        qry->InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
+        svc = &qry->InputValue.ServiceRecordValue.ServiceRecord;
+        kibnal_set_service_keys(svc, kibnal_data.kib_nid);
+
+        frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
+                                                    kibnal_data.kib_port_guid,
+                                                    qry,
+                                                    kibnal_service_query_done,
+                                                    NULL, &frc2);
+        if (frc != FSUCCESS && frc != FPENDING) {
+                CERROR ("Immediate error %d checking SM service\n", frc);
+        } else {
+                down (&kibnal_data.kib_nid_signal);
+                frc = frc2;
+
+                if (frc != 0)
+                        CERROR ("Error %d checking SM service\n", rc);
+        }
+
+        return (rc);
+}
+#endif
+
+static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
+{
+        IB_SERVICE_RECORD     *svc;
+
+        memset (fod, 0, sizeof(*fod));
+        fod->Type = type;
+
+        svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+        svc->RID.ServiceID = kibnal_data.kib_service_id;
+        svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid;
+        svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX;
+        svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey;
+        svc->ServiceLease = 0xffffffff;
+
+        kibnal_set_service_keys(svc, kibnal_data.kib_nid);
+}
+
+static int
+kibnal_advertise (void)
+{
+        FABRIC_OPERATION_DATA *fod;
+        IB_SERVICE_RECORD     *svc;
+        FSTATUS                frc;
+        FSTATUS                frc2;
+
+        LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+
+        PORTAL_ALLOC(fod, sizeof(*fod));
+        if (fod == NULL)
+                return (-ENOMEM);
+
+        fill_fod(fod, FabOpSetServiceRecord);
+        svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+
+        CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n", 
+               svc->RID.ServiceID, 
+               svc->ServiceName, *kibnal_service_nid_field(svc));
+
+        frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
+                                            kibnal_data.kib_port_guid,
+                                            fod, kibnal_service_setunset_done, 
+                                            NULL, &frc2);
+
+        if (frc != FSUCCESS && frc != FPENDING) {
+                CERROR ("Immediate error %d advertising NID "LPX64"\n",
+                        frc, kibnal_data.kib_nid);
+                goto out;
+        }
+
+        down (&kibnal_data.kib_nid_signal);
+
+        frc = frc2;
+        if (frc != FSUCCESS)
+                CERROR ("Error %d advertising BUD "LPX64"\n",
+                        frc, kibnal_data.kib_nid);
+out:
+        PORTAL_FREE(fod, sizeof(*fod));
+        return (frc == FSUCCESS) ? 0 : -EINVAL;
+}
+
+static void
+kibnal_unadvertise (int expect_success)
+{
+        FABRIC_OPERATION_DATA *fod;
+        IB_SERVICE_RECORD     *svc;
+        FSTATUS                frc;
+        FSTATUS                frc2;
+
+        LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+
+        PORTAL_ALLOC(fod, sizeof(*fod));
+        if (fod == NULL)
+                return;
+
+        fill_fod(fod, FabOpDeleteServiceRecord);
+        svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+
+        CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n",
+               svc->ServiceName, *kibnal_service_nid_field(svc));
+        
+        frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
+                                            kibnal_data.kib_port_guid,
+                                            fod, kibnal_service_setunset_done, 
+                                            NULL, &frc2);
+
+        if (frc != FSUCCESS && frc != FPENDING) {
+                CERROR ("Immediate error %d unadvertising NID "LPX64"\n",
+                        frc, kibnal_data.kib_nid);
+                goto out;
+        }
+
+        down (&kibnal_data.kib_nid_signal);
+
+        if ((frc2 == FSUCCESS) == !!expect_success)
+                goto out;
+
+        if (expect_success)
+                CERROR("Error %d unadvertising NID "LPX64"\n",
+                       frc2, kibnal_data.kib_nid);
+        else
+                CWARN("Removed conflicting NID "LPX64"\n",
+                      kibnal_data.kib_nid);
+ out:
+        PORTAL_FREE(fod, sizeof(*fod));
+}
+
+static int
+kibnal_set_mynid(ptl_nid_t nid)
+{
+        struct timeval tv;
+        lib_ni_t      *ni = &kibnal_lib.libnal_ni;
+        int            rc;
+        FSTATUS        frc;
+
+        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
+               nid, ni->ni_pid.nid);
+
+        do_gettimeofday(&tv);
+
+        down (&kibnal_data.kib_nid_mutex);
+
+        if (nid == kibnal_data.kib_nid) {
+                /* no change of NID */
+                up (&kibnal_data.kib_nid_mutex);
+                return (0);
+        }
+
+        CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
+               kibnal_data.kib_nid, nid);
+        
+        if (kibnal_data.kib_nid != PTL_NID_ANY) {
+
+                kibnal_unadvertise (1);
+
+                frc = iibt_cm_cancel(kibnal_data.kib_cep);
+                if (frc != FSUCCESS && frc != FPENDING)
+                        CERROR ("Error %d stopping listener\n", frc);
+
+                frc = iibt_cm_destroy_cep(kibnal_data.kib_cep);
+                if (frc != FSUCCESS)
+                        CERROR ("Error %d destroying CEP\n", frc);
+
+                kibnal_data.kib_cep = NULL;
+        }
+        
+        kibnal_data.kib_nid = ni->ni_pid.nid = nid;
+        kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+        
+        /* Delete all existing peers and their connections after new
+         * NID/incarnation set to ensure no old connections in our brave
+         * new world. */
+        kibnal_del_peer (PTL_NID_ANY, 0);
+
+        if (kibnal_data.kib_nid == PTL_NID_ANY) {
+                /* No new NID to install */
+                up (&kibnal_data.kib_nid_mutex);
+                return (0);
+        }
+
+        /* remove any previous advert (crashed node etc) */
+        kibnal_unadvertise(0);
+
+        kibnal_data.kib_cep = iibt_cm_create_cep(CM_RC_TYPE);
+        if (kibnal_data.kib_cep == NULL) {
+                CERROR ("Can't create CEP\n");
+                rc = -ENOMEM;
+        } else {
+                CM_LISTEN_INFO info;
+                memset (&info, 0, sizeof(info));
+                info.ListenAddr.EndPt.SID = kibnal_data.kib_service_id;
+
+                frc = iibt_cm_listen(kibnal_data.kib_cep, &info,
+                                     kibnal_listen_callback, NULL);
+                if (frc != FSUCCESS && frc != FPENDING) {
+                        CERROR ("iibt_cm_listen error: %d\n", frc);
+                        rc = -EINVAL;
+                } else {
+                        rc = 0;
+                }
+        }
+        
+        if (rc == 0) {
+                rc = kibnal_advertise();
+                if (rc == 0) {
+#if IBNAL_CHECK_ADVERT
+                        kibnal_check_advert();
+#endif
+                        up (&kibnal_data.kib_nid_mutex);
+                        return (0);
+                }
+                
+                iibt_cm_cancel (kibnal_data.kib_cep);
+                iibt_cm_destroy_cep (kibnal_data.kib_cep);
+                /* remove any peers that sprung up while I failed to
+                 * advertise myself */
+                kibnal_del_peer (PTL_NID_ANY, 0);
+        }
+
+        kibnal_data.kib_nid = PTL_NID_ANY;
+        up (&kibnal_data.kib_nid_mutex);
+        return (rc);
+}
+
+kib_peer_t *
+kibnal_create_peer (ptl_nid_t nid)
+{
+        kib_peer_t *peer;
+
+        LASSERT (nid != PTL_NID_ANY);
+
+        PORTAL_ALLOC (peer, sizeof (*peer));
+        if (peer == NULL)
+                return (NULL);
+
+        memset(peer, 0, sizeof(*peer));         /* zero flags etc */
+
+        peer->ibp_nid = nid;
+        atomic_set (&peer->ibp_refcount, 1);    /* 1 ref for caller */
+
+        INIT_LIST_HEAD (&peer->ibp_list);       /* not in the peer table yet */
+        INIT_LIST_HEAD (&peer->ibp_conns);
+        INIT_LIST_HEAD (&peer->ibp_tx_queue);
+
+        peer->ibp_reconnect_time = jiffies;
+        peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+
+        atomic_inc (&kibnal_data.kib_npeers);
+        return (peer);
+}
+
+void
+kibnal_destroy_peer (kib_peer_t *peer)
+{
+
+        LASSERT (atomic_read (&peer->ibp_refcount) == 0);
+        LASSERT (peer->ibp_persistence == 0);
+        LASSERT (!kibnal_peer_active(peer));
+        LASSERT (peer->ibp_connecting == 0);
+        LASSERT (list_empty (&peer->ibp_conns));
+        LASSERT (list_empty (&peer->ibp_tx_queue));
+
+        PORTAL_FREE (peer, sizeof (*peer));
+
+        /* NB a peer's connections keep a reference on their peer until
+         * they are destroyed, so we can be assured that _all_ state to do
+         * with this peer has been cleaned up when its refcount drops to
+         * zero. */
+        atomic_dec (&kibnal_data.kib_npeers);
+}
+
+/* the caller is responsible for accounting for the additional reference
+ * that this creates */
+kib_peer_t *
+kibnal_find_peer_locked (ptl_nid_t nid)
+{
+        struct list_head *peer_list = kibnal_nid2peerlist (nid);
+        struct list_head *tmp;
+        kib_peer_t       *peer;
+
+        list_for_each (tmp, peer_list) {
+
+                peer = list_entry (tmp, kib_peer_t, ibp_list);
+
+                LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
+                         peer->ibp_connecting != 0 || /* creating conns */
+                         !list_empty (&peer->ibp_conns));  /* active conn */
+
+                if (peer->ibp_nid != nid)
+                        continue;
+
+                CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
+                       peer, nid, atomic_read (&peer->ibp_refcount));
+                return (peer);
+        }
+        return (NULL);
+}
+
+kib_peer_t *
+kibnal_get_peer (ptl_nid_t nid)
+{
+        kib_peer_t     *peer;
+
+        read_lock (&kibnal_data.kib_global_lock);
+        peer = kibnal_find_peer_locked (nid);
+        if (peer != NULL)                       /* +1 ref for caller? */
+                kib_peer_addref(peer);
+        read_unlock (&kibnal_data.kib_global_lock);
+
+        return (peer);
+}
+
+void
+kibnal_unlink_peer_locked (kib_peer_t *peer)
+{
+        LASSERT (peer->ibp_persistence == 0);
+        LASSERT (list_empty(&peer->ibp_conns));
+
+        LASSERT (kibnal_peer_active(peer));
+        list_del_init (&peer->ibp_list);
+        /* lose peerlist's ref */
+        kib_peer_decref(peer);
+}
+
+static int
+kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
+{
+        kib_peer_t        *peer;
+        struct list_head  *ptmp;
+        int                i;
+
+        read_lock (&kibnal_data.kib_global_lock);
+
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+
+                list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
+
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
+                        LASSERT (peer->ibp_persistence != 0 ||
+                                 peer->ibp_connecting != 0 ||
+                                 !list_empty (&peer->ibp_conns));
+
+                        if (index-- > 0)
+                                continue;
+
+                        *nidp = peer->ibp_nid;
+                        *persistencep = peer->ibp_persistence;
+
+                        read_unlock (&kibnal_data.kib_global_lock);
+                        return (0);
+                }
+        }
+
+        read_unlock (&kibnal_data.kib_global_lock);
+        return (-ENOENT);
+}
+
+static int
+kibnal_add_persistent_peer (ptl_nid_t nid)
+{
+        unsigned long      flags;
+        kib_peer_t        *peer;
+        kib_peer_t        *peer2;
+        
+        if (nid == PTL_NID_ANY)
+                return (-EINVAL);
+
+        peer = kibnal_create_peer (nid);
+        if (peer == NULL)
+                return (-ENOMEM);
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        peer2 = kibnal_find_peer_locked (nid);
+        if (peer2 != NULL) {
+                kib_peer_decref (peer);
+                peer = peer2;
+        } else {
+                /* peer table takes existing ref on peer */
+                list_add_tail (&peer->ibp_list,
+                               kibnal_nid2peerlist (nid));
+        }
+
+        peer->ibp_persistence++;
+        
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+        return (0);
+}
+
+static void
+kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
+{
+        struct list_head *ctmp;
+        struct list_head *cnxt;
+        kib_conn_t       *conn;
+
+        if (!single_share)
+                peer->ibp_persistence = 0;
+        else if (peer->ibp_persistence > 0)
+                peer->ibp_persistence--;
+
+        if (peer->ibp_persistence != 0)
+                return;
+
+        list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+                conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+                kibnal_close_conn_locked (conn, 0);
+        }
+
+        /* NB peer unlinks itself when last conn is closed */
+}
+
+int
+kibnal_del_peer (ptl_nid_t nid, int single_share)
+{
+        unsigned long      flags;
+        struct list_head  *ptmp;
+        struct list_head  *pnxt;
+        kib_peer_t        *peer;
+        int                lo;
+        int                hi;
+        int                i;
+        int                rc = -ENOENT;
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        if (nid != PTL_NID_ANY)
+                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
+        else {
+                lo = 0;
+                hi = kibnal_data.kib_peer_hash_size - 1;
+        }
+
+        for (i = lo; i <= hi; i++) {
+                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
+                        LASSERT (peer->ibp_persistence != 0 ||
+                                 peer->ibp_connecting != 0 ||
+                                 !list_empty (&peer->ibp_conns));
+
+                        if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
+                                continue;
+
+                        kibnal_del_peer_locked (peer, single_share);
+                        rc = 0;         /* matched something */
+
+                        if (single_share)
+                                goto out;
+                }
+        }
+ out:
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+        return (rc);
+}
+
+static kib_conn_t *
+kibnal_get_conn_by_idx (int index)
+{
+        kib_peer_t        *peer;
+        struct list_head  *ptmp;
+        kib_conn_t        *conn;
+        struct list_head  *ctmp;
+        int                i;
+
+        read_lock (&kibnal_data.kib_global_lock);
+
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+                list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
+
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
+                        LASSERT (peer->ibp_persistence > 0 ||
+                                 peer->ibp_connecting != 0 ||
+                                 !list_empty (&peer->ibp_conns));
+
+                        list_for_each (ctmp, &peer->ibp_conns) {
+                                if (index-- > 0)
+                                        continue;
+
+                                conn = list_entry (ctmp, kib_conn_t, ibc_list);
+                                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                                       atomic_read (&conn->ibc_refcount));
+                                atomic_inc (&conn->ibc_refcount);
+                                read_unlock (&kibnal_data.kib_global_lock);
+                                return (conn);
+                        }
+                }
+        }
+
+        read_unlock (&kibnal_data.kib_global_lock);
+        return (NULL);
+}
+
+kib_conn_t *
+kibnal_create_conn (void)
+{
+        kib_conn_t  *conn;
+        int          i;
+        __u64        vaddr = 0;
+        __u64        vaddr_base;
+        int          page_offset;
+        int          ipage;
+        int          rc;
+        FSTATUS      frc;
+        union {
+                IB_QP_ATTRIBUTES_CREATE    qp_create;
+                IB_QP_ATTRIBUTES_MODIFY    qp_attr;
+        } params;
+        
+        PORTAL_ALLOC (conn, sizeof (*conn));
+        if (conn == NULL) {
+                CERROR ("Can't allocate connection\n");
+                return (NULL);
+        }
+
+        /* zero flags, NULL pointers etc... */
+        memset (conn, 0, sizeof (*conn));
+
+        INIT_LIST_HEAD (&conn->ibc_tx_queue);
+        INIT_LIST_HEAD (&conn->ibc_active_txs);
+        spin_lock_init (&conn->ibc_lock);
+        
+        atomic_inc (&kibnal_data.kib_nconns);
+        /* well not really, but I call destroy() on failure, which decrements */
+
+        PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
+        if (conn->ibc_rxs == NULL)
+                goto failed;
+        memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
+
+        rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1);
+        if (rc != 0)
+                goto failed;
+
+        vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
+
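+        /* Carve the receive pages into IBNAL_MSG_SIZE slots, giving each
+         * rx both a CPU pointer (rx_msg) and the address used when
+         * posting it (rx_vaddr). */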
+        for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
+                struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
+                kib_rx_t   *rx = &conn->ibc_rxs[i];
+
+                rx->rx_conn = conn;
+                rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
+                             page_offset);
+
+                if (kibnal_whole_mem()) 
+                        rx->rx_vaddr = kibnal_page2phys(page) + 
+                                       page_offset + 
+                                       kibnal_data.kib_md.md_addr;
+                else
+                        rx->rx_vaddr = vaddr;
+                
+                vaddr += IBNAL_MSG_SIZE;
+                LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
+                
+                page_offset += IBNAL_MSG_SIZE;
+                LASSERT (page_offset <= PAGE_SIZE);
+
+                if (page_offset == PAGE_SIZE) {
+                        page_offset = 0;
+                        ipage++;
+                        LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
+                }
+        }
+
+        params.qp_create = (IB_QP_ATTRIBUTES_CREATE) {
+                .Type                    = QPTypeReliableConnected,
+                .SendQDepth              = IBNAL_TX_MAX_SG * 
+                                           IBNAL_MSG_QUEUE_SIZE,
+                .RecvQDepth              = IBNAL_MSG_QUEUE_SIZE,
+                .SendDSListDepth         = 1,
+                .RecvDSListDepth         = 1,
+                .SendCQHandle            = kibnal_data.kib_cq,
+                .RecvCQHandle            = kibnal_data.kib_cq,
+                .PDHandle                = kibnal_data.kib_pd,
+                .SendSignaledCompletions = TRUE,
+        };
+        frc = iibt_qp_create(kibnal_data.kib_hca, &params.qp_create, NULL,
+                             &conn->ibc_qp, &conn->ibc_qp_attrs);
+        if (frc != FSUCCESS) {
+                CERROR ("Failed to create queue pair: %d\n", frc);
+                goto failed;
+        }
+
+        /* Mark QP created */
+        conn->ibc_state = IBNAL_CONN_INIT_QP;
+
+        params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+                .RequestState             = QPStateInit,
+                .Attrs                    = (IB_QP_ATTR_PORTGUID |
+                                             IB_QP_ATTR_PKEYINDEX |
+                                             IB_QP_ATTR_ACCESSCONTROL),
+                .PortGUID                 = kibnal_data.kib_port_guid,
+                .PkeyIndex                = 0,
+                .AccessControl = {
+                        .s = {
+                                .RdmaWrite = 1,
+                                .RdmaRead  = 1,
+                        },
+                },
+        };
+        rc = iibt_qp_modify(conn->ibc_qp, &params.qp_attr, NULL);
+        if (rc != 0) {
+                CERROR ("Failed to modify queue pair: %d\n", rc);
+                goto failed;
+        }
+
+        /* 1 ref for caller */
+        atomic_set (&conn->ibc_refcount, 1);
+        return (conn);
+        
+ failed:
+        kibnal_destroy_conn (conn);
+        return (NULL);
+}
+
+void
+kibnal_destroy_conn (kib_conn_t *conn)
+{
+        int    rc;
+        FSTATUS frc;
+        
+        CDEBUG (D_NET, "connection %p\n", conn);
+
+        LASSERT (atomic_read (&conn->ibc_refcount) == 0);
+        LASSERT (list_empty(&conn->ibc_tx_queue));
+        LASSERT (list_empty(&conn->ibc_active_txs));
+        LASSERT (conn->ibc_nsends_posted == 0);
+        LASSERT (conn->ibc_connreq == NULL);
+
+        switch (conn->ibc_state) {
+        case IBNAL_CONN_DISCONNECTED:
+                /* called after connection sequence initiated */
+                /* fall through */
+
+        case IBNAL_CONN_INIT_QP:
+                /* _destroy includes an implicit Reset of the QP which 
+                 * discards posted work */
+                rc = iibt_qp_destroy(conn->ibc_qp);
+                if (rc != 0)
+                        CERROR("Can't destroy QP: %d\n", rc);
+                /* fall through */
+                
+        case IBNAL_CONN_INIT_NOTHING:
+                break;
+
+        default:
+                LASSERT (0);
+        }
+
+        if (conn->ibc_cep != NULL) {
+                frc = iibt_cm_destroy_cep(conn->ibc_cep);
+                if (frc != 0)
+                        CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep, 
+                               frc);
+        }
+
+        if (conn->ibc_rx_pages != NULL) 
+                kibnal_free_pages(conn->ibc_rx_pages);
+        
+        if (conn->ibc_rxs != NULL)
+                PORTAL_FREE(conn->ibc_rxs, 
+                            IBNAL_RX_MSGS * sizeof(kib_rx_t));
+
+        if (conn->ibc_peer != NULL)
+                kib_peer_decref(conn->ibc_peer);
+
+        PORTAL_FREE(conn, sizeof (*conn));
+
+        atomic_dec(&kibnal_data.kib_nconns);
+        
+        if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
+            kibnal_data.kib_shutdown) {
+                /* I just nuked the last connection on shutdown; wake up
+                 * everyone so they can exit. */
+                wake_up_all(&kibnal_data.kib_sched_waitq);
+                wake_up_all(&kibnal_data.kib_connd_waitq);
+        }
+}
+
+void
+kibnal_put_conn (kib_conn_t *conn)
+{
+        unsigned long flags;
+
+        CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n",
+                conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                atomic_read (&conn->ibc_refcount));
+
+        LASSERT (atomic_read (&conn->ibc_refcount) > 0);
+        if (!atomic_dec_and_test (&conn->ibc_refcount))
+                return;
+
+        /* must disconnect before dropping the final ref */
+        LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED);
+
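+        /* Final ref dropped: queue the conn for the connd thread to
+         * finish the destruction. */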
+        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+
+        list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+        wake_up (&kibnal_data.kib_connd_waitq);
+
+        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+}
+
+static int
+kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
+{
+        kib_conn_t         *conn;
+        struct list_head   *ctmp;
+        struct list_head   *cnxt;
+        int                 count = 0;
+
+        list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+                conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+                count++;
+                kibnal_close_conn_locked (conn, why);
+        }
+
+        return (count);
+}
+
+int
+kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
+{
+        kib_conn_t         *conn;
+        struct list_head   *ctmp;
+        struct list_head   *cnxt;
+        int                 count = 0;
+
+        list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+                conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+                if (conn->ibc_incarnation == incarnation)
+                        continue;
+
+                CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
+                       peer->ibp_nid, conn->ibc_incarnation, incarnation);
+                
+                count++;
+                kibnal_close_conn_locked (conn, -ESTALE);
+        }
+
+        return (count);
+}
+
+static int
+kibnal_close_matching_conns (ptl_nid_t nid)
+{
+        unsigned long       flags;
+        kib_peer_t         *peer;
+        struct list_head   *ptmp;
+        struct list_head   *pnxt;
+        int                 lo;
+        int                 hi;
+        int                 i;
+        int                 count = 0;
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        if (nid != PTL_NID_ANY)
+                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
+        else {
+                lo = 0;
+                hi = kibnal_data.kib_peer_hash_size - 1;
+        }
+
+        for (i = lo; i <= hi; i++) {
+                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
+
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
+                        LASSERT (peer->ibp_persistence != 0 ||
+                                 peer->ibp_connecting != 0 ||
+                                 !list_empty (&peer->ibp_conns));
+
+                        if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
+                                continue;
+
+                        count += kibnal_close_peer_conns_locked (peer, 0);
+                }
+        }
+
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+        /* wildcards always succeed */
+        if (nid == PTL_NID_ANY)
+                return (0);
+        
+        return (count == 0 ? -ENOENT : 0);
+}
+
+static int
+kibnal_cmd(struct portals_cfg *pcfg, void * private)
+{
+        int rc = -EINVAL;
+        ENTRY;
+
+        LASSERT (pcfg != NULL);
+
+        switch(pcfg->pcfg_command) {
+        case NAL_CMD_GET_PEER: {
+                ptl_nid_t   nid = 0;
+                int         share_count = 0;
+
+                rc = kibnal_get_peer_info(pcfg->pcfg_count,
+                                          &nid, &share_count);
+                pcfg->pcfg_nid   = nid;
+                pcfg->pcfg_size  = 0;
+                pcfg->pcfg_id    = 0;
+                pcfg->pcfg_misc  = 0;
+                pcfg->pcfg_count = 0;
+                pcfg->pcfg_wait  = share_count;
+                break;
+        }
+        case NAL_CMD_ADD_PEER: {
+                rc = kibnal_add_persistent_peer (pcfg->pcfg_nid);
+                break;
+        }
+        case NAL_CMD_DEL_PEER: {
+                rc = kibnal_del_peer (pcfg->pcfg_nid, 
+                                       /* flags == single_share */
+                                       pcfg->pcfg_flags != 0);
+                break;
+        }
+        case NAL_CMD_GET_CONN: {
+                kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
+
+                if (conn == NULL)
+                        rc = -ENOENT;
+                else {
+                        rc = 0;
+                        pcfg->pcfg_nid   = conn->ibc_peer->ibp_nid;
+                        pcfg->pcfg_id    = 0;
+                        pcfg->pcfg_misc  = 0;
+                        pcfg->pcfg_flags = 0;
+                        kibnal_put_conn (conn);
+                }
+                break;
+        }
+        case NAL_CMD_CLOSE_CONNECTION: {
+                rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
+                break;
+        }
+        case NAL_CMD_REGISTER_MYNID: {
+                if (pcfg->pcfg_nid == PTL_NID_ANY)
+                        rc = -EINVAL;
+                else
+                        rc = kibnal_set_mynid (pcfg->pcfg_nid);
+                break;
+        }
+        }
+
+        RETURN(rc);
+}
+
+void
+kibnal_free_pages (kib_pages_t *p)
+{
+        int     npages = p->ibp_npages;
+        int     rc;
+        int     i;
+        
+        if (p->ibp_mapped) {
+                rc = iibt_deregister_memory(p->ibp_handle);
+                if (rc != 0)
+                        CERROR ("Deregister error: %d\n", rc);
+        }
+        
+        for (i = 0; i < npages; i++)
+                if (p->ibp_pages[i] != NULL)
+                        __free_page(p->ibp_pages[i]);
+        
+        PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
+}
+
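+/* Note: kib_pages_t ends in a zero-length array (ibp_pages[0]), so
+ * offsetof(kib_pages_t, ibp_pages[npages]) is the size of the fixed
+ * header plus npages page pointers; the same expression sizes both the
+ * PORTAL_ALLOC and the PORTAL_FREE of a descriptor. */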
+int
+kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
+{
+        kib_pages_t                *p;
+        __u64                      *phys_pages;
+        int                         i;
+        FSTATUS                     frc;
+        IB_ACCESS_CONTROL           access;
+
+        memset(&access, 0, sizeof(access));
+        access.s.MWBindable = 1;
+        access.s.LocalWrite = 1;
+        access.s.RdmaRead = 1;
+        access.s.RdmaWrite = 1;
+
+        PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
+        if (p == NULL) {
+                CERROR ("Can't allocate buffer %d\n", npages);
+                return (-ENOMEM);
+        }
+
+        memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
+        p->ibp_npages = npages;
+        
+        for (i = 0; i < npages; i++) {
+                p->ibp_pages[i] = alloc_page (GFP_KERNEL);
+                if (p->ibp_pages[i] == NULL) {
+                        CERROR ("Can't allocate page %d of %d\n", i, npages);
+                        kibnal_free_pages(p);
+                        return (-ENOMEM);
+                }
+        }
+
+        if (kibnal_whole_mem())
+                goto out;
+
+        PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
+        if (phys_pages == NULL) {
+                CERROR ("Can't allocate physarray for %d pages\n", npages);
+                /* kibnal_free_pages() also frees the pages allocated above */
+                kibnal_free_pages(p);
+                return (-ENOMEM);
+        }
+
+        /* if we were using the _contig_ registration variant we would have
+         * an array of PhysAddr/Length pairs, but the discontiguous variant
+         * just takes the PhysAddr */
+        for (i = 0; i < npages; i++)
+                phys_pages[i] = kibnal_page2phys(p->ibp_pages[i]);
+
+        frc = iibt_register_physical_memory(kibnal_data.kib_hca,
+                                            0,          /* requested vaddr */
+                                            phys_pages, npages,
+                                            0,          /* offset */
+                                            kibnal_data.kib_pd,
+                                            access,
+                                            &p->ibp_handle, &p->ibp_vaddr,
+                                            &p->ibp_lkey, &p->ibp_rkey);
+        
+        PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
+        
+        if (frc != FSUCCESS) {
+                CERROR ("Error %d mapping %d pages\n", frc, npages);
+                kibnal_free_pages(p);
+                return (-ENOMEM);
+        }
+
+        CDEBUG(D_NET, "registered %d pages; handle: %p vaddr "LPX64" "
+                      "lkey %x rkey %x\n", npages, p->ibp_handle,
+                      p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
+        
+        p->ibp_mapped = 1;
+out:
+        *pp = p;
+        return (0);
+}
+
+static int
+kibnal_setup_tx_descs (void)
+{
+        int           ipage = 0;
+        int           page_offset = 0;
+        __u64         vaddr;
+        __u64         vaddr_base;
+        struct page  *page;
+        kib_tx_t     *tx;
+        int           i;
+        int           rc;
+
+        /* pre-mapped messages are not bigger than 1 page */
+        LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
+
+        /* No fancy arithmetic when we do the buffer calculations */
+        LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
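+        /* e.g. with 4K pages and the default 4K IBNAL_MSG_SIZE, each page
+         * below holds exactly one message; on larger-page platforms
+         * several messages pack into each page and page_offset steps
+         * through them */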
+
+        rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, 
+                                0);
+        if (rc != 0)
+                return (rc);
+
+        /* ignored for the whole_mem case */
+        vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
+
+        for (i = 0; i < IBNAL_TX_MSGS; i++) {
+                page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
+                tx = &kibnal_data.kib_tx_descs[i];
+
+                memset (tx, 0, sizeof(*tx));    /* zero flags etc */
+                
+                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
+                                            page_offset);
+
+                if (kibnal_whole_mem()) 
+                        tx->tx_vaddr = kibnal_page2phys(page) + 
+                                       page_offset + 
+                                       kibnal_data.kib_md.md_addr;
+                else
+                        tx->tx_vaddr = vaddr;
+
+                tx->tx_isnblk = (i >= IBNAL_NTX);
+                tx->tx_mapped = KIB_TX_UNMAPPED;
+
+                CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", 
+                       i, tx, tx->tx_msg, tx->tx_vaddr);
+
+                if (tx->tx_isnblk)
+                        list_add (&tx->tx_list, 
+                                  &kibnal_data.kib_idle_nblk_txs);
+                else
+                        list_add (&tx->tx_list, 
+                                  &kibnal_data.kib_idle_txs);
+
+                vaddr += IBNAL_MSG_SIZE;
+                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
+
+                page_offset += IBNAL_MSG_SIZE;
+                LASSERT (page_offset <= PAGE_SIZE);
+
+                if (page_offset == PAGE_SIZE) {
+                        page_offset = 0;
+                        ipage++;
+                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
+                }
+        }
+        
+        return (0);
+}
+
+static void
+kibnal_api_shutdown (nal_t *nal)
+{
+        int   i;
+        int   rc;
+
+        if (nal->nal_refct != 0) {
+                /* This module got the first ref */
+                PORTAL_MODULE_UNUSE;
+                return;
+        }
+
+        CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+
+        LASSERT(nal == &kibnal_api);
+
+        switch (kibnal_data.kib_init) {
+        default:
+                CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
+                LBUG();
+
+        case IBNAL_INIT_ALL:
+                /* stop calls to nal_cmd */
+                libcfs_nal_cmd_unregister(IIBNAL);
+                /* No new peers */
+
+                /* resetting my NID unadvertises me, removes my
+                 * listener and nukes all current peers */
+                kibnal_set_mynid (PTL_NID_ANY);
+
+                /* Wait for all peer state to clean up (crazy) */
+                i = 2;
+                while (atomic_read (&kibnal_data.kib_npeers) != 0) {
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                               "waiting for %d peers to disconnect (can take a few seconds)\n",
+                               atomic_read (&kibnal_data.kib_npeers));
+                        set_current_state (TASK_UNINTERRUPTIBLE);
+                        schedule_timeout (HZ);
+                }
+                /* fall through */
+
+        case IBNAL_INIT_CQ:
+                rc = iibt_cq_destroy(kibnal_data.kib_cq);
+                if (rc != 0)
+                        CERROR ("Destroy CQ error: %d\n", rc);
+                /* fall through */
+
+        case IBNAL_INIT_TXD:
+                kibnal_free_pages (kibnal_data.kib_tx_pages);
+                /* fall through */
+
+        case IBNAL_INIT_MR:
+                if (kibnal_data.kib_md.md_handle != NULL) {
+                        rc = iibt_deregister_memory(kibnal_data.kib_md.md_handle);
+                        if (rc != FSUCCESS)
+                                CERROR ("Deregister memory error: %d\n", rc);
+                }
+                /* fall through */
+
+#if IBNAL_FMR
+        case IBNAL_INIT_FMR:
+                rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
+                if (rc != 0)
+                        CERROR ("Destroy FMR pool error: %d\n", rc);
+                /* fall through */
+#endif
+        case IBNAL_INIT_PD:
+                rc = iibt_pd_free(kibnal_data.kib_pd);
+                if (rc != 0)
+                        CERROR ("Destroy PD error: %d\n", rc);
+                /* fall through */
+
+        case IBNAL_INIT_SD:
+                rc = iibt_sd_deregister(kibnal_data.kib_sd);
+                if (rc != 0)
+                        CERROR ("Deregister SD error: %d\n", rc);
+                /* fall through */
+
+        case IBNAL_INIT_PORT:
+                /* XXX ??? */
+                /* fall through */
+
+        case IBNAL_INIT_PORTATTRS:
+                PORTAL_FREE(kibnal_data.kib_hca_attrs.PortAttributesList,
+                            kibnal_data.kib_hca_attrs.PortAttributesListSize);
+                /* fall through */
+
+        case IBNAL_INIT_HCA:
+                rc = iibt_close_hca(kibnal_data.kib_hca);
+                if (rc != 0)
+                        CERROR ("Close HCA  error: %d\n", rc);
+                /* fall through */
+
+        case IBNAL_INIT_LIB:
+                lib_fini(&kibnal_lib);
+                /* fall through */
+
+        case IBNAL_INIT_DATA:
+                /* Module refcount only gets to zero when all peers
+                 * have been closed so all lists must be empty */
+                LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
+                LASSERT (kibnal_data.kib_peers != NULL);
+                for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+                        LASSERT (list_empty (&kibnal_data.kib_peers[i]));
+                }
+                LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
+                LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
+                LASSERT (list_empty (&kibnal_data.kib_sched_txq));
+                LASSERT (list_empty (&kibnal_data.kib_connd_conns));
+                LASSERT (list_empty (&kibnal_data.kib_connd_peers));
+
+                /* flag threads to terminate; wake and wait for them to die */
+                kibnal_data.kib_shutdown = 1;
+                wake_up_all (&kibnal_data.kib_sched_waitq);
+                wake_up_all (&kibnal_data.kib_connd_waitq);
+
+                i = 2;
+                while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                               "Waiting for %d threads to terminate\n",
+                               atomic_read (&kibnal_data.kib_nthreads));
+                        set_current_state (TASK_INTERRUPTIBLE);
+                        schedule_timeout (HZ);
+                }
+                /* fall through */
+                
+        case IBNAL_INIT_NOTHING:
+                break;
+        }
+
+        if (kibnal_data.kib_tx_descs != NULL)
+                PORTAL_FREE (kibnal_data.kib_tx_descs,
+                             IBNAL_TX_MSGS * sizeof(kib_tx_t));
+
+        if (kibnal_data.kib_peers != NULL)
+                PORTAL_FREE (kibnal_data.kib_peers,
+                             sizeof (struct list_head) * 
+                             kibnal_data.kib_peer_hash_size);
+
+        CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+        printk(KERN_INFO "Lustre: Infinicon IB NAL unloaded (final mem %d)\n",
+               atomic_read(&portal_kmemory));
+
+        kibnal_data.kib_init = IBNAL_INIT_NOTHING;
+}
+
+#define roundup_power(val, power) \
+        ( ((val) + (__u64)((power) - 1)) & ~((__u64)((power) - 1)) )
+
+/* this isn't very portable or sturdy in the face of funny mem/bus configs */
+static __u64 max_phys_mem(IB_CA_ATTRIBUTES *ca_attr)
+{
+        struct sysinfo si;
+        __u64 ret;
+
+        /* XXX we don't bother with first-gen cards */
+        if (ca_attr->VendorId == 0xd0b7 && ca_attr->DeviceId == 0x3101)
+                return 0ULL;
+
+        si_meminfo(&si);
+        ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit;
+        return roundup_power(ret, 128 * 1024 * 1024);
+} 
+#undef roundup_power
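+
+/* Illustrative arithmetic: roundup_power(0x3f000000, 128*1024*1024) ==
+ * 0x40000000, i.e. 1008MB of detected RAM is registered as a 1GB region. */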
+
+static int
+kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
+                     ptl_ni_limits_t *requested_limits,
+                     ptl_ni_limits_t *actual_limits)
+{
+        ptl_process_id_t    process_id;
+        int                 pkmem = atomic_read(&portal_kmemory);
+        IB_PORT_ATTRIBUTES *pattr;
+        FSTATUS             frc;
+        int                 rc;
+        int                 n;
+        int                 i;
+
+        LASSERT (nal == &kibnal_api);
+
+        if (nal->nal_refct != 0) {
+                if (actual_limits != NULL)
+                        *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
+                /* This module got the first ref */
+                PORTAL_MODULE_USE;
+                return (PTL_OK);
+        }
+
+        LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
+
+        frc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2, 
+                                       &kibnal_data.kib_interfaces);
+        if (frc != FSUCCESS) {
+                CERROR("IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2) = %d\n",
+                        frc);
+                return -ENOSYS;
+        }
+
+        init_MUTEX (&kibnal_data.kib_nid_mutex);
+        init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal);
+        kibnal_data.kib_nid = PTL_NID_ANY;
+
+        rwlock_init(&kibnal_data.kib_global_lock);
+
+        kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
+        PORTAL_ALLOC (kibnal_data.kib_peers,
+                      sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
+        if (kibnal_data.kib_peers == NULL) {
+                goto failed;
+        }
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
+                INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
+
+        spin_lock_init (&kibnal_data.kib_connd_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
+        INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
+        init_waitqueue_head (&kibnal_data.kib_connd_waitq);
+
+        spin_lock_init (&kibnal_data.kib_sched_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
+        INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
+        init_waitqueue_head (&kibnal_data.kib_sched_waitq);
+
+        spin_lock_init (&kibnal_data.kib_tx_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
+        INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
+        init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
+
+        PORTAL_ALLOC (kibnal_data.kib_tx_descs,
+                      IBNAL_TX_MSGS * sizeof(kib_tx_t));
+        if (kibnal_data.kib_tx_descs == NULL) {
+                CERROR ("Can't allocate tx descs\n");
+                goto failed;
+        }
+
+        /* lists/ptrs/locks initialised */
+        kibnal_data.kib_init = IBNAL_INIT_DATA;
+        /*****************************************************/
+
+        process_id.pid = 0;
+        process_id.nid = kibnal_data.kib_nid;
+        
+        rc = lib_init(&kibnal_lib, nal, process_id,
+                      requested_limits, actual_limits);
+        if (rc != PTL_OK) {
+                CERROR("lib_init failed: error %d\n", rc);
+                goto failed;
+        }
+
+        /* lib interface initialised */
+        kibnal_data.kib_init = IBNAL_INIT_LIB;
+        /*****************************************************/
+
+        for (i = 0; i < IBNAL_N_SCHED; i++) {
+                rc = kibnal_thread_start (kibnal_scheduler,
+                                          (void *)((unsigned long)i));
+                if (rc != 0) {
+                        CERROR("Can't spawn iibnal scheduler[%d]: %d\n",
+                               i, rc);
+                        goto failed;
+                }
+        }
+
+        rc = kibnal_thread_start (kibnal_connd, NULL);
+        if (rc != 0) {
+                CERROR ("Can't spawn iibnal connd: %d\n", rc);
+                goto failed;
+        }
+
+        n = sizeof(kibnal_data.kib_hca_guids) /
+            sizeof(kibnal_data.kib_hca_guids[0]);
+        frc = iibt_get_hca_guids(&n, kibnal_data.kib_hca_guids);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't get channel adapter guids: %d\n", frc);
+                goto failed;
+        }
+        if (n == 0) {
+                CERROR ("No channel adapters found\n");
+                goto failed;
+        }
+
+        /* Infinicon has per-HCA rather than per-CQ completion handlers */
+        frc = iibt_open_hca(kibnal_data.kib_hca_guids[0],
+                            kibnal_ca_callback,
+                            kibnal_ca_async_callback,
+                            &kibnal_data.kib_hca,
+                            &kibnal_data.kib_hca);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't open CA[0]: %d\n", frc);
+                goto failed;
+        }
+        
+        /* Channel Adapter opened */
+        kibnal_data.kib_init = IBNAL_INIT_HCA;
+        /*****************************************************/
+
+        kibnal_data.kib_hca_attrs.PortAttributesList = NULL;
+        kibnal_data.kib_hca_attrs.PortAttributesListSize = 0;
+        frc = iibt_query_hca(kibnal_data.kib_hca,
+                             &kibnal_data.kib_hca_attrs, NULL);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't size port attrs: %d\n", frc);
+                goto failed;
+        }
+        
+        PORTAL_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList,
+                     kibnal_data.kib_hca_attrs.PortAttributesListSize);
+        if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL)
+                goto failed;
+
+        /* Port attrs allocated */
+        kibnal_data.kib_init = IBNAL_INIT_PORTATTRS;
+        /*****************************************************/
+        
+        frc = iibt_query_hca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs,
+                             NULL);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't get port attrs for CA 0: %d\n", frc);
+                goto failed;
+        }
+
+        for (i = 0, pattr = kibnal_data.kib_hca_attrs.PortAttributesList;
+             pattr != NULL;
+             i++, pattr = pattr->Next) {
+                switch (pattr->PortState) {
+                default:
+                        CERROR("Unexpected port[%d] state %d\n",
+                               i, pattr->PortState);
+                        continue;
+                case PortStateDown:
+                        CDEBUG(D_NET, "port[%d] Down\n", i);
+                        continue;
+                case PortStateInit:
+                        CDEBUG(D_NET, "port[%d] Init\n", i);
+                        continue;
+                case PortStateArmed:
+                        CDEBUG(D_NET, "port[%d] Armed\n", i);
+                        continue;
+                        
+                case PortStateActive:
+                        CDEBUG(D_NET, "port[%d] Active\n", i);
+                        kibnal_data.kib_port = i;
+                        kibnal_data.kib_port_guid = pattr->GUID;
+                        kibnal_data.kib_port_pkey = pattr->PkeyTable[0];
+                        break;
+                }
+                break;
+        }
+
+        if (pattr == NULL) {
+                CERROR ("Can't find an active port\n");
+                goto failed;
+        }
+
+        CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid);
+        
+        /* Active port found */
+        kibnal_data.kib_init = IBNAL_INIT_PORT;
+        /*****************************************************/
+
+        frc = iibt_sd_register(&kibnal_data.kib_sd, NULL);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't register with SD: %d\n", frc);
+                goto failed;
+        }
+        
+        /* Registered with SD OK */
+        kibnal_data.kib_init = IBNAL_INIT_SD;
+        /*****************************************************/
+
+        frc = iibt_pd_allocate(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't create PD: %d\n", rc);
+                goto failed;
+        }
+        
+        /* flag PD initialised */
+        kibnal_data.kib_init = IBNAL_INIT_PD;
+        /*****************************************************/
+
+#if IBNAL_FMR
+        {
+                const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
+                struct ib_fmr_pool_param params = {
+                        .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
+                        .access            = (IB_ACCESS_LOCAL_WRITE |
+                                              IB_ACCESS_REMOTE_WRITE |
+                                              IB_ACCESS_REMOTE_READ),
+                        .pool_size         = pool_size,
+                        .dirty_watermark   = (pool_size * 3)/4,
+                        .flush_function    = NULL,
+                        .flush_arg         = NULL,
+                        .cache             = 1,
+                };
+                rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params,
+                                        &kibnal_data.kib_fmr_pool);
+                if (rc != 0) {
+                        CERROR ("Can't create FMR pool size %d: %d\n", 
+                                pool_size, rc);
+                        goto failed;
+                }
+        }
+
+        /* flag FMR pool initialised */
+        kibnal_data.kib_init = IBNAL_INIT_FMR;
+#endif
+        /*****************************************************/
+        if (IBNAL_WHOLE_MEM) {
+                IB_MR_PHYS_BUFFER phys;
+                IB_ACCESS_CONTROL access;
+                kib_md_t *md = &kibnal_data.kib_md;
+
+                memset(&access, 0, sizeof(access));
+                access.s.MWBindable = 1;
+                access.s.LocalWrite = 1;
+                access.s.RdmaRead = 1;
+                access.s.RdmaWrite = 1;
+
+                phys.PhysAddr = 0;
+                phys.Length = max_phys_mem(&kibnal_data.kib_hca_attrs);
+                if (phys.Length == 0) {
+                        CERROR ("couldn't determine the end of phys mem\n");
+                        goto failed;
+                }
+
+                rc = iibt_register_contig_physical_memory(kibnal_data.kib_hca,
+                                                          0,
+                                                          &phys, 1,
+                                                          0,
+                                                          kibnal_data.kib_pd,
+                                                          access,
+                                                          &md->md_handle,
+                                                          &md->md_addr,
+                                                          &md->md_lkey,
+                                                          &md->md_rkey);
+                if (rc != FSUCCESS) {
+                        CERROR("registering physical memory failed: %d\n", 
+                               rc);
+                        CERROR("falling back to registration per-rdma\n");
+                        md->md_handle = NULL;
+                } else {
+                        CDEBUG(D_NET, "registered "LPU64" bytes of mem\n",
+                               phys.Length);
+                        kibnal_data.kib_init = IBNAL_INIT_MR;
+                }
+        }
+
+        /*****************************************************/
+
+        rc = kibnal_setup_tx_descs();
+        if (rc != 0) {
+                CERROR ("Can't register tx descs: %d\n", rc);
+                goto failed;
+        }
+        
+        /* flag TX descs initialised */
+        kibnal_data.kib_init = IBNAL_INIT_TXD;
+        /*****************************************************/
+        
+        {
+                uint32 nentries;
+
+                frc = iibt_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
+                                     &kibnal_data.kib_cq, &kibnal_data.kib_cq,
+                                     &nentries);
+                if (frc != FSUCCESS) {
+                        CERROR ("Can't create RX CQ: %d\n", frc);
+                        goto failed;
+                }
+
+                /* flag CQ initialised */
+                kibnal_data.kib_init = IBNAL_INIT_CQ;
+
+                if (nentries < IBNAL_CQ_ENTRIES) {
+                        CERROR ("CQ only has %d entries, need %d\n", 
+                                nentries, IBNAL_CQ_ENTRIES);
+                        goto failed;
+                }
+
+                rc = iibt_cq_rearm(kibnal_data.kib_cq, CQEventSelNextWC);
+                if (rc != 0) {
+                        CERROR ("Failed to re-arm completion queue: %d\n", rc);
+                        goto failed;
+                }
+        }
+        
+        /*****************************************************/
+
+        rc = libcfs_nal_cmd_register(IIBNAL, &kibnal_cmd, NULL);
+        if (rc != 0) {
+                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
+                goto failed;
+        }
+
+        /* flag everything initialised */
+        kibnal_data.kib_init = IBNAL_INIT_ALL;
+        /*****************************************************/
+
+        printk(KERN_INFO "Lustre: Infinicon IB NAL loaded "
+               "(initial mem %d)\n", pkmem);
+
+        return (PTL_OK);
+
+ failed:
+        kibnal_api_shutdown (&kibnal_api);    
+        return (PTL_FAIL);
+}
+
+void __exit
+kibnal_module_fini (void)
+{
+#ifdef CONFIG_SYSCTL
+        if (kibnal_tunables.kib_sysctl != NULL)
+                unregister_sysctl_table (kibnal_tunables.kib_sysctl);
+#endif
+        PtlNIFini(kibnal_ni);
+
+        ptl_unregister_nal(IIBNAL);
+}
+
+int __init
+kibnal_module_init (void)
+{
+        int    rc;
+
+        if (sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN) {
+                CERROR("sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN\n");
+                return -EINVAL;
+        }
+
+        /* the following must be sizeof(int) for proc_dointvec() */
+        if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) {
+                CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n");
+                return -EINVAL;
+        }
+
+        kibnal_api.nal_ni_init = kibnal_api_startup;
+        kibnal_api.nal_ni_fini = kibnal_api_shutdown;
+
+        /* Initialise dynamic tunables to defaults once only */
+        kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
+
+        rc = ptl_register_nal(IIBNAL, &kibnal_api);
+        if (rc != PTL_OK) {
+                CERROR("Can't register IBNAL: %d\n", rc);
+                return (-ENOMEM);               /* or something... */
+        }
+
+        /* Pure gateways want the NAL started up at module load time... */
+        rc = PtlNIInit(IIBNAL, 0, NULL, NULL, &kibnal_ni);
+        if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
+                ptl_unregister_nal(IIBNAL);
+                return (-ENODEV);
+        }
+        
+#ifdef CONFIG_SYSCTL
+        /* Press on regardless even if registering sysctl doesn't work */
+        kibnal_tunables.kib_sysctl = 
+                register_sysctl_table (kibnal_top_ctl_table, 0);
+#endif
+        return (0);
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Kernel Infinicon IB NAL v0.01");
+MODULE_LICENSE("GPL");
+
+module_init(kibnal_module_init);
+module_exit(kibnal_module_fini);
+
diff --git a/lnet/klnds/iiblnd/iiblnd.h b/lnet/klnds/iiblnd/iiblnd.h
new file mode 100644 (file)
index 0000000..0a25a9a
--- /dev/null
@@ -0,0 +1,892 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#define DEBUG_SUBSYSTEM S_IBNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+#include <portals/nal.h>
+
+#include <linux/iba/ibt.h>
+
+#define GCC_VERSION (__GNUC__ * 10000 \
+                + __GNUC_MINOR__ * 100 \
+                + __GNUC_PATCHLEVEL__)
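+/* e.g. GCC 3.2.3 gives 3*10000 + 2*100 + 3 == 30203 */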
+
+/* Test for GCC > 3.2.2 */
+#if GCC_VERSION <= 30202
+/* GCC 3.2.2, and presumably several versions before it, will
+ * miscompile this driver. See
+ * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9853. */
+#error Invalid GCC version. Must use GCC >= 3.2.3
+#endif
+
+#define IBNAL_SERVICE_NAME   "iibnal"
+#define IBNAL_SERVICE_NUMBER 0x11b9a1
+
+#ifdef CONFIG_SMP
+# define IBNAL_N_SCHED      num_online_cpus()   /* # schedulers */
+#else
+# define IBNAL_N_SCHED      1                   /* # schedulers */
+#endif
+
+#define IBNAL_MIN_RECONNECT_INTERVAL HZ         /* first failed connection retry... */
+#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ)    /* ...exponentially increasing to this */
+
+#define IBNAL_MSG_SIZE       (4<<10)            /* max size of queued messages (inc hdr) */
+
+#define IBNAL_MSG_QUEUE_SIZE   8                /* # messages/RDMAs in-flight */
+#define IBNAL_CREDIT_HIGHWATER 7                /* when to eagerly return credits */
+/* a retry count of 7 means retry indefinitely; Infinicon recommended 5 */
+#define IBNAL_RETRY            5                /* # times to retry sends */
+#define IBNAL_RNR_RETRY        5                /* # times to retry on RNR NAK */
+#define IBNAL_CM_RETRY         5                /* # times to retry connection */
+#define IBNAL_FLOW_CONTROL     1
+#define IBNAL_ACK_TIMEOUT       20              /* 4.096us * 2^20 ~= 4 secs */
+
+#define IBNAL_NTX             64                /* # tx descs */
+/* This was reduced so that we register fewer than 255 pages per region;
+ * it will change if we register all memory. */
+#define IBNAL_NTX_NBLK        128               /* # reserved tx descs */
+
+#define IBNAL_PEER_HASH_SIZE  101               /* # peer lists */
+
+#define IBNAL_RESCHED         100               /* # scheduler loops before reschedule */
+
+#define IBNAL_CONCURRENT_PEERS 1000             /* # nodes all talking at once to me */
+
+/* default vals for runtime tunables */
+#define IBNAL_IO_TIMEOUT      50                /* default comms timeout (seconds) */
+
+/************************/
+/* derived constants... */
+
+/* TX messages (shared by all connections) */
+#define IBNAL_TX_MSGS       (IBNAL_NTX + IBNAL_NTX_NBLK)
+#define IBNAL_TX_MSG_BYTES  (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_TX_MSG_PAGES  ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
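+/* with the defaults above: 64 + 128 == 192 messages and, on 4K-page
+ * platforms, 192 * 4K == 768K, i.e. 192 premapped pages -- comfortably
+ * under the 255 pages-per-region limit noted at IBNAL_NTX_NBLK */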
+
+#define IBNAL_TX_MAX_SG (PTL_MD_MAX_IOV + 1)
+
+/* RX messages (per connection) */
+#define IBNAL_RX_MSGS       IBNAL_MSG_QUEUE_SIZE
+#define IBNAL_RX_MSG_BYTES  (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_RX_MSG_PAGES  ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+
+
+/* we may have up to 2 completions per transmit +
+   1 completion per receive, per connection */
+#define IBNAL_CQ_ENTRIES  ((2*IBNAL_TX_MSGS) +                          \
+                           (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS))
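+
+/* with the defaults: (2 * 192) + (8 * 1000) == 8384 entries requested */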
+
+#define IBNAL_RDMA_BASE  0x0eeb0000
+#define IBNAL_FMR        0
+#define IBNAL_WHOLE_MEM  1
+#define IBNAL_CKSUM      0
+//#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_PROCESS
+#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_INTERRUPT
+
+/* XXX I have no idea. */
+#define IBNAL_STARTING_PSN 1
+
+typedef struct 
+{
+        int               kib_io_timeout;       /* comms timeout (seconds) */
+        struct ctl_table_header *kib_sysctl;    /* sysctl interface */
+} kib_tunables_t;
+
+/* some of these have specific types in the stack that just map back
+ * to the uFOO types, like IB_{L,R}_KEY. */
+typedef struct
+{
+        int               ibp_npages;           /* # pages */
+        int               ibp_mapped;           /* mapped? */
+        __u64             ibp_vaddr;            /* mapped region vaddr */
+        __u32             ibp_lkey;             /* mapped region lkey */
+        __u32             ibp_rkey;             /* mapped region rkey */
+        IB_HANDLE         ibp_handle;           /* mapped region handle */
+        struct page      *ibp_pages[0];
+} kib_pages_t;
+
+typedef struct
+{
+        IB_HANDLE         md_handle;
+        __u32             md_lkey;
+        __u32             md_rkey;
+        __u64             md_addr;
+} kib_md_t __attribute__((packed));
+        
+typedef struct 
+{
+        int               kib_init;             /* initialisation state */
+        __u64             kib_incarnation;      /* which one am I */
+        int               kib_shutdown;         /* shut down? */
+        atomic_t          kib_nthreads;         /* # live threads */
+
+        __u64             kib_service_id;       /* service number I listen on */
+        __u64             kib_port_guid;        /* my GUID (lo 64 of GID)*/
+        __u16             kib_port_pkey;        /* my pkey, whatever that is */
+        ptl_nid_t         kib_nid;              /* my NID */
+        struct semaphore  kib_nid_mutex;        /* serialise NID ops */
+        struct semaphore  kib_nid_signal;       /* signal completion */
+        IB_HANDLE         kib_cep;              /* connection end point */
+
+        rwlock_t          kib_global_lock;      /* stabilize peer/conn ops */
+
+        struct list_head *kib_peers;            /* hash table of all my known peers */
+        int               kib_peer_hash_size;   /* size of kib_peers */
+        atomic_t          kib_npeers;           /* # peers extant */
+        atomic_t          kib_nconns;           /* # connections extant */
+
+        struct list_head  kib_connd_conns;      /* connections to progress */
+        struct list_head  kib_connd_peers;      /* peers waiting for a connection */
+        wait_queue_head_t kib_connd_waitq;      /* connection daemons sleep here */
+        unsigned long     kib_connd_waketime;   /* when connd will wake */
+        spinlock_t        kib_connd_lock;       /* serialise */
+
+        wait_queue_head_t kib_sched_waitq;      /* schedulers sleep here */
+        struct list_head  kib_sched_txq;        /* tx requiring attention */
+        struct list_head  kib_sched_rxq;        /* rx requiring attention */
+        spinlock_t        kib_sched_lock;       /* serialise */
+        
+        struct kib_tx    *kib_tx_descs;         /* all the tx descriptors */
+        kib_pages_t      *kib_tx_pages;         /* premapped tx msg pages */
+
+        struct list_head  kib_idle_txs;         /* idle tx descriptors */
+        struct list_head  kib_idle_nblk_txs;    /* idle reserved tx descriptors */
+        wait_queue_head_t kib_idle_tx_waitq;    /* block here for tx descriptor */
+        __u64             kib_next_tx_cookie;   /* RDMA completion cookie */
+        spinlock_t        kib_tx_lock;          /* serialise */
+        
+        IB_HANDLE         kib_hca;              /* The HCA */
+        int               kib_port;             /* port on the device */
+        IB_HANDLE         kib_pd;               /* protection domain */
+        IB_HANDLE         kib_sd;               /* SD handle */
+        IB_HANDLE         kib_cq;               /* completion queue */
+        kib_md_t          kib_md;               /* full-mem registration */
+
+        void             *kib_listen_handle;    /* where I listen for connections */
+
+        IBT_INTERFACE_UNION kib_interfaces;     /* The Infinicon IBT interface */
+
+        uint64              kib_hca_guids[8];   /* all the HCA guids */
+        IB_CA_ATTRIBUTES    kib_hca_attrs;      /* where to get HCA attrs */
+        FABRIC_OPERATION_DATA kib_fabopdata;    /* (un)advertise service record */
+} kib_data_t;
+
+#define IBNAL_INIT_NOTHING         0
+#define IBNAL_INIT_DATA            1
+#define IBNAL_INIT_LIB             2
+#define IBNAL_INIT_HCA             3
+#define IBNAL_INIT_PORTATTRS       4
+#define IBNAL_INIT_PORT            5
+#define IBNAL_INIT_SD              6
+#define IBNAL_INIT_PD              7
+#define IBNAL_INIT_FMR             8
+#define IBNAL_INIT_MR              9
+#define IBNAL_INIT_TXD             10 
+#define IBNAL_INIT_CQ              11 
+#define IBNAL_INIT_ALL             12 
+
+/************************************************************************
+ * Wire message structs.
+ * These are sent in the sender's byte order (i.e. the receiver flips).
+ * CAVEAT EMPTOR: other structs communicated between nodes (e.g. MAD
+ * private data and SM service info) are LE on the wire.
+ */
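+
+/* An illustrative sketch (not part of this interface): because messages
+ * travel in the sender's byte order, a receiver can detect a flipped
+ * peer by testing the magic in both byte orders:
+ *
+ *      if (msg->ibm_magic == IBNAL_MSG_MAGIC)
+ *              peer has my byte order;
+ *      else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC))
+ *              peer is opposite-endian, flip each field;
+ *      else
+ *              not an iibnal message;
+ */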
+
+/* also kib_md_t above */
+
+typedef struct
+{
+        __u32                 rd_key;           /* remote key */
+        __u32                 rd_nob;           /* # of bytes */
+        __u64                 rd_addr;          /* remote io vaddr */
+} kib_rdma_desc_t __attribute__((packed));
+
+typedef struct
+{
+        ptl_hdr_t         ibim_hdr;             /* portals header */
+        char              ibim_payload[0];      /* piggy-backed payload */
+} kib_immediate_msg_t __attribute__((packed));
+
+/* These arrays serve two purposes during rdma.  They are built on the passive
+ * side and sent to the active side as remote arguments.  On the active side
+ * the descs are used as a data structure on the way to local gather items.
+ * The different roles result in a split local/remote meaning of desc->rd_key */
+typedef struct
+{
+        ptl_hdr_t         ibrm_hdr;             /* portals header */
+        __u64             ibrm_cookie;          /* opaque completion cookie */
+        __u32             ibrm_num_descs;       /* how many descs */
+        kib_rdma_desc_t   ibrm_desc[0];         /* where to suck/blow */
+} kib_rdma_msg_t __attribute__((packed));
+
+#define kib_rdma_msg_len(num_descs) \
+        offsetof(kib_msg_t, ibm_u.rdma.ibrm_desc[num_descs])
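+/* e.g. kib_rdma_msg_len(2) counts the bytes up to and including two
+ * kib_rdma_desc_t entries -- the same zero-length-array sizing trick
+ * used for kib_pages_t */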
+
+typedef struct
+{
+        __u64             ibcm_cookie;          /* opaque completion cookie */
+        __u32             ibcm_status;          /* completion status */
+} kib_completion_msg_t __attribute__((packed));
+
+typedef struct
+{
+        __u32              ibm_magic;           /* I'm an iibnal message */
+        __u16              ibm_version;         /* this is my version number */
+        __u8               ibm_type;            /* msg type */
+        __u8               ibm_credits;         /* returned credits */
+#if IBNAL_CKSUM
+        __u32              ibm_nob;
+        __u32              ibm_cksum;
+#endif
+        union {
+                kib_immediate_msg_t   immediate;
+                kib_rdma_msg_t        rdma;
+                kib_completion_msg_t  completion;
+        } ibm_u __attribute__((packed));
+} kib_msg_t __attribute__((packed));
+
+#define IBNAL_MSG_MAGIC       0x0be91b91        /* unique magic */
+#define IBNAL_MSG_VERSION              1        /* current protocol version */
+
+#define IBNAL_MSG_NOOP              0xd0        /* nothing (just credits) */
+#define IBNAL_MSG_IMMEDIATE         0xd1        /* portals hdr + payload */
+#define IBNAL_MSG_PUT_RDMA          0xd2        /* portals PUT hdr + source rdma desc */
+#define IBNAL_MSG_PUT_DONE          0xd3        /* signal PUT rdma completion */
+#define IBNAL_MSG_GET_RDMA          0xd4        /* portals GET hdr + sink rdma desc */
+#define IBNAL_MSG_GET_DONE          0xd5        /* signal GET rdma completion */
+
+/***********************************************************************/
+
+typedef struct kib_rx                           /* receive message */
+{
+        struct list_head          rx_list;      /* queue for attention */
+        struct kib_conn          *rx_conn;      /* owning conn */
+        int                       rx_rdma;      /* RDMA completion posted? */
+        int                       rx_posted;    /* posted? */
+        __u64                     rx_vaddr;     /* pre-mapped buffer (hca vaddr) */
+        kib_msg_t                *rx_msg;       /* pre-mapped buffer (host vaddr) */
+        IB_WORK_REQ               rx_wrq;
+        IB_LOCAL_DATASEGMENT      rx_gl;        /* and its memory */
+} kib_rx_t;
+
+typedef struct kib_tx                           /* transmit message */
+{
+        struct list_head          tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
+        int                       tx_isnblk;    /* I'm reserved for non-blocking sends */
+        struct kib_conn          *tx_conn;      /* owning conn */
+        int                       tx_mapped;    /* mapped for RDMA? */
+        int                       tx_sending;   /* # tx callbacks outstanding */
+        int                       tx_status;    /* completion status */
+        unsigned long             tx_deadline;  /* completion deadline */
+        int                       tx_passive_rdma; /* peer sucks/blows */
+        int                       tx_passive_rdma_wait; /* waiting for peer to complete */
+        __u64                     tx_passive_rdma_cookie; /* completion cookie */
+        lib_msg_t                *tx_libmsg[2]; /* lib msgs to finalize on completion */
+        kib_md_t                  tx_md;        /* RDMA mapping (active/passive) */
+        __u64                     tx_vaddr;     /* pre-mapped buffer (hca vaddr) */
+        kib_msg_t                *tx_msg;       /* pre-mapped buffer (host vaddr) */
+        int                       tx_nsp;       /* # send work items */
+        IB_WORK_REQ               tx_wrq[IBNAL_TX_MAX_SG];    /* send work items... */
+        IB_LOCAL_DATASEGMENT      tx_gl[IBNAL_TX_MAX_SG];     /* ...and their memory */
+} kib_tx_t;
+
+#define KIB_TX_UNMAPPED       0
+#define KIB_TX_MAPPED         1
+#define KIB_TX_MAPPED_FMR     2
+
+typedef struct kib_wire_connreq
+{
+        __u32        wcr_magic;                 /* I'm an iibnal connreq */
+        __u16        wcr_version;               /* this is my version number */
+        __u16        wcr_queue_depth;           /* this is my receive queue size */
+        __u64        wcr_nid;                   /* peer's NID */
+        __u64        wcr_incarnation;           /* peer's incarnation */
+} kib_wire_connreq_t;
+
+typedef struct kib_gid
+{
+        __u64   hi, lo;
+} kib_gid_t;
+
+typedef struct kib_connreq
+{
+        /* connection-in-progress */
+        struct kib_conn                    *cr_conn;
+        kib_wire_connreq_t                  cr_wcr;
+        __u64                               cr_tid;
+        IB_SERVICE_RECORD                   cr_service;
+        kib_gid_t                           cr_gid;
+        IB_PATH_RECORD                      cr_path;
+        CM_REQUEST_INFO                     cr_cmreq;
+        CM_CONN_INFO                        cr_discarded;
+        CM_REJECT_INFO                      cr_rej_info;
+} kib_connreq_t;
+
+typedef struct kib_conn
+{ 
+        struct kib_peer    *ibc_peer;           /* owning peer */
+        struct list_head    ibc_list;           /* stash on peer's conn list */
+        __u64               ibc_incarnation;    /* which instance of the peer */
+        atomic_t            ibc_refcount;       /* # users */
+        int                 ibc_state;          /* what's happening */
+        atomic_t            ibc_nob;            /* # bytes buffered */
+        int                 ibc_nsends_posted;  /* # uncompleted sends */
+        int                 ibc_credits;        /* # credits I have */
+        int                 ibc_outstanding_credits; /* # credits to return */
+        int                 ibc_rcvd_disconnect;/* received discon request */
+        int                 ibc_sent_disconnect;/* sent discon request */
+        struct list_head    ibc_tx_queue;       /* send queue */
+        struct list_head    ibc_active_txs;     /* active tx awaiting completion */
+        spinlock_t          ibc_lock;           /* serialise */
+        kib_rx_t           *ibc_rxs;            /* the rx descs */
+        kib_pages_t        *ibc_rx_pages;       /* premapped rx msg pages */
+        IB_HANDLE           ibc_qp;             /* queue pair */
+        IB_HANDLE           ibc_cep;            /* connection ID? */
+        IB_QP_ATTRIBUTES_QUERY ibc_qp_attrs;    /* QP attrs */
+        kib_connreq_t      *ibc_connreq;        /* connection request state */
+} kib_conn_t;
+
+#define IBNAL_CONN_INIT_NOTHING      0          /* initial state */
+#define IBNAL_CONN_INIT_QP           1          /* ibc_qp set up */
+#define IBNAL_CONN_CONNECTING        2          /* started to connect */
+#define IBNAL_CONN_ESTABLISHED       3          /* connection established */
+#define IBNAL_CONN_SEND_DREQ         4          /* to send disconnect req */
+#define IBNAL_CONN_DREQ              5          /* sent disconnect req */
+#define IBNAL_CONN_DREP              6          /* sent disconnect rep */
+#define IBNAL_CONN_DISCONNECTED      7          /* no more QP or CM traffic */
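+
+/* typical lifecycle (sketch): INIT_NOTHING -> INIT_QP -> CONNECTING ->
+ * ESTABLISHED, then SEND_DREQ -> DREQ (or DREP on the passive side) ->
+ * DISCONNECTED; kibnal_put_conn() insists on DISCONNECTED before the
+ * final reference is dropped */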
+
+#define KIB_ASSERT_CONN_STATE(conn, state) do {                         \
+        LASSERTF((conn)->ibc_state == (state), "%d\n",                  \
+                 (conn)->ibc_state);                                    \
+} while (0)
+
+#define KIB_ASSERT_CONN_STATE_RANGE(conn, low, high) do {               \
+        LASSERTF((low) <= (high), "%d %d\n", (low), (high));            \
+        LASSERTF((conn)->ibc_state >= (low) &&                          \
+                 (conn)->ibc_state <= (high),                           \
+                 "%d\n", (conn)->ibc_state);                            \
+} while (0)
+
+typedef struct kib_peer
+{
+        struct list_head    ibp_list;           /* stash on global peer list */
+        struct list_head    ibp_connd_list;     /* schedule on kib_connd_peers */
+        ptl_nid_t           ibp_nid;            /* who's on the other end(s) */
+        atomic_t            ibp_refcount;       /* # users */
+        int                 ibp_persistence;    /* "known" peer refs */
+        struct list_head    ibp_conns;          /* all active connections */
+        struct list_head    ibp_tx_queue;       /* msgs waiting for a conn */
+        int                 ibp_connecting;     /* connecting+accepting */
+        unsigned long       ibp_reconnect_time; /* when reconnect may be attempted */
+        unsigned long       ibp_reconnect_interval; /* exponential backoff */
+} kib_peer_t;
+
+
+extern lib_nal_t       kibnal_lib;
+extern kib_data_t      kibnal_data;
+extern kib_tunables_t  kibnal_tunables;
+
+/******************************************************************************/
+/* Infinicon IBT interface wrappers */
+#define IIBT_IF (kibnal_data.kib_interfaces.ver2)
+
+static inline FSTATUS
+iibt_get_hca_guids(uint32 *hca_count, EUI64 *hca_guid_list)
+{
+        return IIBT_IF.GetCaGuids(hca_count, hca_guid_list);
+}
+
+static inline FSTATUS
+iibt_open_hca(EUI64                    hca_guid, 
+             IB_COMPLETION_CALLBACK   completion_callback,
+             IB_ASYNC_EVENT_CALLBACK  async_event_callback,
+             void                    *arg,
+             IB_HANDLE               *handle)
+{
+        return IIBT_IF.Vpi.OpenCA(hca_guid, completion_callback,
+                                  async_event_callback, arg, handle);
+}
+
+static inline FSTATUS
+iibt_query_hca(IB_HANDLE hca_handle, IB_CA_ATTRIBUTES *attrs, void **argp)
+{
+        return IIBT_IF.Vpi.QueryCA(hca_handle, attrs, argp);
+}
+
+static inline FSTATUS
+iibt_close_hca(IB_HANDLE hca_handle)
+{
+        return IIBT_IF.Vpi.CloseCA(hca_handle);
+}
+
+static inline FSTATUS
+iibt_pd_allocate(IB_HANDLE hca_handle, __u32 max_avs, IB_HANDLE *pd_handle)
+{
+        return IIBT_IF.Vpi.AllocatePD(hca_handle, max_avs, pd_handle);
+}
+
+static inline FSTATUS
+iibt_pd_free(IB_HANDLE pd_handle)
+{
+        return IIBT_IF.Vpi.FreePD(pd_handle);
+}
+
+static inline FSTATUS
+iibt_register_physical_memory(IB_HANDLE hca_handle, 
+                              IB_VIRT_ADDR requested_io_va,
+                              void *phys_buffers, uint64 nphys_buffers,
+                              uint32 io_va_offset, IB_HANDLE pd_handle,
+                              IB_ACCESS_CONTROL access,
+                              IB_HANDLE *mem_handle, 
+                              IB_VIRT_ADDR *actual_io_va,
+                              IB_L_KEY *lkey, IB_R_KEY *rkey)
+{
+        return IIBT_IF.Vpi.RegisterPhysMemRegion(hca_handle, requested_io_va,
+                                                 phys_buffers, nphys_buffers,
+                                                 io_va_offset, pd_handle, 
+                                                 access,
+                                                 mem_handle, actual_io_va,
+                                                 lkey, rkey);
+}
+
+static inline FSTATUS
+iibt_register_contig_physical_memory(IB_HANDLE hca_handle, 
+                                     IB_VIRT_ADDR requested_io_va,
+                                     IB_MR_PHYS_BUFFER *phys_buffers, 
+                                     uint64 nphys_buffers,
+                                     uint32 io_va_offset, IB_HANDLE pd_handle,
+                                     IB_ACCESS_CONTROL access,
+                                     IB_HANDLE *mem_handle, 
+                                     IB_VIRT_ADDR *actual_io_va,
+                                     IB_L_KEY *lkey, IB_R_KEY *rkey)
+{
+        return IIBT_IF.Vpi.RegisterContigPhysMemRegion(hca_handle, 
+                                                       requested_io_va,
+                                                       phys_buffers, 
+                                                       nphys_buffers,
+                                                       io_va_offset, pd_handle, 
+                                                       access,
+                                                       mem_handle, actual_io_va,
+                                                       lkey, rkey);
+}
+
+static inline FSTATUS
+iibt_register_memory(IB_HANDLE hca_handle, 
+                     void *virt_addr, unsigned int length,
+                     IB_HANDLE pd_handle,
+                     IB_ACCESS_CONTROL access,
+                     IB_HANDLE *mem_handle, 
+                     IB_L_KEY *lkey, IB_R_KEY *rkey)
+{
+        return IIBT_IF.Vpi.RegisterMemRegion(hca_handle, 
+                                             virt_addr, length,
+                                             pd_handle, 
+                                             access,
+                                             mem_handle,
+                                             lkey, rkey);
+}
+
+static inline FSTATUS
+iibt_deregister_memory(IB_HANDLE mem_handle)
+{
+        return IIBT_IF.Vpi.DeregisterMemRegion(mem_handle);
+}
+
+static inline FSTATUS
+iibt_cq_create(IB_HANDLE hca_handle, uint32 requested_size,
+              void *arg, IB_HANDLE *cq_handle, uint32 *actual_size)
+{
+        return IIBT_IF.Vpi.CreateCQ(hca_handle, requested_size,
+                                   arg, cq_handle, actual_size);
+}
+
+static inline FSTATUS
+iibt_cq_poll(IB_HANDLE cq_handle, IB_WORK_COMPLETION *wc)
+{
+        return IIBT_IF.Vpi.PollCQ(cq_handle, wc);
+}
+
+static inline FSTATUS
+iibt_cq_rearm(IB_HANDLE cq_handle, IB_CQ_EVENT_SELECT select)
+{
+        return IIBT_IF.Vpi.RearmCQ(cq_handle, select);
+}
+
+static inline FSTATUS
+iibt_cq_destroy(IB_HANDLE cq_handle)
+{
+        return IIBT_IF.Vpi.DestroyCQ(cq_handle);
+}
+
+static inline FSTATUS
+iibt_qp_create(IB_HANDLE hca_handle, IB_QP_ATTRIBUTES_CREATE *create_attr,
+              void *arg, IB_HANDLE *qp_handle, 
+              IB_QP_ATTRIBUTES_QUERY *query_attr)
+{
+        return IIBT_IF.Vpi.CreateQP(hca_handle, create_attr, arg, qp_handle, 
+                                    query_attr);
+}
+
+static inline FSTATUS
+iibt_qp_query(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_QUERY *query_attr,
+              void **arg_ptr)
+{
+        return IIBT_IF.Vpi.QueryQP(qp_handle, query_attr, arg_ptr);
+}
+
+static inline FSTATUS
+iibt_qp_modify(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_MODIFY *modify_attr,
+               IB_QP_ATTRIBUTES_QUERY *query_attr)
+{
+        return IIBT_IF.Vpi.ModifyQP(qp_handle, modify_attr, query_attr);
+}
+
+static inline FSTATUS
+iibt_qp_destroy(IB_HANDLE qp_handle)
+{
+        return IIBT_IF.Vpi.DestroyQP(qp_handle);
+}
+
+static inline FSTATUS
+iibt_postrecv(IB_HANDLE qp_handle, IB_WORK_REQ *work_req)
+{
+        return IIBT_IF.Vpi.PostRecv(qp_handle, work_req);
+}
+
+static inline FSTATUS
+iibt_postsend(IB_HANDLE qp_handle, IB_WORK_REQ *work_req)
+{
+        return IIBT_IF.Vpi.PostSend(qp_handle, work_req);
+}
+
+static inline FSTATUS
+iibt_sd_register(IB_HANDLE *sd_handle, CLIENT_CONTROL_PARAMETERS *p)
+{
+        return IIBT_IF.Sdi.Register(sd_handle, p);
+}
+
+static inline FSTATUS
+iibt_sd_deregister(IB_HANDLE sd_handle)
+{
+        return IIBT_IF.Sdi.Deregister(sd_handle);
+}
+
+static inline FSTATUS
+iibt_sd_port_fabric_operation(IB_HANDLE sd_handle, EUI64 port_guid,
+                              FABRIC_OPERATION_DATA *fod,
+                              PFABRIC_OPERATION_CALLBACK callback,
+                              COMMAND_CONTROL_PARAMETERS *p, void *arg)
+{
+        return IIBT_IF.Sdi.PortFabricOperation(sd_handle, port_guid,
+                                               fod, callback, p, arg);
+}
+
+static inline FSTATUS
+iibt_sd_query_port_fabric_information(IB_HANDLE sd_handle, EUI64 port_guid,
+                                      QUERY *qry,
+                                      PQUERY_CALLBACK callback,
+                                      COMMAND_CONTROL_PARAMETERS *p, void *arg)
+{
+        return IIBT_IF.Sdi.QueryPortFabricInformation(sd_handle, port_guid,
+                                                      qry, callback, p, arg);
+}
+
+static inline IB_HANDLE
+iibt_cm_create_cep(CM_CEP_TYPE type)
+{
+        return IIBT_IF.Cmi.CmCreateCEP(type);
+}
+
+static inline FSTATUS
+iibt_cm_modify_cep(IB_HANDLE cep, uint32 attr, char* value, uint32 len,
+                   uint32 offset)
+{
+        return IIBT_IF.Cmi.CmModifyCEP(cep, attr, value, len, offset);
+}
+
+static inline FSTATUS
+iibt_cm_destroy_cep(IB_HANDLE cep_handle)
+{
+        return IIBT_IF.Cmi.CmDestroyCEP(cep_handle);
+}
+
+static inline FSTATUS
+iibt_cm_listen(IB_HANDLE cep, CM_LISTEN_INFO *info,
+               PFN_CM_CALLBACK callback, void *arg)
+{
+        return IIBT_IF.Cmi.CmListen(cep, info, callback, arg);
+}
+
+static inline FSTATUS
+iibt_cm_cancel(IB_HANDLE cep)
+{
+        return IIBT_IF.Cmi.CmCancel(cep);
+}
+
+static inline FSTATUS
+iibt_cm_accept(IB_HANDLE cep, 
+               CM_CONN_INFO *send_info, CM_CONN_INFO *recv_info,
+               PFN_CM_CALLBACK callback, void *arg,
+               IB_HANDLE *new_cep)
+{
+        return IIBT_IF.Cmi.CmAccept(cep,
+                                    send_info, recv_info,
+                                    callback, arg, new_cep);
+}
+
+static inline FSTATUS
+iibt_cm_reject(IB_HANDLE cep, CM_REJECT_INFO *rej)
+{
+        return IIBT_IF.Cmi.CmReject(cep, rej);
+}
+
+static inline FSTATUS
+iibt_cm_disconnect(IB_HANDLE cep, CM_DREQUEST_INFO *req,
+                   CM_DREPLY_INFO *reply)
+{
+        return IIBT_IF.Cmi.CmDisconnect(cep, req, reply);
+}
+
+static inline FSTATUS
+iibt_cm_connect (IB_HANDLE cep, CM_REQUEST_INFO *req,
+                 PFN_CM_CALLBACK callback, void *arg)
+{
+        return IIBT_IF.Cmi.CmConnect (cep, req, callback, arg);
+}
+
+static inline int wrq_signals_completion(IB_WORK_REQ *wrq)
+{
+        return wrq->Req.SendRC.Options.s.SignaledCompletion == 1;
+}
+
+
+/******************************************************************************/
+
+/* these macros purposely avoid local variables so they don't increase
+ * stack consumption. */
+
+#define kib_peer_addref(peer) do {                                      \
+        LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n",          \
+                 atomic_read(&peer->ibp_refcount));                     \
+        CDEBUG(D_NET, "++peer[%p] -> "LPX64" (%d)\n",                   \
+               peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \
+        atomic_inc(&peer->ibp_refcount);                                \
+} while (0)
+
+#define kib_peer_decref(peer) do {                                      \
+        LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n",          \
+                 atomic_read(&peer->ibp_refcount));                     \
+        CDEBUG(D_NET, "--peer[%p] -> "LPX64" (%d)\n",                   \
+               peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \
+        if (atomic_dec_and_test (&peer->ibp_refcount)) {                \
+                CDEBUG (D_NET, "destroying peer "LPX64" %p\n",          \
+                        peer->ibp_nid, peer);                           \
+                kibnal_destroy_peer (peer);                             \
+        }                                                               \
+} while (0)
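+
+/* A minimal usage sketch of the refcounting discipline above (hypothetical
+ * caller, not part of this NAL): every path that takes a peer ref must
+ * balance it with a decref, and the final decref destroys the peer. */
+#if 0
+static void kib_peer_ref_example (kib_peer_t *peer)
+{
+        kib_peer_addref(peer);          /* hold peer across the critical use */
+        /* ... use peer ... */
+        kib_peer_decref(peer);          /* may invoke kibnal_destroy_peer() */
+}
+#endif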
+
+/******************************************************************************/
+
+static inline struct list_head *
+kibnal_nid2peerlist (ptl_nid_t nid) 
+{
+        unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size;
+        
+        return (&kibnal_data.kib_peers [hash]);
+}
+
+static inline int
+kibnal_peer_active(kib_peer_t *peer)
+{
+        /* Am I in the peer hash table? */
+        return (!list_empty(&peer->ibp_list));
+}
+
+static inline void
+kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
+{
+        /* CAVEAT EMPTOR: tx takes caller's ref on conn */
+
+        LASSERT (tx->tx_nsp > 0);               /* work items set up */
+        LASSERT (tx->tx_conn == NULL);          /* only set here */
+
+        tx->tx_conn = conn;
+        tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
+        list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+}
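+
+/* Callers of kibnal_queue_tx_locked() therefore take the conn ref first,
+ * e.g.
+ *         atomic_inc(&conn->ibc_refcount);
+ *         kibnal_queue_tx_locked(tx, conn);
+ * and that ref is dropped in kibnal_tx_done() when the tx retires. */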
+
+#define KIBNAL_SERVICE_KEY_MASK  (IB_SERVICE_RECORD_COMP_SERVICENAME |          \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_1 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_2 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_3 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_4 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_5 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_6 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_7 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_8)
+
+static inline __u64*
+kibnal_service_nid_field(IB_SERVICE_RECORD *srv)
+{
+        /* must be consistent with KIBNAL_SERVICE_KEY_MASK */
+        return (__u64 *)srv->ServiceData8;
+}
+
+
+static inline void
+kibnal_set_service_keys(IB_SERVICE_RECORD *srv, ptl_nid_t nid)
+{
+        LASSERT (strlen(IBNAL_SERVICE_NAME) < sizeof(srv->ServiceName));
+        memset (srv->ServiceName, 0, sizeof(srv->ServiceName));
+        strcpy (srv->ServiceName, IBNAL_SERVICE_NAME);
+
+        *kibnal_service_nid_field(srv) = cpu_to_le64(nid);
+}
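+
+/* A sketch (hypothetical NID, not part of this NAL) of what the keyed
+ * lookup relies on: the little-endian NID fills all eight ServiceData8
+ * bytes (components _1.._8 of KIBNAL_SERVICE_KEY_MASK), so a fabric query
+ * on that mask plus the service name matches exactly one NID's record. */
+#if 0
+static void kibnal_service_keys_example (IB_SERVICE_RECORD *srv)
+{
+        kibnal_set_service_keys(srv, (ptl_nid_t)0x1234567890abcdefULL);
+        /* now srv->ServiceName holds IBNAL_SERVICE_NAME and
+         * srv->ServiceData8 holds the NID, little-endian */
+}
+#endif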
+
+#if 0
+static inline void
+kibnal_show_rdma_attr (kib_conn_t *conn)
+{
+        struct ib_qp_attribute qp_attr;
+        int                    rc;
+        
+        memset (&qp_attr, 0, sizeof(qp_attr));
+        rc = ib_qp_query(conn->ibc_qp, &qp_attr);
+        if (rc != 0) {
+                CERROR ("Can't get qp attrs: %d\n", rc);
+                return;
+        }
+        
+        CWARN ("RDMA CAPABILITY: write %s read %s\n",
+               (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ?
+               (qp_attr.enable_rdma_write ? "enabled" : "disabled") : "invalid",
+               (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ?
+               (qp_attr.enable_rdma_read ? "enabled" : "disabled") : "invalid");
+}
+#endif
+
+#if CONFIG_X86
+static inline __u64
+kibnal_page2phys (struct page *p)
+{
+        __u64 page_number = p - mem_map;
+        
+        return (page_number << PAGE_SHIFT);
+}
+#else
+# error "no page->phys"
+#endif
+
+/* CAVEAT EMPTOR:
+ * We rely on tx/rx descriptor alignment to allow us to use the lowest bit
+ * of the work request id as a flag to determine if the completion is for a
+ * transmit or a receive.  It seems that the CQ entry's 'op' field
+ * isn't always set correctly on completions that occur after QP teardown. */
+
+static inline __u64
+kibnal_ptr2wreqid (void *ptr, int isrx)
+{
+        unsigned long lptr = (unsigned long)ptr;
+
+        LASSERT ((lptr & 1) == 0);
+        return (__u64)(lptr | (isrx ? 1 : 0));
+}
+
+static inline void *
+kibnal_wreqid2ptr (__u64 wreqid)
+{
+        return (void *)(((unsigned long)wreqid) & ~1UL);
+}
+
+static inline int
+kibnal_wreqid_is_rx (__u64 wreqid)
+{
+        return (wreqid & 1) != 0;
+}
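+
+/* A round-trip sketch of the flag bit (hypothetical, not part of this
+ * NAL): descriptors are at least 2-byte aligned, so bit 0 is free to
+ * carry the rx/tx flag and masking it off recovers the pointer. */
+#if 0
+static void kibnal_wreqid_example (kib_rx_t *rx)
+{
+        __u64 wreqid = kibnal_ptr2wreqid(rx, 1);        /* tagged as rx */
+
+        LASSERT (kibnal_wreqid_is_rx(wreqid));
+        LASSERT (kibnal_wreqid2ptr(wreqid) == (void *)rx);
+}
+#endif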
+
+static inline int
+kibnal_whole_mem(void)
+{
+        return kibnal_data.kib_md.md_handle != NULL;
+}
+
+extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid);
+extern void kibnal_destroy_peer (kib_peer_t *peer);
+extern int kibnal_del_peer (ptl_nid_t nid, int single_share);
+extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid);
+extern void kibnal_unlink_peer_locked (kib_peer_t *peer);
+extern int  kibnal_close_stale_conns_locked (kib_peer_t *peer, 
+                                              __u64 incarnation);
+extern kib_conn_t *kibnal_create_conn (void);
+extern void kibnal_put_conn (kib_conn_t *conn);
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+void kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg);
+
+extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access);
+extern void kibnal_free_pages (kib_pages_t *p);
+
+extern void kibnal_check_sends (kib_conn_t *conn);
+extern void kibnal_close_conn_locked (kib_conn_t *conn, int error);
+extern int  kibnal_thread_start (int (*fn)(void *arg), void *arg);
+extern int  kibnal_scheduler(void *arg);
+extern int  kibnal_connd (void *arg);
+extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob);
+extern void kibnal_close_conn (kib_conn_t *conn, int why);
+extern void kibnal_start_active_rdma (int type, int status, 
+                                      kib_rx_t *rx, lib_msg_t *libmsg, 
+                                      unsigned int niov, 
+                                      struct iovec *iov, ptl_kiov_t *kiov,
+                                      size_t offset, size_t nob);
+
+void kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev);
+void kibnal_ca_callback (void *ca_arg, void *cq_arg);
diff --git a/lnet/klnds/iiblnd/iiblnd_cb.c b/lnet/klnds/iiblnd/iiblnd_cb.c
new file mode 100644 (file)
index 0000000..a827ba5
--- /dev/null
@@ -0,0 +1,3018 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "iibnal.h"
+
+/*
+ *  LIB functions follow
+ *
+ */
+static void
+kibnal_schedule_tx_done (kib_tx_t *tx)
+{
+        unsigned long flags;
+
+        spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);
+
+        list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
+        wake_up (&kibnal_data.kib_sched_waitq);
+
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+}
+
+static void
+kibnal_tx_done (kib_tx_t *tx)
+{
+        ptl_err_t        ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
+        unsigned long    flags;
+        int              i;
+        FSTATUS          frc;
+#if IBNAL_FMR
+        int              rc;
+#endif
+
+        LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting callback */
+        LASSERT (!tx->tx_passive_rdma_wait);    /* mustn't be awaiting RDMA */
+
+        switch (tx->tx_mapped) {
+        default:
+                LBUG();
+
+        case KIB_TX_UNMAPPED:
+                break;
+
+        case KIB_TX_MAPPED:
+                if (in_interrupt()) {
+                        /* can't deregister memory in IRQ context... */
+                        kibnal_schedule_tx_done(tx);
+                        return;
+                }
+                frc = iibt_deregister_memory(tx->tx_md.md_handle);
+                LASSERT (frc == FSUCCESS);
+                tx->tx_mapped = KIB_TX_UNMAPPED;
+                break;
+
+#if IBNAL_FMR
+        case KIB_TX_MAPPED_FMR:
+                if (in_interrupt() && tx->tx_status != 0) {
+                        /* can't flush FMRs in IRQ context... */
+                        kibnal_schedule_tx_done(tx);
+                        return;
+                }              
+
+                rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr);
+                LASSERT (rc == 0);
+
+                if (tx->tx_status != 0)
+                        ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
+                tx->tx_mapped = KIB_TX_UNMAPPED;
+                break;
+#endif
+        }
+
+        for (i = 0; i < 2; i++) {
+                /* tx may have up to 2 libmsgs to finalise */
+                if (tx->tx_libmsg[i] == NULL)
+                        continue;
+
+                lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
+                tx->tx_libmsg[i] = NULL;
+        }
+        
+        if (tx->tx_conn != NULL) {
+                kibnal_put_conn (tx->tx_conn);
+                tx->tx_conn = NULL;
+        }
+
+        tx->tx_nsp = 0;
+        tx->tx_passive_rdma = 0;
+        tx->tx_status = 0;
+
+        spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+
+        if (tx->tx_isnblk) {
+                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
+        } else {
+                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
+                wake_up (&kibnal_data.kib_idle_tx_waitq);
+        }
+
+        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+}
+
+static kib_tx_t *
+kibnal_get_idle_tx (int may_block) 
+{
+        unsigned long  flags;
+        kib_tx_t      *tx = NULL;
+        ENTRY;
+        
+        for (;;) {
+                spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+
+                /* "normal" descriptor is free */
+                if (!list_empty (&kibnal_data.kib_idle_txs)) {
+                        tx = list_entry (kibnal_data.kib_idle_txs.next,
+                                         kib_tx_t, tx_list);
+                        break;
+                }
+
+                if (!may_block) {
+                        /* may dip into reserve pool */
+                        if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
+                                CERROR ("reserved tx desc pool exhausted\n");
+                                break;
+                        }
+
+                        tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
+                                         kib_tx_t, tx_list);
+                        break;
+                }
+
+                /* block for idle tx */
+                spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+
+                wait_event (kibnal_data.kib_idle_tx_waitq,
+                            !list_empty (&kibnal_data.kib_idle_txs) ||
+                            kibnal_data.kib_shutdown);
+        }
+
+        if (tx != NULL) {
+                list_del (&tx->tx_list);
+
+                /* Allocate a new passive RDMA completion cookie.  It might
+                 * not be needed, but we've got a lock right now and we're
+                 * unlikely to wrap... */
+                tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
+
+                LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+                LASSERT (tx->tx_nsp == 0);
+                LASSERT (tx->tx_sending == 0);
+                LASSERT (tx->tx_status == 0);
+                LASSERT (tx->tx_conn == NULL);
+                LASSERT (!tx->tx_passive_rdma);
+                LASSERT (!tx->tx_passive_rdma_wait);
+                LASSERT (tx->tx_libmsg[0] == NULL);
+                LASSERT (tx->tx_libmsg[1] == NULL);
+        }
+
+        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+        
+        RETURN(tx);
+}
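+
+/* The cookie allocated above travels out in ibrm_cookie with a PUT/GET
+ * RDMA request, is echoed back by the peer in its DONE message, and is
+ * matched against tx_passive_rdma_cookie in
+ * kibnal_complete_passive_rdma() below. */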
+
+static int
+kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* I would guess that if kibnal_get_peer (nid) == NULL,
+           and we're not routing, then 'nid' is very distant :) */
+        if ( nal->libnal_ni.ni_pid.nid == nid ) {
+                *dist = 0;
+        } else {
+                *dist = 1;
+        }
+
+        return 0;
+}
+
+static void
+kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
+{
+        struct list_head *ttmp;
+        unsigned long     flags;
+        int               idle;
+
+        spin_lock_irqsave (&conn->ibc_lock, flags);
+
+        list_for_each (ttmp, &conn->ibc_active_txs) {
+                kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);
+
+                LASSERT (tx->tx_passive_rdma ||
+                         !tx->tx_passive_rdma_wait);
+
+                LASSERT (tx->tx_passive_rdma_wait ||
+                         tx->tx_sending != 0);
+
+                if (!tx->tx_passive_rdma_wait ||
+                    tx->tx_passive_rdma_cookie != cookie)
+                        continue;
+
+                CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);
+
+                tx->tx_status = status;
+                tx->tx_passive_rdma_wait = 0;
+                idle = (tx->tx_sending == 0);
+
+                if (idle)
+                        list_del (&tx->tx_list);
+
+                spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+                /* I could be racing with tx callbacks.  It's whoever
+                 * _makes_ tx idle that frees it */
+                if (idle)
+                        kibnal_tx_done (tx);
+                return;
+        }
+                
+        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+        CERROR ("Unmatched (late?) RDMA completion "LPX64" from "LPX64"\n",
+                cookie, conn->ibc_peer->ibp_nid);
+}
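+
+/* NB a tx is idle only when tx_sending == 0 AND it isn't waiting on a
+ * passive RDMA completion; both the completion above and the send
+ * callback apply this test, and only the path that actually makes the
+ * tx idle may call kibnal_tx_done() on it. */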
+
+static __u32
+kibnal_lkey(kib_pages_t *ibp)
+{
+        if (kibnal_whole_mem())
+                return kibnal_data.kib_md.md_lkey;
+
+        return ibp->ibp_lkey;
+}
+
+static void
+kibnal_post_rx (kib_rx_t *rx, int do_credits)
+{
+        kib_conn_t   *conn = rx->rx_conn;
+        int           rc = 0;
+        unsigned long flags;
+        FSTATUS       frc;
+        ENTRY;
+
+        rx->rx_gl = (IB_LOCAL_DATASEGMENT) {
+                .Address = rx->rx_vaddr,
+                .Length  = IBNAL_MSG_SIZE,
+                .Lkey    = kibnal_lkey(conn->ibc_rx_pages),
+        };
+
+        rx->rx_wrq = (IB_WORK_REQ) {
+                .Operation              = WROpRecv,
+                .DSListDepth            = 1,
+                .MessageLen             = IBNAL_MSG_SIZE,
+                .WorkReqId              = kibnal_ptr2wreqid(rx, 1),
+                .DSList                 = &rx->rx_gl,
+        };
+
+        KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED,
+                                    IBNAL_CONN_DREP);
+        LASSERT (!rx->rx_posted);
+        rx->rx_posted = 1;
+        mb();
+
+        if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
+                rc = -ECONNABORTED;
+        else {
+                frc = iibt_postrecv(conn->ibc_qp, &rx->rx_wrq);
+                if (frc != FSUCCESS) {
+                        CDEBUG(D_NET, "post failed %d\n", frc);
+                        rc = -EINVAL;
+                } else {
+                        CDEBUG(D_NET, "posted rx %p\n", &rx->rx_wrq);
+                }
+        }
+
+        if (rc == 0) {
+                if (do_credits) {
+                        spin_lock_irqsave(&conn->ibc_lock, flags);
+                        conn->ibc_outstanding_credits++;
+                        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+                        kibnal_check_sends(conn);
+                }
+                EXIT;
+                return;
+        }
+
+        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+                CERROR ("Error posting receive -> "LPX64": %d\n",
+                        conn->ibc_peer->ibp_nid, rc);
+                kibnal_close_conn (rx->rx_conn, rc);
+        } else {
+                CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n",
+                        conn->ibc_peer->ibp_nid, rc);
+        }
+
+        /* Drop rx's ref */
+        kibnal_put_conn (conn);
+        EXIT;
+}
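+
+/* Re-posting a receive with do_credits != 0 advertises the freed buffer
+ * back to the peer: the bumped ibc_outstanding_credits is piggy-backed
+ * on the next outgoing message (see kibnal_check_sends() below). */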
+
+#if IBNAL_CKSUM
+static inline __u32 kibnal_cksum (void *ptr, int nob)
+{
+        char  *c  = ptr;
+        __u32  sum = 0;
+
+        while (nob-- > 0)
+                sum = ((sum << 1) | (sum >> 31)) + *c++;
+        
+        return (sum);
+}
+#endif
+
+static void hexdump(char *string, void *ptr, int len)
+{
+        unsigned char *c = ptr;
+        int i;
+
+        /* dumping is disabled by default; remove this return to enable */
+        return;
+
+        if (len < 0 || len > 2048) {
+                printk("hexdump: invalid length %d\n", len);
+                return;
+        }
+
+        printk("%d bytes of '%s' from 0x%p\n", len, string, ptr);
+
+        for (i = 0; i < len;) {
+                printk("%02x", *(c++));
+                i++;
+                if (!(i & 15))
+                        printk("\n");
+                else if (!(i & 1))
+                        printk(" ");
+        }
+
+        if (len & 15)
+                printk("\n");
+}
+
+static void
+kibnal_rx_callback (IB_WORK_COMPLETION *wc)
+{
+        kib_rx_t     *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->WorkReqId);
+        kib_msg_t    *msg = rx->rx_msg;
+        kib_conn_t   *conn = rx->rx_conn;
+        int           nob = wc->Length;
+        const int     base_nob = offsetof(kib_msg_t, ibm_u);
+        int           credits;
+        int           flipped;
+        unsigned long flags;
+        __u32         i;
+#if IBNAL_CKSUM
+        __u32         msg_cksum;
+        __u32         computed_cksum;
+#endif
+
+        /* we set the QP to the error state after we've finished
+         * disconnecting; maybe we should do so sooner. */
+        KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, 
+                                    IBNAL_CONN_DISCONNECTED);
+
+        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+        LASSERT (rx->rx_posted);
+        rx->rx_posted = 0;
+        mb();
+
+        /* receives complete with error in any case after we've started
+         * disconnecting */
+        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
+                goto failed;
+
+        if (wc->Status != WRStatusSuccess) {
+                CERROR("Rx from "LPX64" failed: %d\n", 
+                       conn->ibc_peer->ibp_nid, wc->Status);
+                goto failed;
+        }
+
+        if (nob < base_nob) {
+                CERROR ("Short rx from "LPX64": %d < expected %d\n",
+                        conn->ibc_peer->ibp_nid, nob, base_nob);
+                goto failed;
+        }
+
+        hexdump("rx", rx->rx_msg, sizeof(kib_msg_t));
+
+        /* Receiver does any byte flipping if necessary... */
+
+        if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
+                flipped = 0;
+        } else {
+                if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
+                        CERROR ("Unrecognised magic: %08x from "LPX64"\n", 
+                                msg->ibm_magic, conn->ibc_peer->ibp_nid);
+                        goto failed;
+                }
+                flipped = 1;
+                __swab16s (&msg->ibm_version);
+                LASSERT (sizeof(msg->ibm_type) == 1);
+                LASSERT (sizeof(msg->ibm_credits) == 1);
+        }
+
+        if (msg->ibm_version != IBNAL_MSG_VERSION) {
+                CERROR ("Incompatible msg version %d (%d expected)\n",
+                        msg->ibm_version, IBNAL_MSG_VERSION);
+                goto failed;
+        }
+
+#if IBNAL_CKSUM
+        if (nob != msg->ibm_nob) {
+                CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob);
+                goto failed;
+        }
+
+        msg_cksum = le32_to_cpu(msg->ibm_cksum);
+        msg->ibm_cksum = 0;
+        computed_cksum = kibnal_cksum (msg, nob);
+        
+        if (msg_cksum != computed_cksum) {
+                CERROR ("Checksum failure %d: (%d expected)\n",
+                        computed_cksum, msg_cksum);
+                /* XXX mismatches are logged but tolerated for now */
+                /* goto failed; */
+        }
+        CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob);
+#endif
+
+        /* Have I received credits that will let me send? */
+        credits = msg->ibm_credits;
+        if (credits != 0) {
+                spin_lock_irqsave(&conn->ibc_lock, flags);
+                conn->ibc_credits += credits;
+                spin_unlock_irqrestore(&conn->ibc_lock, flags);
+                
+                kibnal_check_sends(conn);
+        }
+
+        switch (msg->ibm_type) {
+        case IBNAL_MSG_NOOP:
+                kibnal_post_rx (rx, 1);
+                return;
+
+        case IBNAL_MSG_IMMEDIATE:
+                if (nob < base_nob + sizeof (kib_immediate_msg_t)) {
+                        CERROR ("Short IMMEDIATE from "LPX64": %d\n",
+                                conn->ibc_peer->ibp_nid, nob);
+                        goto failed;
+                }
+                break;
+                
+        case IBNAL_MSG_PUT_RDMA:
+        case IBNAL_MSG_GET_RDMA:
+                if (nob < base_nob + sizeof (kib_rdma_msg_t)) {
+                        CERROR ("Short RDMA msg from "LPX64": %d\n",
+                                conn->ibc_peer->ibp_nid, nob);
+                        goto failed;
+                }
+                if (flipped)
+                        __swab32s(&msg->ibm_u.rdma.ibrm_num_descs);
+
+                CDEBUG(D_NET, "%d RDMA: cookie "LPX64":\n",
+                       msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie);
+
+                if ((msg->ibm_u.rdma.ibrm_num_descs > PTL_MD_MAX_IOV) ||
+                    (kib_rdma_msg_len(msg->ibm_u.rdma.ibrm_num_descs) > 
+                     min(nob, IBNAL_MSG_SIZE))) {
+                        CERROR ("num_descs %d too large\n", 
+                                msg->ibm_u.rdma.ibrm_num_descs);
+                        goto failed;
+                }
+
+                for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) {
+                        kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i];
+
+                        if (flipped) {
+                                __swab32s(&desc->rd_key);
+                                __swab32s(&desc->rd_nob);
+                                __swab64s(&desc->rd_addr);
+                        }
+
+                        CDEBUG(D_NET, "  key %x, " "addr "LPX64", nob %u\n",
+                               desc->rd_key, desc->rd_addr, desc->rd_nob);
+                }
+                break;
+                        
+        case IBNAL_MSG_PUT_DONE:
+        case IBNAL_MSG_GET_DONE:
+                if (nob < base_nob + sizeof (kib_completion_msg_t)) {
+                        CERROR ("Short COMPLETION msg from "LPX64": %d\n",
+                                conn->ibc_peer->ibp_nid, nob);
+                        goto failed;
+                }
+                if (flipped)
+                        __swab32s(&msg->ibm_u.completion.ibcm_status);
+                
+                CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
+                       msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
+                       msg->ibm_u.completion.ibcm_status);
+
+                kibnal_complete_passive_rdma (conn, 
+                                              msg->ibm_u.completion.ibcm_cookie,
+                                              msg->ibm_u.completion.ibcm_status);
+                kibnal_post_rx (rx, 1);
+                return;
+                        
+        default:
+                CERROR ("Can't parse type from "LPX64": %d\n",
+                        conn->ibc_peer->ibp_nid, msg->ibm_type);
+                goto failed;
+        }
+
+        /* schedule for kibnal_rx() in thread context */
+        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+        
+        list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
+        wake_up (&kibnal_data.kib_sched_waitq);
+        
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+        return;
+        
+ failed:
+        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+        kibnal_close_conn(conn, -ECONNABORTED);
+
+        /* Don't re-post rx & drop its ref on conn */
+        kibnal_put_conn(conn);
+}
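+
+/* Everything above runs in the CQ completion path; message types that
+ * need lib_parse() (which may end up mapping buffers, so it can't run in
+ * interrupt context) are handed off to the scheduler thread, which calls
+ * kibnal_rx() below. */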
+
+void
+kibnal_rx (kib_rx_t *rx)
+{
+        kib_msg_t   *msg = rx->rx_msg;
+
+        /* Clear flag so I can detect if I've sent an RDMA completion */
+        rx->rx_rdma = 0;
+
+        switch (msg->ibm_type) {
+        case IBNAL_MSG_GET_RDMA:
+                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
+                /* If the incoming get was matched, I'll have initiated the
+                 * RDMA and the completion message... */
+                if (rx->rx_rdma)
+                        break;
+
+                /* Otherwise, I'll send a failed completion now to prevent
+                 * the peer's GET blocking for the full timeout. */
+                CERROR ("Completing unmatched RDMA GET from "LPX64"\n",
+                        rx->rx_conn->ibc_peer->ibp_nid);
+                kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO,
+                                          rx, NULL, 0, NULL, NULL, 0, 0);
+                break;
+                
+        case IBNAL_MSG_PUT_RDMA:
+                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
+                if (rx->rx_rdma)
+                        break;
+                /* This is most unusual, since even if lib_parse() didn't
+                 * match anything, it should have asked us to read (and
+                 * discard) the payload.  The portals header must be
+                 * inconsistent with this message type, so it's the
+                 * sender's fault for sending garbage and she can time
+                 * herself out... */
+                CERROR ("Uncompleted RMDA PUT from "LPX64"\n",
+                        rx->rx_conn->ibc_peer->ibp_nid);
+                break;
+
+        case IBNAL_MSG_IMMEDIATE:
+                lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
+                LASSERT (!rx->rx_rdma);
+                break;
+                
+        default:
+                LBUG();
+                break;
+        }
+
+        kibnal_post_rx (rx, 1);
+}
+
+static struct page *
+kibnal_kvaddr_to_page (unsigned long vaddr)
+{
+        struct page *page;
+
+        if (vaddr >= VMALLOC_START &&
+            vaddr < VMALLOC_END)
+                page = vmalloc_to_page ((void *)vaddr);
+#if CONFIG_HIGHMEM
+        else if (vaddr >= PKMAP_BASE &&
+                 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
+                page = vmalloc_to_page ((void *)vaddr);
+        /* in 2.4 ^ just walks the page tables */
+#endif
+        else
+                page = virt_to_page (vaddr);
+
+        if (!VALID_PAGE (page))
+                page = NULL;
+
+        return page;
+}
+
+static void
+kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset,
+                 unsigned long len, int active)
+{
+        kib_rdma_msg_t *ibrm = &tx->tx_msg->ibm_u.rdma;
+        kib_rdma_desc_t *desc;
+
+        LASSERTF(ibrm->ibrm_num_descs < PTL_MD_MAX_IOV, "%u\n", 
+                 ibrm->ibrm_num_descs);
+
+        desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs];
+        if (active)
+                desc->rd_key = kibnal_data.kib_md.md_lkey;
+        else
+                desc->rd_key = kibnal_data.kib_md.md_rkey;
+        desc->rd_nob = len;
+        desc->rd_addr = kibnal_page2phys(page) + page_offset +
+                        kibnal_data.kib_md.md_addr;
+
+        ibrm->ibrm_num_descs++;
+}
+
+static int
+kibnal_map_rdma_iov(kib_tx_t *tx, unsigned long vaddr, int nob, int active)
+{
+        struct page *page;
+        int page_offset, len;
+
+        while (nob > 0) {
+                page = kibnal_kvaddr_to_page(vaddr);
+                if (page == NULL)
+                        return -EFAULT;
+
+                page_offset = vaddr & (PAGE_SIZE - 1);
+                len = min(nob, (int)PAGE_SIZE - page_offset);
+                
+                kibnal_fill_ibrm(tx, page, page_offset, len, active);
+                nob -= len;
+                vaddr += len;
+        }
+        return 0;
+}
+
+static int
+kibnal_map_iov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
+                 int niov, struct iovec *iov, int offset, int nob, int active)
+{
+        void   *vaddr;
+        FSTATUS frc;
+
+        LASSERT (nob > 0);
+        LASSERT (niov > 0);
+        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+
+        while (offset >= iov->iov_len) {
+                offset -= iov->iov_len;
+                niov--;
+                iov++;
+                LASSERT (niov > 0);
+        }
+
+        if (nob > iov->iov_len - offset) {
+                CERROR ("Can't map multiple vaddr fragments\n");
+                return (-EMSGSIZE);
+        }
+
+        /* our large contiguous iov could be backed by multiple physical
+         * pages. */
+        if (kibnal_whole_mem()) {
+                int rc;
+                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
+                rc = kibnal_map_rdma_iov(tx, (unsigned long)iov->iov_base + 
+                                         offset, nob, active);
+                if (rc != 0) {
+                        CERROR ("Can't map iov: %d\n", rc);
+                        return rc;
+                }
+                return 0;
+        }
+
+        vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
+        tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
+
+        frc = iibt_register_memory(kibnal_data.kib_hca, vaddr, nob,
+                                   kibnal_data.kib_pd, access,
+                                   &tx->tx_md.md_handle, &tx->tx_md.md_lkey,
+                                   &tx->tx_md.md_rkey);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't map vaddr %p: %d\n", vaddr, frc);
+                return -EINVAL;
+        }
+
+        tx->tx_mapped = KIB_TX_MAPPED;
+        return (0);
+}
+
+static int
+kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
+                  int nkiov, ptl_kiov_t *kiov,
+                  int offset, int nob, int active)
+{
+        __u64                      *phys = NULL;
+        int                         page_offset;
+        int                         nphys;
+        int                         resid;
+        int                         phys_size = 0;
+        FSTATUS                     frc;
+        int                         i, rc = 0;
+
+        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+
+        LASSERT (nob > 0);
+        LASSERT (nkiov > 0);
+        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+
+        while (offset >= kiov->kiov_len) {
+                offset -= kiov->kiov_len;
+                nkiov--;
+                kiov++;
+                LASSERT (nkiov > 0);
+        }
+
+        page_offset = kiov->kiov_offset + offset;
+        nphys = 1;
+
+        if (!kibnal_whole_mem()) {
+                phys_size = nkiov * sizeof (*phys);
+                PORTAL_ALLOC(phys, phys_size);
+                if (phys == NULL) {
+                        CERROR ("Can't allocate tmp phys\n");
+                        return (-ENOMEM);
+                }
+
+                phys[0] = kibnal_page2phys(kiov->kiov_page);
+        } else {
+                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
+                kibnal_fill_ibrm(tx, kiov->kiov_page, kiov->kiov_offset, 
+                                 kiov->kiov_len, active);
+        }
+
+        resid = nob - (kiov->kiov_len - offset);
+
+        while (resid > 0) {
+                kiov++;
+                nkiov--;
+                LASSERT (nkiov > 0);
+
+                if (kiov->kiov_offset != 0 ||
+                    ((resid > PAGE_SIZE) &&
+                     kiov->kiov_len < PAGE_SIZE)) {
+                        /* Can't have gaps */
+                        CERROR ("Can't make payload contiguous in I/O VM: "
+                                "page %d, offset %d, len %d\n", nphys,
+                                kiov->kiov_offset, kiov->kiov_len);
+
+                        for (i = -nphys; i < nkiov; i++) {
+                                CERROR("kiov[%d] %p +%d for %d\n",
+                                       i, kiov[i].kiov_page,
+                                       kiov[i].kiov_offset,
+                                       kiov[i].kiov_len);
+                        }
+
+                        rc = -EINVAL;
+                        goto out;
+                }
+
+                if (nphys == PTL_MD_MAX_IOV) {
+                        CERROR ("payload too big (%d)\n", nphys);
+                        rc = -EMSGSIZE;
+                        goto out;
+                }
+
+                if (!kibnal_whole_mem()) {
+                        LASSERT (nphys * sizeof (*phys) < phys_size);
+                        phys[nphys] = kibnal_page2phys(kiov->kiov_page);
+                } else {
+                        if (kib_rdma_msg_len(nphys) > IBNAL_MSG_SIZE) {
+                                CERROR ("payload too big (%d)\n", nphys);
+                                rc = -EMSGSIZE;
+                                goto out;
+                        }
+                        kibnal_fill_ibrm(tx, kiov->kiov_page, 
+                                         kiov->kiov_offset, kiov->kiov_len,
+                                         active);
+                }
+
+                nphys++;
+                resid -= PAGE_SIZE;
+        }
+
+        if (kibnal_whole_mem())
+                goto out;
+
+#if 0
+        CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset);
+        for (i = 0; i < nphys; i++)
+                CWARN ("   [%d] "LPX64"\n", i, phys[i]);
+#endif
+
+#if IBNAL_FMR
+#error "iibnal hasn't learned about FMR yet"
+        rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
+                                       phys, nphys,
+                                       &tx->tx_md.md_addr,
+                                       page_offset,
+                                       &tx->tx_md.md_handle.fmr,
+                                       &tx->tx_md.md_lkey,
+                                       &tx->tx_md.md_rkey);
+#else
+        frc = iibt_register_physical_memory(kibnal_data.kib_hca,
+                                            IBNAL_RDMA_BASE,
+                                            phys, nphys,
+                                            0,          /* offset */
+                                            kibnal_data.kib_pd,
+                                            access,
+                                            &tx->tx_md.md_handle,
+                                            &tx->tx_md.md_addr,
+                                            &tx->tx_md.md_lkey,
+                                            &tx->tx_md.md_rkey);
+#endif
+        if (frc == FSUCCESS) {
+                CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n",
+                       nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey);
+#if IBNAL_FMR
+                tx->tx_mapped = KIB_TX_MAPPED_FMR;
+#else
+                tx->tx_mapped = KIB_TX_MAPPED;
+#endif
+        } else {
+                CERROR ("Can't map phys: %d\n", rc);
+                rc = -EFAULT;
+        }
+
+ out:
+        if (phys != NULL)
+                PORTAL_FREE(phys, phys_size);
+        return (rc);
+}
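+
+/* Either mapping path above yields the same thing for the RDMA peer: in
+ * whole-mem mode a descriptor per fragment against the pre-registered
+ * region, otherwise a single (key, addr, nob) from an on-the-fly memory
+ * registration; kibnal_start_passive_rdma() fills ibrm_desc[0] itself
+ * only in the latter case. */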
+
+static kib_conn_t *
+kibnal_find_conn_locked (kib_peer_t *peer)
+{
+        struct list_head *tmp;
+
+        /* just return the first connection */
+        list_for_each (tmp, &peer->ibp_conns) {
+                return (list_entry(tmp, kib_conn_t, ibc_list));
+        }
+
+        return (NULL);
+}
+
+void
+kibnal_check_sends (kib_conn_t *conn)
+{
+        unsigned long   flags;
+        kib_tx_t       *tx;
+        int             rc;
+        int             i;
+        int             done;
+        int             nwork;
+        ENTRY;
+
+        spin_lock_irqsave (&conn->ibc_lock, flags);
+
+        LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
+
+        if (list_empty(&conn->ibc_tx_queue) &&
+            conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
+                spin_unlock_irqrestore(&conn->ibc_lock, flags);
+                
+                tx = kibnal_get_idle_tx(0);     /* don't block */
+                if (tx != NULL)
+                        kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
+
+                spin_lock_irqsave(&conn->ibc_lock, flags);
+                
+                if (tx != NULL) {
+                        atomic_inc(&conn->ibc_refcount);
+                        kibnal_queue_tx_locked(tx, conn);
+                }
+        }
+
+        while (!list_empty (&conn->ibc_tx_queue)) {
+                tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
+
+                /* We rely on this for QP sizing */
+                LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= IBNAL_TX_MAX_SG);
+
+                LASSERT (conn->ibc_outstanding_credits >= 0);
+                LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
+                LASSERT (conn->ibc_credits >= 0);
+                LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
+
+                /* Not on ibc_rdma_queue */
+                LASSERT (!tx->tx_passive_rdma_wait);
+
+                if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
+                        GOTO(out, 0);
+
+                if (conn->ibc_credits == 0)     /* no credits */
+                        GOTO(out, 1);
+                
+                if (conn->ibc_credits == 1 &&   /* last credit reserved for */
+                    conn->ibc_outstanding_credits == 0) /* giving back credits */
+                        GOTO(out, 2);
+
+                list_del (&tx->tx_list);
+
+                if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
+                    (!list_empty(&conn->ibc_tx_queue) ||
+                     conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
+                        /* redundant NOOP */
+                        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+                        kibnal_tx_done(tx);
+                        spin_lock_irqsave(&conn->ibc_lock, flags);
+                        continue;
+                }
+
+                tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits;
+                conn->ibc_outstanding_credits = 0;
+
+                conn->ibc_nsends_posted++;
+                conn->ibc_credits--;
+
+                /* we only get a tx completion for the final rdma op */ 
+                tx->tx_sending = min(tx->tx_nsp, 2);
+                tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
+                list_add (&tx->tx_list, &conn->ibc_active_txs);
+#if IBNAL_CKSUM
+                tx->tx_msg->ibm_cksum = 0;
+                tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob);
+                CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob);
+#endif
+                spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+                /* NB the gap between removing tx from the queue and sending it
+                 * allows message re-ordering to occur */
+
+                LASSERT (tx->tx_nsp > 0);
+
+                rc = -ECONNABORTED;
+                nwork = 0;
+                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+                        tx->tx_status = 0;
+                        /* Driver only accepts 1 item at a time */
+                        for (i = 0; i < tx->tx_nsp; i++) {
+                                hexdump("tx", tx->tx_msg, sizeof(kib_msg_t));
+                                rc = iibt_postsend(conn->ibc_qp, 
+                                                   &tx->tx_wrq[i]);
+                                if (rc != 0)
+                                        break;
+                                if (wrq_signals_completion(&tx->tx_wrq[i]))
+                                        nwork++;
+                                CDEBUG(D_NET, "posted tx wrq %p\n", 
+                                       &tx->tx_wrq[i]);
+                        }
+                }
+
+                spin_lock_irqsave (&conn->ibc_lock, flags);
+                if (rc != 0) {
+                        /* NB credits are transferred in the actual
+                         * message, which can only be the last work item */
+                        conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
+                        conn->ibc_credits++;
+                        conn->ibc_nsends_posted--;
+
+                        tx->tx_status = rc;
+                        tx->tx_passive_rdma_wait = 0;
+                        tx->tx_sending -= tx->tx_nsp - nwork;
+
+                        done = (tx->tx_sending == 0);
+                        if (done)
+                                list_del (&tx->tx_list);
+                        
+                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        
+                        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
+                                CERROR ("Error %d posting transmit to "LPX64"\n", 
+                                        rc, conn->ibc_peer->ibp_nid);
+                        else
+                                CDEBUG (D_NET, "Error %d posting transmit to "
+                                        LPX64"\n", rc, conn->ibc_peer->ibp_nid);
+
+                        kibnal_close_conn (conn, rc);
+
+                        if (done)
+                                kibnal_tx_done (tx);
+                        return;
+                }
+                
+        }
+
+        EXIT;
+out:
+        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+}
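+
+/* Flow control recap: a tx may only be posted when the peer has a receive
+ * buffer for it (ibc_credits > 0), and the last credit is reserved for a
+ * message that returns credits, so both sides can't wedge with zero
+ * credits outstanding.  NOOPs exist purely to return credits when there
+ * is no real traffic to piggy-back them on. */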
+
+static void
+kibnal_tx_callback (IB_WORK_COMPLETION *wc)
+{
+        kib_tx_t     *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->WorkReqId);
+        kib_conn_t   *conn;
+        unsigned long flags;
+        int           idle;
+
+        conn = tx->tx_conn;
+        LASSERT (conn != NULL);
+        LASSERT (tx->tx_sending != 0);
+
+        spin_lock_irqsave(&conn->ibc_lock, flags);
+
+        CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx,
+               tx->tx_sending, tx->tx_nsp, wc->Status);
+
+        /* I could be racing with rdma completion.  Whoever makes 'tx' idle
+         * gets to free it, which also drops its ref on 'conn'.  If it's
+         * not me, then I take an extra ref on conn so it can't disappear
+         * under me. */
+
+        tx->tx_sending--;
+        idle = (tx->tx_sending == 0) &&         /* This is the final callback */
+               (!tx->tx_passive_rdma_wait);     /* Not waiting for RDMA completion */
+        if (idle)
+                list_del(&tx->tx_list);
+
+        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+               atomic_read (&conn->ibc_refcount));
+        atomic_inc (&conn->ibc_refcount);
+
+        if (tx->tx_sending == 0)
+                conn->ibc_nsends_posted--;
+
+        if (wc->Status != WRStatusSuccess &&
+            tx->tx_status == 0)
+                tx->tx_status = -ECONNABORTED;
+                
+        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+        if (idle)
+                kibnal_tx_done (tx);
+
+        if (wc->Status != WRStatusSuccess) {
+                CERROR ("Tx completion to "LPX64" failed: %d\n", 
+                        conn->ibc_peer->ibp_nid, wc->Status);
+                kibnal_close_conn (conn, -ENETDOWN);
+        } else {
+                /* can I shovel some more sends out the door? */
+                kibnal_check_sends(conn);
+        }
+
+        kibnal_put_conn (conn);
+}
+
+void 
+kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev)
+{
+        /* XXX flesh out.  this seems largely for async errors */
+        CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode);
+}
+
+void
+kibnal_ca_callback (void *ca_arg, void *cq_arg)
+{
+        IB_HANDLE cq = *(IB_HANDLE *)cq_arg;
+        IB_HANDLE ca = *(IB_HANDLE *)ca_arg;
+        IB_WORK_COMPLETION wc;
+        int armed = 0;
+
+        CDEBUG(D_NET, "ca %p cq %p\n", ca, cq);
+
+        for(;;) {
+                while (iibt_cq_poll(cq, &wc) == FSUCCESS) {
+                        if (kibnal_wreqid_is_rx(wc.WorkReqId))
+                                kibnal_rx_callback(&wc);
+                        else
+                                kibnal_tx_callback(&wc);
+                }
+                if (armed)
+                        return;
+                if (iibt_cq_rearm(cq, CQEventSelNextWC) != FSUCCESS) {
+                        CERROR("rearm failed?\n");
+                        return;
+                }
+                armed = 1;
+        }
+}
+
+void
+kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
+{
+        IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nsp];
+        IB_WORK_REQ         *wrq = &tx->tx_wrq[tx->tx_nsp];
+        int                       fence;
+        int                       nob = offsetof (kib_msg_t, ibm_u) + body_nob;
+
+        LASSERT (tx->tx_nsp >= 0 && 
+                 tx->tx_nsp < sizeof(tx->tx_wrq)/sizeof(tx->tx_wrq[0]));
+        LASSERT (nob <= IBNAL_MSG_SIZE);
+        
+        tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC;
+        tx->tx_msg->ibm_version = IBNAL_MSG_VERSION;
+        tx->tx_msg->ibm_type = type;
+#if IBNAL_CKSUM
+        tx->tx_msg->ibm_nob = nob;
+#endif
+        /* Fence the message if it's bundled with an RDMA read */
+        fence = (tx->tx_nsp > 0) &&
+                (type == IBNAL_MSG_PUT_DONE);
+
+        *gl = (IB_LOCAL_DATASEGMENT) {
+                .Address = tx->tx_vaddr,
+                .Length  = IBNAL_MSG_SIZE,
+                .Lkey    = kibnal_lkey(kibnal_data.kib_tx_pages),
+        };
+
+        wrq->WorkReqId      = kibnal_ptr2wreqid(tx, 0);
+        wrq->Operation      = WROpSend;
+        wrq->DSList         = gl;
+        wrq->DSListDepth    = 1;
+        wrq->MessageLen     = nob;
+        wrq->Req.SendRC.ImmediateData  = 0;
+        wrq->Req.SendRC.Options.s.SolicitedEvent         = 1;
+        wrq->Req.SendRC.Options.s.SignaledCompletion     = 1;
+        wrq->Req.SendRC.Options.s.ImmediateData          = 0;
+        wrq->Req.SendRC.Options.s.Fence                  = fence;
+
+        tx->tx_nsp++;
+}
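+
+/* The fence matters for PUT completions: the DONE send is queued behind
+ * the RDMA read(s) that fetch the payload, and fencing it stops the
+ * completion message from overtaking those reads. */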
+
+static void
+kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+{
+        unsigned long         flags;
+
+        spin_lock_irqsave(&conn->ibc_lock, flags);
+
+        kibnal_queue_tx_locked (tx, conn);
+        
+        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+        
+        kibnal_check_sends(conn);
+}
+
+static void
+kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
+{
+        unsigned long    flags;
+        kib_peer_t      *peer;
+        kib_conn_t      *conn;
+        rwlock_t        *g_lock = &kibnal_data.kib_global_lock;
+
+        /* If I get here, I've committed to send, so I complete the tx with
+         * failure on any problems */
+        
+        LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
+        LASSERT (tx->tx_nsp > 0);               /* work items have been set up */
+
+        read_lock (g_lock);
+        
+        peer = kibnal_find_peer_locked (nid);
+        if (peer == NULL) {
+                read_unlock (g_lock);
+                tx->tx_status = -EHOSTUNREACH;
+                kibnal_tx_done (tx);
+                return;
+        }
+
+        conn = kibnal_find_conn_locked (peer);
+        if (conn != NULL) {
+                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                       atomic_read (&conn->ibc_refcount));
+                atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
+                read_unlock (g_lock);
+                
+                kibnal_queue_tx (tx, conn);
+                return;
+        }
+        
+        /* Making one or more connections; I'll need a write lock... */
+        read_unlock (g_lock);
+        write_lock_irqsave (g_lock, flags);
+
+        peer = kibnal_find_peer_locked (nid);
+        if (peer == NULL) {
+                write_unlock_irqrestore (g_lock, flags);
+                tx->tx_status = -EHOSTUNREACH;
+                kibnal_tx_done (tx);
+                return;
+        }
+
+        conn = kibnal_find_conn_locked (peer);
+        if (conn != NULL) {
+                /* Connection exists; queue message on it */
+                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                       atomic_read (&conn->ibc_refcount));
+                atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
+                write_unlock_irqrestore (g_lock, flags);
+                
+                kibnal_queue_tx (tx, conn);
+                return;
+        }
+
+        if (peer->ibp_connecting == 0) {
+                if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
+                        write_unlock_irqrestore (g_lock, flags);
+                        tx->tx_status = -EHOSTUNREACH;
+                        kibnal_tx_done (tx);
+                        return;
+                }
+        
+                peer->ibp_connecting = 1;
+                kib_peer_addref(peer); /* extra ref for connd */
+        
+                spin_lock (&kibnal_data.kib_connd_lock);
+        
+                list_add_tail (&peer->ibp_connd_list,
+                               &kibnal_data.kib_connd_peers);
+                wake_up (&kibnal_data.kib_connd_waitq);
+        
+                spin_unlock (&kibnal_data.kib_connd_lock);
+        }
+        
+        /* A connection is being established; queue the message... */
+        list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
+
+        write_unlock_irqrestore (g_lock, flags);
+}
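+
+/* NB the locking dance above: the common case (peer and conn both exist)
+ * is served entirely under the read lock; only when a connection must be
+ * initiated do we retry the lookups under the write lock, since the world
+ * may have changed while no lock was held. */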
+
+static ptl_err_t
+kibnal_start_passive_rdma (int type, ptl_nid_t nid,
+                            lib_msg_t *libmsg, ptl_hdr_t *hdr)
+{
+        int         nob = libmsg->md->length;
+        kib_tx_t   *tx;
+        kib_msg_t  *ibmsg;
+        int         rc;
+        IB_ACCESS_CONTROL         access = {0,};
+        
+        LASSERT (type == IBNAL_MSG_PUT_RDMA || type == IBNAL_MSG_GET_RDMA);
+        LASSERT (nob > 0);
+        LASSERT (!in_interrupt());              /* Mapping could block */
+
+        access.s.MWBindable = 1;
+        access.s.LocalWrite = 1;
+        access.s.RdmaRead = 1;
+        access.s.RdmaWrite = 1;
+
+        tx = kibnal_get_idle_tx (1);           /* May block; caller is an app thread */
+        LASSERT (tx != NULL);
+
+        if ((libmsg->md->options & PTL_MD_KIOV) == 0) 
+                rc = kibnal_map_iov (tx, access,
+                                     libmsg->md->md_niov,
+                                     libmsg->md->md_iov.iov,
+                                     0, nob, 0);
+        else
+                rc = kibnal_map_kiov (tx, access,
+                                      libmsg->md->md_niov, 
+                                      libmsg->md->md_iov.kiov,
+                                      0, nob, 0);
+
+        if (rc != 0) {
+                CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc);
+                goto failed;
+        }
+        
+        if (type == IBNAL_MSG_GET_RDMA) {
+                /* reply gets finalized when tx completes */
+                tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, 
+                                                        nid, libmsg);
+                if (tx->tx_libmsg[1] == NULL) {
+                        CERROR ("Can't create reply for GET -> "LPX64"\n",
+                                nid);
+                        rc = -ENOMEM;
+                        goto failed;
+                }
+        }
+        
+        tx->tx_passive_rdma = 1;
+
+        ibmsg = tx->tx_msg;
+
+        ibmsg->ibm_u.rdma.ibrm_hdr = *hdr;
+        ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
+        /* map_kiov already filled the rdma descs for the whole_mem case */
+        if (!kibnal_whole_mem()) {
+                ibmsg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_rkey;
+                ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
+                ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
+                ibmsg->ibm_u.rdma.ibrm_num_descs = 1;
+        }
+
+        kibnal_init_tx_msg (tx, type, 
+                            kib_rdma_msg_len(ibmsg->ibm_u.rdma.ibrm_num_descs));
+
+        CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
+               LPX64", nob %d\n",
+               tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey,
+               tx->tx_md.md_addr, nob);
+        
+        /* libmsg gets finalized when tx completes. */
+        tx->tx_libmsg[0] = libmsg;
+
+        kibnal_launch_tx(tx, nid);
+        return (PTL_OK);
+
+ failed:
+        tx->tx_status = rc;
+        kibnal_tx_done (tx);
+        return (PTL_FAIL);
+}
+
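+/* Active RDMA: called from the scheduler with the rx that carried the
+ * peer's rdma descriptors.  Builds one RDMA read/write work request per
+ * remote descriptor (only the last one is signalled), then tacks on the
+ * matching GET_DONE/PUT_DONE completion message for the peer. */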
+void
+kibnal_start_active_rdma (int type, int status,
+                           kib_rx_t *rx, lib_msg_t *libmsg, 
+                           unsigned int niov,
+                           struct iovec *iov, ptl_kiov_t *kiov,
+                           size_t offset, size_t nob)
+{
+        kib_msg_t    *rxmsg = rx->rx_msg;
+        kib_msg_t    *txmsg;
+        kib_tx_t     *tx;
+        IB_ACCESS_CONTROL access = {0,};
+        IB_WR_OP      rdma_op;
+        int           rc;
+        __u32         i;
+
+        CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n",
+               type, status, niov, offset, nob);
+
+        /* Called by scheduler */
+        LASSERT (!in_interrupt ());
+
+        /* Either all pages or all vaddrs */
+        LASSERT (!(kiov != NULL && iov != NULL));
+
+        /* No data if we're completing with failure */
+        LASSERT (status == 0 || nob == 0);
+
+        LASSERT (type == IBNAL_MSG_GET_DONE ||
+                 type == IBNAL_MSG_PUT_DONE);
+
+        /* Flag I'm completing the RDMA.  Even if I fail to send the
+         * completion message, I will have tried my best so further
+         * attempts shouldn't be tried. */
+        LASSERT (!rx->rx_rdma);
+        rx->rx_rdma = 1;
+
+        if (type == IBNAL_MSG_GET_DONE) {
+                rdma_op  = WROpRdmaWrite;
+                LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);
+        } else {
+                access.s.LocalWrite = 1;
+                rdma_op  = WROpRdmaRead;
+                LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
+        }
+
+        tx = kibnal_get_idle_tx (0);           /* Mustn't block */
+        if (tx == NULL) {
+                CERROR ("tx descs exhausted on RDMA from "LPX64
+                        " completing locally with failure\n",
+                        rx->rx_conn->ibc_peer->ibp_nid);
+                lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE);
+                return;
+        }
+        LASSERT (tx->tx_nsp == 0);
+                        
+        if (nob == 0) 
+                GOTO(init_tx, 0);
+
+        /* We actually need to transfer some data (the transfer
+         * size could get truncated to zero when the incoming
+         * message is matched) */
+        if (kiov != NULL)
+                rc = kibnal_map_kiov (tx, access, niov, kiov, offset, nob, 1);
+        else
+                rc = kibnal_map_iov (tx, access, niov, iov, offset, nob, 1);
+        
+        if (rc != 0) {
+                CERROR ("Can't map RDMA -> "LPX64": %d\n", 
+                        rx->rx_conn->ibc_peer->ibp_nid, rc);
+                /* We'll skip the RDMA and complete with failure. */
+                status = rc;
+                nob = 0;
+                GOTO(init_tx, rc);
+        } 
+
+        if (!kibnal_whole_mem()) {
+                tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_lkey;
+                tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
+                tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
+                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1;
+        }
+
+        /* XXX ugh.  different page-sized hosts. */ 
+        if (tx->tx_msg->ibm_u.rdma.ibrm_num_descs !=
+            rxmsg->ibm_u.rdma.ibrm_num_descs) {
+                CERROR("tx descs (%u) != rx descs (%u)\n", 
+                       tx->tx_msg->ibm_u.rdma.ibrm_num_descs,
+                       rxmsg->ibm_u.rdma.ibrm_num_descs);
+                /* We'll skip the RDMA and complete with failure. */
+                status = -EPROTO;       /* rc is 0 here: mapping succeeded */
+                nob = 0;
+                GOTO(init_tx, status);
+        }
+
+        /* map_kiov filled in the rdma descs which describe our side of the
+         * rdma transfer. */
+        /* ibrm_num_descs was verified in rx_callback */
+        for(i = 0; i < rxmsg->ibm_u.rdma.ibrm_num_descs; i++) {
+                kib_rdma_desc_t *ldesc, *rdesc; /* local, remote */
+                IB_LOCAL_DATASEGMENT *ds = &tx->tx_gl[i];
+                IB_WORK_REQ  *wrq = &tx->tx_wrq[i];
+
+                ldesc = &tx->tx_msg->ibm_u.rdma.ibrm_desc[i];
+                rdesc = &rxmsg->ibm_u.rdma.ibrm_desc[i];
+
+                ds->Address = ldesc->rd_addr;
+                ds->Length  = ldesc->rd_nob;
+                ds->Lkey    = ldesc->rd_key;
+
+                memset(wrq, 0, sizeof(*wrq));
+                wrq->WorkReqId      = kibnal_ptr2wreqid(tx, 0);
+                wrq->Operation      = rdma_op;
+                wrq->DSList         = ds;
+                wrq->DSListDepth    = 1;
+                wrq->MessageLen     = ds->Length;
+                wrq->Req.SendRC.ImmediateData  = 0;
+                wrq->Req.SendRC.Options.s.SolicitedEvent         = 0;
+                wrq->Req.SendRC.Options.s.SignaledCompletion     = 0;
+                wrq->Req.SendRC.Options.s.ImmediateData          = 0;
+                wrq->Req.SendRC.Options.s.Fence                  = 0;
+                wrq->Req.SendRC.RemoteDS.Address = rdesc->rd_addr;
+                wrq->Req.SendRC.RemoteDS.Rkey = rdesc->rd_key;
+
+                /* only the last rdma post triggers tx completion */
+                if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1)
+                        wrq->Req.SendRC.Options.s.SignaledCompletion = 1;
+
+                tx->tx_nsp++;
+        }
+
+init_tx:
+        txmsg = tx->tx_msg;
+
+        txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
+        txmsg->ibm_u.completion.ibcm_status = status;
+        
+        kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
+
+        if (status == 0 && nob != 0) {
+                LASSERT (tx->tx_nsp > 1);
+                /* RDMA: libmsg gets finalized when the tx completes.  This
+                 * is after the completion message has been sent, which in
+                 * turn is after the RDMA has finished. */
+                tx->tx_libmsg[0] = libmsg;
+        } else {
+                LASSERT (tx->tx_nsp == 1);
+                /* No RDMA: local completion happens now! */
+                CDEBUG(D_WARNING,"No data: immediate completion\n");
+                lib_finalize (&kibnal_lib, NULL, libmsg,
+                              status == 0 ? PTL_OK : PTL_FAIL);
+        }
+
+        /* +1 ref for this tx... */
+        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+               rx->rx_conn, rx->rx_conn->ibc_state, 
+               rx->rx_conn->ibc_peer->ibp_nid,
+               atomic_read (&rx->rx_conn->ibc_refcount));
+        atomic_inc (&rx->rx_conn->ibc_refcount);
+        /* ...and queue it up */
+        kibnal_queue_tx(tx, rx->rx_conn);
+}
+
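+/* Common send path: payloads that fit in an IBNAL_MSG_SIZE buffer are
+ * copied inline into an IMMEDIATE message; larger PUTs and GETs are
+ * turned into passive RDMAs above, and a REPLY to a GET_RDMA becomes an
+ * active RDMA into the buffer the peer advertised. */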
+static ptl_err_t
+kibnal_sendmsg(lib_nal_t    *nal, 
+                void         *private,
+                lib_msg_t    *libmsg,
+                ptl_hdr_t    *hdr, 
+                int           type, 
+                ptl_nid_t     nid, 
+                ptl_pid_t     pid,
+                unsigned int  payload_niov, 
+                struct iovec *payload_iov, 
+                ptl_kiov_t   *payload_kiov,
+                size_t        payload_offset,
+                size_t        payload_nob)
+{
+        kib_msg_t  *ibmsg;
+        kib_tx_t   *tx;
+        int         nob;
+
+        /* NB 'private' is different depending on what we're sending.... */
+
+        CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64
+               " pid %d\n", payload_nob, payload_niov, nid , pid);
+
+        LASSERT (payload_nob == 0 || payload_niov > 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+        /* Thread context if we're sending payload */
+        LASSERT (!in_interrupt() || payload_niov == 0);
+        /* payload is either all vaddrs or all pages */
+        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+
+        switch (type) {
+        default:
+                LBUG();
+                return (PTL_FAIL);
+                
+        case PTL_MSG_REPLY: {
+                /* reply's 'private' is the incoming receive */
+                kib_rx_t *rx = private;
+
+                /* RDMA reply expected? */
+                if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
+                        kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
+                                                 rx, libmsg, payload_niov, 
+                                                 payload_iov, payload_kiov,
+                                                 payload_offset, payload_nob);
+                        return (PTL_OK);
+                }
+                
+                /* Incoming message consistent with immediate reply? */
+                if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
+                        CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n",
+                                nid, rx->rx_msg->ibm_type);
+                        return (PTL_FAIL);
+                }
+
+                /* Will it fit in a message? */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+                if (nob > IBNAL_MSG_SIZE) {
+                        CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n",
+                               nid, (int)payload_nob);
+                        return (PTL_FAIL);
+                }
+                break;
+        }
+
+        case PTL_MSG_GET:
+                /* might the REPLY message be big enough to need RDMA? */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
+                if (nob > IBNAL_MSG_SIZE)
+                        return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, 
+                                                          nid, libmsg, hdr));
+                break;
+
+        case PTL_MSG_ACK:
+                LASSERT (payload_nob == 0);
+                break;
+
+        case PTL_MSG_PUT:
+                /* Is the payload big enough to need RDMA? */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+                if (nob > IBNAL_MSG_SIZE)
+                        return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
+                                                          nid, libmsg, hdr));
+                
+                break;
+        }
+
+        tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
+                                  type == PTL_MSG_REPLY ||
+                                  in_interrupt()));
+        if (tx == NULL) {
+                CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", 
+                        type, nid, in_interrupt() ? " (intr)" : "");
+                return (PTL_NO_SPACE);
+        }
+
+        ibmsg = tx->tx_msg;
+        ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
+
+        if (payload_nob > 0) {
+                if (payload_kiov != NULL)
+                        lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
+                                          payload_niov, payload_kiov,
+                                          payload_offset, payload_nob);
+                else
+                        lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
+                                         payload_niov, payload_iov,
+                                         payload_offset, payload_nob);
+        }
+
+        kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
+                            offsetof(kib_immediate_msg_t, 
+                                     ibim_payload[payload_nob]));
+
+        /* libmsg gets finalized when tx completes */
+        tx->tx_libmsg[0] = libmsg;
+
+        kibnal_launch_tx(tx, nid);
+        return (PTL_OK);
+}
+
+static ptl_err_t
+kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
+               ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+               unsigned int payload_niov, struct iovec *payload_iov,
+               size_t payload_offset, size_t payload_len)
+{
+        return (kibnal_sendmsg(nal, private, cookie,
+                               hdr, type, nid, pid,
+                               payload_niov, payload_iov, NULL,
+                               payload_offset, payload_len));
+}
+
+static ptl_err_t
+kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
+                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                     unsigned int payload_niov, ptl_kiov_t *payload_kiov, 
+                     size_t payload_offset, size_t payload_len)
+{
+        return (kibnal_sendmsg(nal, private, cookie,
+                               hdr, type, nid, pid,
+                               payload_niov, NULL, payload_kiov,
+                               payload_offset, payload_len));
+}
+
+static ptl_err_t
+kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
+                 unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
+                 size_t offset, size_t mlen, size_t rlen)
+{
+        kib_rx_t    *rx = private;
+        kib_msg_t   *rxmsg = rx->rx_msg;
+        int          msg_nob;
+        
+        LASSERT (mlen <= rlen);
+        LASSERT (!in_interrupt ());
+        /* Either all pages or all vaddrs */
+        LASSERT (!(kiov != NULL && iov != NULL));
+
+        switch (rxmsg->ibm_type) {
+        default:
+                LBUG();
+                return (PTL_FAIL);
+                
+        case IBNAL_MSG_IMMEDIATE:
+                msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+                if (msg_nob > IBNAL_MSG_SIZE) {
+                        CERROR ("Immediate message from "LPX64" too big: %d\n",
+                                rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
+                        return (PTL_FAIL);
+                }
+
+                if (kiov != NULL)
+                        lib_copy_buf2kiov(niov, kiov, offset,
+                                          rxmsg->ibm_u.immediate.ibim_payload,
+                                          mlen);
+                else
+                        lib_copy_buf2iov(niov, iov, offset,
+                                         rxmsg->ibm_u.immediate.ibim_payload,
+                                         mlen);
+
+                lib_finalize (nal, NULL, libmsg, PTL_OK);
+                return (PTL_OK);
+
+        case IBNAL_MSG_GET_RDMA:
+                /* We get called here just to discard any junk after the
+                 * GET hdr. */
+                LASSERT (libmsg == NULL);
+                lib_finalize (nal, NULL, libmsg, PTL_OK);
+                return (PTL_OK);
+
+        case IBNAL_MSG_PUT_RDMA:
+                kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0,
+                                          rx, libmsg, 
+                                          niov, iov, kiov, offset, mlen);
+                return (PTL_OK);
+        }
+}
+
+static ptl_err_t
+kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
+              unsigned int niov, struct iovec *iov, 
+              size_t offset, size_t mlen, size_t rlen)
+{
+        return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
+                                offset, mlen, rlen));
+}
+
+static ptl_err_t
+kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
+                     unsigned int niov, ptl_kiov_t *kiov, 
+                     size_t offset, size_t mlen, size_t rlen)
+{
+        return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
+                                offset, mlen, rlen));
+}
+
+/*****************************************************************************
+ * the rest of this file concerns connection management.  active connections
+ * start with kibnal_connect_peer, passive connections start with
+ * kibnal_listen_callback.  active disconnects start with kibnal_close_conn,
+ * kibnal_cm_callback starts passive disconnects and contains the guts of
+ * how the disconnect state machine progresses.
+ *****************************************************************************/
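+
+/* Rough sketch of the conn state machine driven below: a conn goes
+ * INIT_QP -> CONNECTING -> ESTABLISHED.  An active disconnect goes
+ * ESTABLISHED -> SEND_DREQ (close_conn_locked) -> DREQ (the connd sends
+ * the disconnect request); a passive one goes straight to DREP in
+ * cm_callback().  Either way, FCM_DISCONNECTED/FCM_DISCONNECT_REPLY
+ * moves the conn to DISCONNECTED, pending txs are flushed and the last
+ * refs drain. */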
+
+int
+kibnal_thread_start (int (*fn)(void *arg), void *arg)
+{
+        long    pid = kernel_thread (fn, arg, 0);
+
+        if (pid < 0)
+                return ((int)pid);
+
+        atomic_inc (&kibnal_data.kib_nthreads);
+        return (0);
+}
+
+static void
+kibnal_thread_fini (void)
+{
+        atomic_dec (&kibnal_data.kib_nthreads);
+}
+
+/* This can be called by anyone at any time to close a connection.  If
+ * the connection is still established it is handed to the connd, which
+ * starts the disconnect in a safe context.  It has no effect if called
+ * on a connection that is already disconnecting. */
+void
+kibnal_close_conn_locked (kib_conn_t *conn, int error)
+{
+        /* This just does the immediate housekeeping, and schedules the
+         * connection for the connd to finish off.
+         * Caller holds kib_global_lock exclusively in irq context */
+        kib_peer_t   *peer = conn->ibc_peer;
+
+        KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_CONNECTING,
+                                    IBNAL_CONN_DISCONNECTED);
+
+        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
+                return; /* already disconnecting */
+
+        CDEBUG (error == 0 ? D_NET : D_ERROR,
+                "closing conn to "LPX64": error %d\n", peer->ibp_nid, error);
+
+        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+                /* kib_connd_conns takes ibc_list's ref */
+                list_del (&conn->ibc_list);
+        } else {
+                /* new ref for kib_connd_conns */
+                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                       atomic_read (&conn->ibc_refcount));
+                atomic_inc (&conn->ibc_refcount);
+        }
+        
+        if (list_empty (&peer->ibp_conns) &&
+            peer->ibp_persistence == 0) {
+                /* Non-persistent peer with no more conns... */
+                kibnal_unlink_peer_locked (peer);
+        }
+
+        conn->ibc_state = IBNAL_CONN_SEND_DREQ;
+
+        spin_lock (&kibnal_data.kib_connd_lock);
+
+        list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+        wake_up (&kibnal_data.kib_connd_waitq);
+                
+        spin_unlock (&kibnal_data.kib_connd_lock);
+}
+
+void
+kibnal_close_conn (kib_conn_t *conn, int error)
+{
+        unsigned long     flags;
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        kibnal_close_conn_locked (conn, error);
+        
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+}
+
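+/* A connection attempt to the peer failed: back off exponentially
+ * (ibp_reconnect_interval doubles, capped at
+ * IBNAL_MAX_RECONNECT_INTERVAL) and complete any queued transmits with
+ * -EHOSTUNREACH; kibnal_launch_tx() won't retry the peer until
+ * ibp_reconnect_time passes. */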
+static void
+kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
+{
+        LIST_HEAD        (zombies);
+        kib_tx_t         *tx;
+        unsigned long     flags;
+
+        LASSERT (rc != 0);
+        LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        LASSERT (peer->ibp_connecting != 0);
+        peer->ibp_connecting--;
+
+        if (peer->ibp_connecting != 0) {
+                /* another connection attempt under way (loopback?)... */
+                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+                return;
+        }
+
+        if (list_empty(&peer->ibp_conns)) {
+                /* Say when active connection can be re-attempted */
+                peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
+                /* Increase reconnection interval */
+                peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
+                                                    IBNAL_MAX_RECONNECT_INTERVAL);
+        
+                /* Take peer's blocked transmits; I'll complete
+                 * them with error */
+                while (!list_empty (&peer->ibp_tx_queue)) {
+                        tx = list_entry (peer->ibp_tx_queue.next,
+                                         kib_tx_t, tx_list);
+                        
+                        list_del (&tx->tx_list);
+                        list_add_tail (&tx->tx_list, &zombies);
+                }
+                
+                if (kibnal_peer_active(peer) &&
+                    (peer->ibp_persistence == 0)) {
+                        /* failed connection attempt on non-persistent peer */
+                        kibnal_unlink_peer_locked (peer);
+                }
+        } else {
+                /* Can't have blocked transmits if there are connections */
+                LASSERT (list_empty(&peer->ibp_tx_queue));
+        }
+        
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+        if (!list_empty (&zombies))
+                CERROR ("Deleting messages for "LPX64": connection failed\n",
+                        peer->ibp_nid);
+
+        while (!list_empty (&zombies)) {
+                tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+                list_del (&tx->tx_list);
+                /* complete now */
+                tx->tx_status = -EHOSTUNREACH;
+                kibnal_tx_done (tx);
+        }
+}
+
+static void
+kibnal_connreq_done (kib_conn_t *conn, int active, int status)
+{
+        int               state = conn->ibc_state;
+        kib_peer_t       *peer = conn->ibc_peer;
+        kib_tx_t         *tx;
+        unsigned long     flags;
+        int               i;
+
+        /* passive connection has no connreq & vice versa */
+        LASSERTF(!active == !(conn->ibc_connreq != NULL),
+                 "%d %p\n", active, conn->ibc_connreq);
+        if (active) {
+                PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
+                conn->ibc_connreq = NULL;
+        }
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        LASSERT (peer->ibp_connecting != 0);
+        
+        if (status == 0) {                         
+                /* connection established... */
+                KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_CONNECTING);
+                conn->ibc_state = IBNAL_CONN_ESTABLISHED;
+
+                if (!kibnal_peer_active(peer)) {
+                        /* ...but peer deleted meantime */
+                        status = -ECONNABORTED;
+                }
+        } else {
+                KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_INIT_QP,
+                                            IBNAL_CONN_CONNECTING);
+        }
+
+        if (status == 0) {
+                /* Everything worked! */
+
+                peer->ibp_connecting--;
+
+                /* +1 ref for ibc_list; caller(== CM)'s ref remains until
+                 * the IB_CM_IDLE callback */
+                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                       atomic_read (&conn->ibc_refcount));
+                atomic_inc (&conn->ibc_refcount);
+                list_add (&conn->ibc_list, &peer->ibp_conns);
+                
+                /* reset reconnect interval for next attempt */
+                peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+
+                /* post blocked sends to the new connection */
+                spin_lock (&conn->ibc_lock);
+                
+                while (!list_empty (&peer->ibp_tx_queue)) {
+                        tx = list_entry (peer->ibp_tx_queue.next, 
+                                         kib_tx_t, tx_list);
+                        
+                        list_del (&tx->tx_list);
+
+                        /* +1 ref for each tx */
+                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                               atomic_read (&conn->ibc_refcount));
+                        atomic_inc (&conn->ibc_refcount);
+                        kibnal_queue_tx_locked (tx, conn);
+                }
+                
+                spin_unlock (&conn->ibc_lock);
+
+                /* Nuke any dangling conns from a different peer instance... */
+                kibnal_close_stale_conns_locked (conn->ibc_peer,
+                                                 conn->ibc_incarnation);
+
+                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+                /* queue up all the receives */
+                for (i = 0; i < IBNAL_RX_MSGS; i++) {
+                        /* +1 ref for rx desc */
+                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                               atomic_read (&conn->ibc_refcount));
+                        atomic_inc (&conn->ibc_refcount);
+
+                        CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n",
+                               i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg,
+                               conn->ibc_rxs[i].rx_vaddr);
+
+                        kibnal_post_rx (&conn->ibc_rxs[i], 0);
+                }
+
+                kibnal_check_sends (conn);
+                return;
+        }
+
+        /* connection failed */
+        if (state == IBNAL_CONN_CONNECTING) {
+                /* schedule for connd to close */
+                kibnal_close_conn_locked (conn, status);
+        } else {
+                /* Don't have a CM comm_id; just wait for refs to drain */
+                conn->ibc_state = IBNAL_CONN_DISCONNECTED;
+        } 
+
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+        kibnal_peer_connect_failed (conn->ibc_peer, active, status);
+
+        /* If we didn't establish the connection we don't have to pass
+         * through the disconnect protocol before dropping the CM ref */
+        if (state < IBNAL_CONN_CONNECTING) 
+                kibnal_put_conn (conn);
+}
+
+static int
+kibnal_accept (kib_conn_t **connp, IB_HANDLE *cep,
+                ptl_nid_t nid, __u64 incarnation, int queue_depth)
+{
+        kib_conn_t    *conn = kibnal_create_conn();
+        kib_peer_t    *peer;
+        kib_peer_t    *peer2;
+        unsigned long  flags;
+
+        if (conn == NULL)
+                return (-ENOMEM);
+
+        if (queue_depth != IBNAL_MSG_QUEUE_SIZE) {
+                CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n",
+                       nid, queue_depth, IBNAL_MSG_QUEUE_SIZE);
+                atomic_dec (&conn->ibc_refcount);
+                kibnal_destroy_conn(conn);
+                return (-EPROTO);
+        }
+        
+        /* assume 'nid' is a new peer */
+        peer = kibnal_create_peer (nid);
+        if (peer == NULL) {
+                CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, nid, /* ibc_peer not set yet */
+                       atomic_read (&conn->ibc_refcount));
+                atomic_dec (&conn->ibc_refcount);
+                kibnal_destroy_conn(conn);
+                return (-ENOMEM);
+        }
+        
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        peer2 = kibnal_find_peer_locked(nid);
+        if (peer2 == NULL) {
+                /* peer table takes my ref on peer */
+                list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid));
+        } else {
+                kib_peer_decref (peer);
+                peer = peer2;
+        }
+
+        kib_peer_addref(peer); /* +1 ref for conn */
+        peer->ibp_connecting++;
+
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+        conn->ibc_peer = peer;
+        conn->ibc_state = IBNAL_CONN_CONNECTING;
+        /* conn->ibc_cep is set when cm_accept is called */
+        conn->ibc_incarnation = incarnation;
+        conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+
+        *connp = conn;
+        return (0);
+}
+
+static void kibnal_set_qp_state(IB_HANDLE *qp, IB_QP_STATE state)
+{
+        IB_QP_ATTRIBUTES_MODIFY modify_attr = {0,};
+        FSTATUS frc;
+
+        modify_attr.RequestState = state;
+
+        frc = iibt_qp_modify(qp, &modify_attr, NULL);
+        if (frc != FSUCCESS)
+                CERROR("couldn't set qp state to %d, error %d\n", state, frc);
+}
+
+static void kibnal_flush_pending(kib_conn_t *conn)
+{
+        LIST_HEAD        (zombies); 
+        struct list_head *tmp;
+        struct list_head *nxt;
+        kib_tx_t         *tx;
+        unsigned long     flags;
+        int               done;
+
+        /* NB we wait until the connection has closed before completing
+         * outstanding passive RDMAs so we can be sure the network can't 
+         * touch the mapped memory any more. */
+        KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_DISCONNECTED);
+
+        /* set the QP to the error state so that we get flush callbacks
+         * on our posted receives which can then drop their conn refs */
+        kibnal_set_qp_state(conn->ibc_qp, QPStateError);
+
+        spin_lock_irqsave (&conn->ibc_lock, flags);
+
+        /* grab passive RDMAs not waiting for the tx callback */
+        list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
+                tx = list_entry (tmp, kib_tx_t, tx_list);
+
+                LASSERT (tx->tx_passive_rdma ||
+                         !tx->tx_passive_rdma_wait);
+
+                LASSERT (tx->tx_passive_rdma_wait ||
+                         tx->tx_sending != 0);
+
+                /* still waiting for tx callback? */
+                if (!tx->tx_passive_rdma_wait)
+                        continue;
+
+                tx->tx_status = -ECONNABORTED;
+                tx->tx_passive_rdma_wait = 0;
+                done = (tx->tx_sending == 0);
+
+                if (!done)
+                        continue;
+
+                list_del (&tx->tx_list);
+                list_add (&tx->tx_list, &zombies);
+        }
+
+        /* grab all blocked transmits */
+        list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
+                tx = list_entry (tmp, kib_tx_t, tx_list);
+                
+                list_del (&tx->tx_list);
+                list_add (&tx->tx_list, &zombies);
+        }
+        
+        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+        while (!list_empty(&zombies)) {
+                tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+                list_del(&tx->tx_list);
+                kibnal_tx_done (tx);
+        }
+}
+
+static void
+kibnal_reject (IB_HANDLE cep, uint16_t reason)
+{
+        CM_REJECT_INFO *rej;
+
+        PORTAL_ALLOC(rej, sizeof(*rej));
+        if (rej == NULL) /* PORTAL_ALLOC() will CERROR on failure */
+                return;  
+
+        rej->Reason = reason;
+        iibt_cm_reject(cep, rej);
+        PORTAL_FREE(rej, sizeof(*rej));
+}
+
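+/* Walk a QP to RTS with two iibt_qp_modify() calls, following the usual
+ * IB verbs sequence: first RTR with the peer's QPN, starting PSN and
+ * path, then RTS with our send PSN, retry counts and ack timeout. */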
+static FSTATUS
+kibnal_qp_rts(IB_HANDLE qp_handle, __u32 qpn, __u8 resp_res, 
+              IB_PATH_RECORD *path, __u8 init_depth, __u32 send_psn)
+{
+        IB_QP_ATTRIBUTES_MODIFY modify_attr;
+        FSTATUS frc;
+        ENTRY;
+
+        modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+                .RequestState           = QPStateReadyToRecv,
+                .RecvPSN                = IBNAL_STARTING_PSN,
+                .DestQPNumber           = qpn,
+                .ResponderResources     = resp_res,
+                .MinRnrTimer            = UsecToRnrNakTimer(2000), /* 2000us = 2ms */
+                .Attrs                  = (IB_QP_ATTR_RECVPSN |
+                                           IB_QP_ATTR_DESTQPNUMBER | 
+                                           IB_QP_ATTR_RESPONDERRESOURCES | 
+                                           IB_QP_ATTR_DESTAV | 
+                                           IB_QP_ATTR_PATHMTU | 
+                                           IB_QP_ATTR_MINRNRTIMER),
+        };
+        GetAVFromPath(0, path, &modify_attr.PathMTU, NULL, 
+                      &modify_attr.DestAV);
+
+        frc = iibt_qp_modify(qp_handle, &modify_attr, NULL);
+        if (frc != FSUCCESS) 
+                RETURN(frc);
+
+        modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+                .RequestState           = QPStateReadyToSend,
+                .FlowControl            = TRUE,
+                .InitiatorDepth         = init_depth,
+                .SendPSN                = send_psn,
+                .LocalAckTimeout        = path->PktLifeTime + 2, /* 2 or 1? */
+                .RetryCount             = IBNAL_RETRY,
+                .RnrRetryCount          = IBNAL_RNR_RETRY,
+                .Attrs                  = (IB_QP_ATTR_FLOWCONTROL | 
+                                           IB_QP_ATTR_INITIATORDEPTH | 
+                                           IB_QP_ATTR_SENDPSN | 
+                                           IB_QP_ATTR_LOCALACKTIMEOUT | 
+                                           IB_QP_ATTR_RETRYCOUNT | 
+                                           IB_QP_ATTR_RNRRETRYCOUNT),
+        };
+
+        frc = iibt_qp_modify(qp_handle, &modify_attr, NULL);
+        RETURN(frc);
+}
+
+static void
+kibnal_connect_reply (IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+        IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
+        kib_conn_t *conn = arg;
+        kib_wire_connreq_t *wcr;
+        CM_REPLY_INFO *rep = &info->Info.Reply;
+        uint16_t reason;
+        FSTATUS frc;
+
+        wcr = (kib_wire_connreq_t *)info->Info.Reply.PrivateData;
+
+        if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
+                CERROR ("Can't connect "LPX64": bad magic %08x\n",
+                        conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic));
+                GOTO(reject, reason = RC_USER_REJ);
+        }
+        
+        if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
+                CERROR ("Can't connect "LPX64": bad version %d\n",
+                        conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_version));
+                GOTO(reject, reason = RC_USER_REJ);
+        }
+                        
+        if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
+                CERROR ("Can't connect "LPX64": bad queue depth %d\n",
+                        conn->ibc_peer->ibp_nid, 
+                        le16_to_cpu(wcr->wcr_queue_depth));
+                GOTO(reject, reason = RC_USER_REJ);
+        }
+                        
+        if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) {
+                CERROR ("Unexpected NID "LPX64" from "LPX64"\n",
+                        le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid);
+                GOTO(reject, reason = RC_USER_REJ);
+        }
+
+        CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n",
+               conn, conn->ibc_peer->ibp_nid);
+
+        conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation);
+        conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+
+        frc = kibnal_qp_rts(conn->ibc_qp, rep->QPN, 
+                            min_t(__u8, rep->ArbInitiatorDepth,
+                                  ca_attr->MaxQPResponderResources),
+                            &conn->ibc_connreq->cr_path, 
+                            min_t(__u8, rep->ArbResponderResources,
+                                  ca_attr->MaxQPInitiatorDepth),
+                            rep->StartingPSN);
+        if (frc != FSUCCESS) {
+                CERROR("Connection %p -> "LPX64" QP RTS/RTR failed: %d\n",
+                       conn, conn->ibc_peer->ibp_nid, frc);
+                GOTO(reject, reason = RC_NO_QP);
+        }
+
+        /* the callback arguments are ignored for an active accept */
+        conn->ibc_connreq->cr_discarded.Status = FSUCCESS;
+        frc = iibt_cm_accept(cep, &conn->ibc_connreq->cr_discarded, 
+                             NULL, NULL, NULL, NULL);
+        if (frc != FCM_CONNECT_ESTABLISHED) {
+                CERROR("Connection %p -> "LPX64" CMAccept failed: %d\n",
+                       conn, conn->ibc_peer->ibp_nid, frc);
+                kibnal_connreq_done (conn, 1, -ECONNABORTED);
+                /* XXX don't call reject after accept fails? */
+                return;
+        }
+
+        CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n",
+               conn, conn->ibc_peer->ibp_nid);
+
+        kibnal_connreq_done (conn, 1, 0);
+        return;
+
+reject:
+        kibnal_reject(cep, reason);
+        kibnal_connreq_done (conn, 1, -EPROTO);
+}
+
+/* ib_cm.h has a wealth of information on the CM procedures */
+static void
+kibnal_cm_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+        kib_conn_t       *conn = arg;
+
+        CDEBUG(D_NET, "status 0x%x\n", info->Status);
+
+        /* Established Connection Notifier */
+        switch (info->Status) {
+        default:
+                CERROR("unknown status %d on Connection %p -> "LPX64"\n",
+                       info->Status, conn, conn->ibc_peer->ibp_nid);
+                LBUG();
+                break;
+
+        case FCM_CONNECT_REPLY:
+                kibnal_connect_reply(cep, info, arg);
+                break;
+
+        case FCM_DISCONNECT_REQUEST:
+                /* XXX lock around these state management bits? */
+                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
+                        kibnal_close_conn (conn, 0);
+                conn->ibc_state = IBNAL_CONN_DREP;
+                iibt_cm_disconnect(conn->ibc_cep, NULL, NULL);
+                break;
+
+        /* these both guarantee that no more cm callbacks will occur */
+        case FCM_DISCONNECTED: /* aka FCM_DISCONNECT_TIMEOUT */
+        case FCM_DISCONNECT_REPLY:
+                CDEBUG(D_NET, "Connection %p -> "LPX64" disconnect done.\n",
+                       conn, conn->ibc_peer->ibp_nid);
+
+                conn->ibc_state = IBNAL_CONN_DISCONNECTED;
+                kibnal_flush_pending(conn);
+                kibnal_put_conn(conn);        /* Lose CM's ref */
+                break;
+        }
+
+        return;
+}
+
+static int
+kibnal_set_cm_flags(IB_HANDLE cep)
+{
+        FSTATUS frc;
+        uint32 value = 1;
+
+        frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK,
+                                 (char *)&value, sizeof(value), 0);
+        if (frc != FSUCCESS) {
+                CERROR("error setting timeout callback: %d\n", frc);
+                return -1;
+        }
+
+#if 0
+        frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, (char *)&value,
+                                 sizeof(value), 0);
+        if (frc != FSUCCESS) {
+                CERROR("error setting async accept: %d\n", frc);
+                return -1;
+        }
+#endif
+
+        return 0;
+}
+
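+/* Passive connection establishment: validate the wire connreq in the CM
+ * request, create the conn (and peer, if new), move the QP to RTS and
+ * accept with our own wire connreq in the reply's private data;
+ * kibnal_cm_callback() handles the connection from then on. */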
+void
+kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+        IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
+        IB_QP_ATTRIBUTES_QUERY *query;
+        CM_REQUEST_INFO    *req;
+        CM_CONN_INFO       *rep = NULL, *rcv = NULL;
+        kib_wire_connreq_t *wcr;
+        kib_conn_t         *conn = NULL;
+        uint16_t            reason = 0;
+        FSTATUS             frc;
+        int                 rc = 0;
+        
+        LASSERT(cep);
+        LASSERT(info);
+        LASSERT(arg == NULL); /* no conn yet for passive */
+
+        CDEBUG(D_NET, "status 0x%x\n", info->Status);
+
+        req = &info->Info.Request;
+        wcr = (kib_wire_connreq_t *)req->PrivateData;
+
+        CDEBUG(D_NET, "%d from "LPX64"\n", info->Status, 
+               le64_to_cpu(wcr->wcr_nid));
+        
+        if (info->Status == FCM_CONNECT_CANCEL)
+                return;
+        
+        LASSERT (info->Status == FCM_CONNECT_REQUEST);
+        
+        if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
+                CERROR ("Can't accept: bad magic %08x\n",
+                        le32_to_cpu(wcr->wcr_magic));
+                GOTO(out, reason = RC_USER_REJ);
+        }
+
+        if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
+                CERROR ("Can't accept: bad version %d\n",
+                        le16_to_cpu(wcr->wcr_version));
+                GOTO(out, reason = RC_USER_REJ);
+        }
+
+        rc = kibnal_accept(&conn, cep,
+                           le64_to_cpu(wcr->wcr_nid),
+                           le64_to_cpu(wcr->wcr_incarnation),
+                           le16_to_cpu(wcr->wcr_queue_depth));
+        if (rc != 0) {
+                CERROR ("Can't accept "LPX64": %d\n",
+                        le64_to_cpu(wcr->wcr_nid), rc);
+                GOTO(out, reason = RC_NO_RESOURCES);
+        }
+
+        frc = kibnal_qp_rts(conn->ibc_qp, req->CEPInfo.QPN,
+                            min_t(__u8, req->CEPInfo.OfferedInitiatorDepth, 
+                                  ca_attr->MaxQPResponderResources),
+                            &req->PathInfo.Path,
+                            min_t(__u8, req->CEPInfo.OfferedResponderResources, 
+                                  ca_attr->MaxQPInitiatorDepth),
+                            req->CEPInfo.StartingPSN);
+
+        if (frc != FSUCCESS) {
+                CERROR ("Can't mark QP RTS/RTR  "LPX64": %d\n",
+                        le64_to_cpu(wcr->wcr_nid), frc);
+                GOTO(out, reason = RC_NO_QP);
+        }
+
+        frc = iibt_qp_query(conn->ibc_qp, &conn->ibc_qp_attrs, NULL);
+        if (frc != FSUCCESS) {
+                CERROR ("Couldn't query qp attributes "LPX64": %d\n",
+                        le64_to_cpu(wcr->wcr_nid), frc);
+                GOTO(out, reason = RC_NO_QP);
+        }
+        query = &conn->ibc_qp_attrs;
+
+        PORTAL_ALLOC(rep, sizeof(*rep));
+        PORTAL_ALLOC(rcv, sizeof(*rcv));
+        if (rep == NULL || rcv == NULL) {
+                CERROR ("can't reply and receive buffers\n");
+                GOTO(out, reason = RC_INSUFFICIENT_RESP_RES);
+        }
+
+        /* don't try to deref this into the incoming wcr :) */
+        wcr = (kib_wire_connreq_t *)rep->Info.Reply.PrivateData;
+
+        rep->Info.Reply = (CM_REPLY_INFO) {
+                .QPN = query->QPNumber,
+                .QKey = query->Qkey,
+                .StartingPSN = query->RecvPSN,
+                .EndToEndFlowControl = query->FlowControl,
+                /* XXX Hmm. */
+                .ArbInitiatorDepth = query->InitiatorDepth,
+                .ArbResponderResources = query->ResponderResources,
+                .TargetAckDelay = 0,
+                .FailoverAccepted = 0,
+                .RnRRetryCount = req->CEPInfo.RnrRetryCount,
+        };
+                
+        *wcr = (kib_wire_connreq_t) {
+                .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
+                .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
+                .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
+                .wcr_nid         = cpu_to_le64(kibnal_data.kib_nid),
+                .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
+        };
+
+        frc = iibt_cm_accept(cep, rep, rcv, kibnal_cm_callback, conn, 
+                             &conn->ibc_cep);
+
+        PORTAL_FREE(rep, sizeof(*rep));
+        PORTAL_FREE(rcv, sizeof(*rcv));
+
+        if (frc != FCM_CONNECT_ESTABLISHED) {
+                /* XXX it seems we don't call reject after this point? */
+                CERROR("iibt_cm_accept() failed: %d, aborting\n", frc);
+                rc = -ECONNABORTED;
+                goto out;
+        }
+
+        if (kibnal_set_cm_flags(conn->ibc_cep)) {
+                rc = -ECONNABORTED;
+                goto out;
+        }
+
+        CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n",
+               conn, conn->ibc_peer->ibp_nid);
+
+out:
+        if (reason) {
+                kibnal_reject(cep, reason);
+                rc = -ECONNABORTED;
+        }
+        if (conn != NULL) 
+                kibnal_connreq_done(conn, 0, rc);
+
+        return;
+}
+
+static void
+dump_path_records(PATH_RESULTS *results)
+{
+        IB_PATH_RECORD *path;
+        int i;
+
+        for(i = 0; i < results->NumPathRecords; i++) {
+                path = &results->PathRecords[i];
+                CDEBUG(D_NET, "%d: sgid "LPX64":"LPX64" dgid "
+                       LPX64":"LPX64" pkey %x\n",
+                       i,
+                       path->SGID.Type.Global.SubnetPrefix,
+                       path->SGID.Type.Global.InterfaceID,
+                       path->DGID.Type.Global.SubnetPrefix,
+                       path->DGID.Type.Global.InterfaceID,
+                       path->P_Key);
+        }
+}
+
+static void
+kibnal_pathreq_callback (void *arg, QUERY *query, 
+                         QUERY_RESULT_VALUES *query_res)
+{
+        IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
+        kib_conn_t *conn = arg;
+        PATH_RESULTS *path;
+        FSTATUS frc;
+        
+        if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) {
+                CERROR ("status %d data size %d\n", query_res->Status,
+                        query_res->ResultDataSize);
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        path = (PATH_RESULTS *)query_res->QueryResult;
+
+        if (path->NumPathRecords < 1) {
+                CERROR ("expected path records: %d\n", path->NumPathRecords);
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        dump_path_records(path);
+
+        /* just using the first.  this is probably a horrible idea. */
+        conn->ibc_connreq->cr_path = path->PathRecords[0];
+
+        conn->ibc_cep = iibt_cm_create_cep(CM_RC_TYPE);
+        if (conn->ibc_cep == NULL) {
+                CERROR ("Can't create CEP\n");
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        if (kibnal_set_cm_flags(conn->ibc_cep)) {
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) {
+                .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
+                .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
+                .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
+                .wcr_nid         = cpu_to_le64(kibnal_data.kib_nid),
+                .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
+        };
+
+        conn->ibc_connreq->cr_cmreq = (CM_REQUEST_INFO) {
+                .SID = conn->ibc_connreq->cr_service.RID.ServiceID,
+                .CEPInfo = (CM_CEP_INFO) { 
+                        .CaGUID = kibnal_data.kib_hca_guids[0],
+                        .EndToEndFlowControl = FALSE,
+                        .PortGUID = conn->ibc_connreq->cr_path.SGID.Type.Global.InterfaceID,
+                        .RetryCount = IBNAL_RETRY,
+                        .RnrRetryCount = IBNAL_RNR_RETRY,
+                        .AckTimeout = IBNAL_ACK_TIMEOUT,
+                        .StartingPSN = IBNAL_STARTING_PSN,
+                        .QPN = conn->ibc_qp_attrs.QPNumber,
+                        .QKey = conn->ibc_qp_attrs.Qkey,
+                        .OfferedResponderResources = ca_attr->MaxQPResponderResources,
+                        .OfferedInitiatorDepth = ca_attr->MaxQPInitiatorDepth,
+                },
+                .PathInfo = (CM_CEP_PATHINFO) {
+                        .bSubnetLocal = TRUE,
+                        .Path = conn->ibc_connreq->cr_path,
+                },
+        };
+
+#if 0
+        /* XXX set timeout just like SDP!!!*/
+        conn->ibc_connreq->cr_path.packet_life = 13;
+#endif
+        /* Flag I'm getting involved with the CM... */
+        conn->ibc_state = IBNAL_CONN_CONNECTING;
+
+        CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n",
+               conn->ibc_connreq->cr_service.RID.ServiceID, 
+               *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
+
+        memset(conn->ibc_connreq->cr_cmreq.PrivateData, 0, 
+               CM_REQUEST_INFO_USER_LEN);
+        memcpy(conn->ibc_connreq->cr_cmreq.PrivateData, 
+               &conn->ibc_connreq->cr_wcr, sizeof(conn->ibc_connreq->cr_wcr));
+
+        /* kibnal_cm_callback gets my conn ref */
+        frc = iibt_cm_connect(conn->ibc_cep, &conn->ibc_connreq->cr_cmreq,
+                              kibnal_cm_callback, conn);
+        if (frc != FPENDING && frc != FSUCCESS) {
+                CERROR ("Connect: %d\n", frc);
+                /* Back out state change as connect failed */
+                conn->ibc_state = IBNAL_CONN_INIT_QP;
+                kibnal_connreq_done (conn, 1, -EINVAL);
+        }
+}
+
+static void
+dump_service_records(SERVICE_RECORD_RESULTS *results)
+{
+        IB_SERVICE_RECORD *svc;
+        int i;
+
+        for(i = 0; i < results->NumServiceRecords; i++) {
+                svc = &results->ServiceRecords[i];
+                CDEBUG(D_NET, "%d: sid "LPX64" gid "LPX64":"LPX64" pkey %x\n",
+                       i,
+                       svc->RID.ServiceID,
+                       svc->RID.ServiceGID.Type.Global.SubnetPrefix,
+                       svc->RID.ServiceGID.Type.Global.InterfaceID,
+                       svc->RID.ServiceP_Key);
+        }
+}
+
+
+static void
+kibnal_service_get_callback (void *arg, QUERY *query, 
+                             QUERY_RESULT_VALUES *query_res)
+{
+        kib_conn_t *conn = arg;
+        SERVICE_RECORD_RESULTS *svc;
+        COMMAND_CONTROL_PARAMETERS sd_params;
+        QUERY   path_query;
+        FSTATUS frc;
+        
+        if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) {
+                CERROR ("status %d data size %d\n", query_res->Status,
+                        query_res->ResultDataSize);
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        svc = (SERVICE_RECORD_RESULTS *)query_res->QueryResult;
+
+        if (svc->NumServiceRecords < 1) {
+                CERROR ("%d service records\n", svc->NumServiceRecords);
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        dump_service_records(svc);
+
+        conn->ibc_connreq->cr_service = svc->ServiceRecords[0];
+
+        CDEBUG(D_NET, "Got status %d, service id "LPX64", on "LPX64"\n",
+               query_res->Status, conn->ibc_connreq->cr_service.RID.ServiceID,
+               *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
+
+        memset(&path_query, 0, sizeof(path_query));
+        path_query.InputType = InputTypePortGuidPair;
+        path_query.OutputType = OutputTypePathRecord;
+        path_query.InputValue.PortGuidPair.SourcePortGuid = kibnal_data.kib_port_guid;
+        path_query.InputValue.PortGuidPair.DestPortGuid  = conn->ibc_connreq->cr_service.RID.ServiceGID.Type.Global.InterfaceID;
+
+        memset(&sd_params, 0, sizeof(sd_params));
+        sd_params.RetryCount = IBNAL_RETRY;
+        sd_params.Timeout = 10 * 1000;   /* wait 10 seconds */
+
+        /* kibnal_service_get_callback gets my conn ref */
+
+        frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
+                                                    kibnal_data.kib_port_guid,
+                                                    &path_query, 
+                                                    kibnal_pathreq_callback,
+                                                    &sd_params, conn);
+        if (frc == FPENDING)
+                return;
+
+        CERROR ("Path record request failed: %d\n", frc);
+        kibnal_connreq_done (conn, 1, -EINVAL);
+}
+
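+/* Active connection establishment is a chain of async callbacks:
+ * connect_peer() queries the SD for the peer's service record;
+ * kibnal_service_get_callback() then queries for a path record;
+ * kibnal_pathreq_callback() creates the CEP and calls iibt_cm_connect();
+ * kibnal_connect_reply() finally moves the QP to RTS and accepts.  A
+ * failure anywhere lands in kibnal_connreq_done(). */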
+static void
+kibnal_connect_peer (kib_peer_t *peer)
+{
+        COMMAND_CONTROL_PARAMETERS sd_params;
+        QUERY   query;
+        FSTATUS frc;
+        kib_conn_t  *conn = kibnal_create_conn();
+
+        LASSERT (peer->ibp_connecting != 0);
+
+        if (conn == NULL) {
+                CERROR ("Can't allocate conn\n");
+                kibnal_peer_connect_failed (peer, 1, -ENOMEM);
+                return;
+        }
+
+        conn->ibc_peer = peer;
+        kib_peer_addref(peer);
+
+        PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
+        if (conn->ibc_connreq == NULL) {
+                CERROR ("Can't allocate connreq\n");
+                kibnal_connreq_done (conn, 1, -ENOMEM);
+                return;
+        }
+
+        memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq));
+
+        kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);
+
+        memset(&query, 0, sizeof(query));
+        query.InputType = InputTypeServiceRecord;
+        query.OutputType = OutputTypeServiceRecord;
+        query.InputValue.ServiceRecordValue.ServiceRecord = conn->ibc_connreq->cr_service;
+        query.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
+
+        memset(&sd_params, 0, sizeof(sd_params));
+        sd_params.RetryCount = IBNAL_RETRY;
+        sd_params.Timeout = 10 * 1000;   /* wait 10 seconds */
+
+        /* kibnal_service_get_callback gets my conn ref */
+        frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
+                                                    kibnal_data.kib_port_guid,
+                                                    &query, 
+                                                    kibnal_service_get_callback,
+                                                    &sd_params, conn);
+        if (frc == FPENDING)
+                return;
+
+        CERROR ("iibt_sd_query_port_fabric_information(): %d\n", frc);
+        kibnal_connreq_done (conn, 1, frc);
+}
+
+static int
+kibnal_conn_timed_out (kib_conn_t *conn)
+{
+        kib_tx_t          *tx;
+        struct list_head  *ttmp;
+        unsigned long      flags;
+
+        spin_lock_irqsave (&conn->ibc_lock, flags);
+
+        list_for_each (ttmp, &conn->ibc_tx_queue) {
+                tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+                LASSERT (!tx->tx_passive_rdma_wait);
+                LASSERT (tx->tx_sending == 0);
+
+                if (time_after_eq (jiffies, tx->tx_deadline)) {
+                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        return 1;
+                }
+        }
+
+        list_for_each (ttmp, &conn->ibc_active_txs) {
+                tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+                LASSERT (tx->tx_passive_rdma ||
+                         !tx->tx_passive_rdma_wait);
+
+                LASSERT (tx->tx_passive_rdma_wait ||
+                         tx->tx_sending != 0);
+
+                if (time_after_eq (jiffies, tx->tx_deadline)) {
+                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        return 1;
+                }
+        }
+
+        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+        return 0;
+}
+
+static void
+kibnal_check_conns (int idx)
+{
+        struct list_head  *peers = &kibnal_data.kib_peers[idx];
+        struct list_head  *ptmp;
+        kib_peer_t        *peer;
+        kib_conn_t        *conn;
+        struct list_head  *ctmp;
+
+ again:
+        /* NB. We expect to have a look at all the peers and not find any
+         * rdmas to time out, so we just use a shared lock while we
+         * take a look... */
+        read_lock (&kibnal_data.kib_global_lock);
+
+        list_for_each (ptmp, peers) {
+                peer = list_entry (ptmp, kib_peer_t, ibp_list);
+
+                list_for_each (ctmp, &peer->ibp_conns) {
+                        conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+                        KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_ESTABLISHED);
+
+                        /* In case we have enough credits to return via a
+                         * NOOP, but there were no non-blocking tx descs
+                         * free to do it last time... */
+                        kibnal_check_sends(conn);
+
+                        if (!kibnal_conn_timed_out(conn))
+                                continue;
+                        
+                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                               conn, conn->ibc_state, peer->ibp_nid,
+                               atomic_read (&conn->ibc_refcount));
+
+                        atomic_inc (&conn->ibc_refcount);
+                        read_unlock (&kibnal_data.kib_global_lock);
+
+                        CERROR("Timed out RDMA with "LPX64"\n",
+                               peer->ibp_nid);
+
+                        kibnal_close_conn (conn, -ETIMEDOUT);
+                        kibnal_put_conn (conn);
+
+                        /* start again now I've dropped the lock */
+                        goto again;
+                }
+        }
+
+        read_unlock (&kibnal_data.kib_global_lock);
+}
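
Note the locking discipline: the scan holds only the shared global lock, and as soon as a victim is found it takes a private reference, drops the lock for the blocking close, and restarts the whole scan because the lists may have changed in the meantime. A compressed userspace sketch of the same pattern (hypothetical types, with pthreads and C11 atomics standing in for the kernel primitives):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct conn {
        atomic_int   refs;
        int          timed_out;
        struct conn *next;
};

static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct conn     *conns;

static void close_conn(struct conn *c)
{
        printf("closing conn %p\n", (void *)c);
        c->timed_out = 0;               /* pretend it is gone */
}

static void check_conns(void)
{
again:
        pthread_rwlock_rdlock(&table_lock);
        for (struct conn *c = conns; c != NULL; c = c->next) {
                if (!c->timed_out)
                        continue;

                atomic_fetch_add(&c->refs, 1);          /* pin the victim */
                pthread_rwlock_unlock(&table_lock);

                close_conn(c);                          /* may sleep */
                atomic_fetch_sub(&c->refs, 1);
                goto again;             /* list may have changed meanwhile */
        }
        pthread_rwlock_unlock(&table_lock);
}

int main(void)
{
        struct conn a = { .timed_out = 1 };

        conns = &a;
        check_conns();
        return 0;
}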
+
+static void
+kib_connd_handle_state(kib_conn_t *conn)
+{
+        FSTATUS frc;
+
+        switch (conn->ibc_state) {
+                /* all refs have gone, free and be done with it */ 
+                case IBNAL_CONN_DISCONNECTED:
+                        kibnal_destroy_conn (conn);
+                        return; /* avoid put_conn */
+
+                case IBNAL_CONN_SEND_DREQ:
+                        frc = iibt_cm_disconnect(conn->ibc_cep, NULL, NULL);
+                        if (frc != FSUCCESS) /* XXX do real things */
+                                CERROR("disconnect failed: %d\n", frc);
+                        conn->ibc_state = IBNAL_CONN_DREQ;
+                        break;
+
+                /* a callback got to the conn before we did */ 
+                case IBNAL_CONN_DREP:
+                        break;
+                                
+                default:
+                        CERROR ("Bad conn %p state: %d\n", conn, 
+                                conn->ibc_state);
+                        LBUG();
+                        break;
+        }
+
+        /* drop ref from close_conn */
+        kibnal_put_conn(conn);
+}
+
+int
+kibnal_connd (void *arg)
+{
+        wait_queue_t       wait;
+        unsigned long      flags;
+        kib_conn_t        *conn;
+        kib_peer_t        *peer;
+        int                timeout;
+        int                i;
+        int                peer_index = 0;
+        unsigned long      deadline = jiffies;
+        
+        kportal_daemonize ("kibnal_connd");
+        kportal_blockallsigs ();
+
+        init_waitqueue_entry (&wait, current);
+
+        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+
+        for (;;) {
+                if (!list_empty (&kibnal_data.kib_connd_conns)) {
+                        conn = list_entry (kibnal_data.kib_connd_conns.next,
+                                           kib_conn_t, ibc_list);
+                        list_del (&conn->ibc_list);
+                        
+                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+                        kib_connd_handle_state(conn);
+
+                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+                        continue;
+                }
+
+                if (!list_empty (&kibnal_data.kib_connd_peers)) {
+                        peer = list_entry (kibnal_data.kib_connd_peers.next,
+                                           kib_peer_t, ibp_connd_list);
+                        
+                        list_del_init (&peer->ibp_connd_list);
+                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+                        kibnal_connect_peer (peer);
+                        kib_peer_decref (peer);
+
+                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+                }
+
+                /* shut down and nobody left to reap... */
+                if (kibnal_data.kib_shutdown &&
+                    atomic_read(&kibnal_data.kib_nconns) == 0)
+                        break;
+
+                spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+                /* careful with the jiffy wrap... */
+                while ((timeout = (int)(deadline - jiffies)) <= 0) {
+                        const int n = 4;
+                        const int p = 1;
+                        int       chunk = kibnal_data.kib_peer_hash_size;
+                        
+                        /* Time to check for RDMA timeouts on a few more
+                         * peers: I do checks every 'p' seconds on a
+                         * proportion of the peer table and I need to check
+                         * every connection 'n' times within a timeout
+                         * interval, to ensure I detect a timeout on any
+                         * connection within (n+1)/n times the timeout
+                         * interval. */
+
+                        if (kibnal_tunables.kib_io_timeout > n * p)
+                                chunk = (chunk * n * p) / 
+                                        kibnal_tunables.kib_io_timeout;
+                        if (chunk == 0)
+                                chunk = 1;
+
+                        for (i = 0; i < chunk; i++) {
+                                kibnal_check_conns (peer_index);
+                                peer_index = (peer_index + 1) % 
+                                             kibnal_data.kib_peer_hash_size;
+                        }
+
+                        deadline += p * HZ;
+                }
+
+                kibnal_data.kib_connd_waketime = jiffies + timeout;
+
+                set_current_state (TASK_INTERRUPTIBLE);
+                add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
+
+                if (!kibnal_data.kib_shutdown &&
+                    list_empty (&kibnal_data.kib_connd_conns) &&
+                    list_empty (&kibnal_data.kib_connd_peers))
+                        schedule_timeout (timeout);
+
+                set_current_state (TASK_RUNNING);
+                remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
+
+                spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+        }
+
+        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+        kibnal_thread_fini ();
+        return (0);
+}
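
To make the chunking arithmetic concrete: with a hypothetical peer hash of 101 buckets, a 50 second I/O timeout and the constants above (n = 4, p = 1), connd checks 8 buckets per one-second tick, so the whole table is covered roughly every 13 seconds, i.e. about timeout/n, just as the comment promises. The same computation, standalone:

#include <stdio.h>

int main(void)
{
        const int n = 4, p = 1;         /* as in kibnal_connd() */
        int hash_size  = 101;           /* hypothetical peer table size */
        int io_timeout = 50;            /* seconds, hypothetical */
        int chunk      = hash_size;

        if (io_timeout > n * p)
                chunk = (chunk * n * p) / io_timeout;
        if (chunk == 0)
                chunk = 1;

        /* 8 buckets per 1s tick -> full table every ~13s, i.e. ~timeout/n */
        printf("check %d of %d buckets every %ds\n", chunk, hash_size, p);
        return 0;
}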
+
+int
+kibnal_scheduler(void *arg)
+{
+        long            id = (long)arg;
+        char            name[16];
+        kib_rx_t       *rx;
+        kib_tx_t       *tx;
+        unsigned long   flags;
+        int             rc;
+        int             counter = 0;
+        int             did_something;
+
+        snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
+        kportal_daemonize(name);
+        kportal_blockallsigs();
+
+        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+
+        for (;;) {
+                did_something = 0;
+
+                while (!list_empty(&kibnal_data.kib_sched_txq)) {
+                        tx = list_entry(kibnal_data.kib_sched_txq.next,
+                                        kib_tx_t, tx_list);
+                        list_del(&tx->tx_list);
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+                                               flags);
+                        kibnal_tx_done(tx);
+
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+                                          flags);
+                }
+
+                if (!list_empty(&kibnal_data.kib_sched_rxq)) {
+                        rx = list_entry(kibnal_data.kib_sched_rxq.next,
+                                        kib_rx_t, rx_list);
+                        list_del(&rx->rx_list);
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+                                               flags);
+
+                        kibnal_rx(rx);
+
+                        did_something = 1;
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+                                          flags);
+                }
+
+                /* shut down and no receives to complete... */
+                if (kibnal_data.kib_shutdown &&
+                    atomic_read(&kibnal_data.kib_nconns) == 0)
+                        break;
+
+                /* nothing to do or hogging CPU */
+                if (!did_something || counter++ == IBNAL_RESCHED) {
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+                                               flags);
+                        counter = 0;
+
+                        if (!did_something) {
+                                rc = wait_event_interruptible(
+                                        kibnal_data.kib_sched_waitq,
+                                        !list_empty(&kibnal_data.kib_sched_txq) || 
+                                        !list_empty(&kibnal_data.kib_sched_rxq) || 
+                                        (kibnal_data.kib_shutdown &&
+                                         atomic_read (&kibnal_data.kib_nconns) == 0));
+                        } else {
+                                our_cond_resched();
+                        }
+
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+                                          flags);
+                }
+        }
+
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+
+        kibnal_thread_fini();
+        return (0);
+}
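
The scheduler's fairness rule is worth calling out: it sleeps only when a full pass found nothing, and after IBNAL_RESCHED consecutive work items it voluntarily yields so it cannot hog a CPU. A stripped-down userspace sketch of that rule, with sched_yield() standing in for our_cond_resched():

#include <sched.h>
#include <stdio.h>

#define RESCHED 100                     /* hypothetical IBNAL_RESCHED value */

int main(void)
{
        int counter = 0;

        for (int item = 0; item < 1000; item++) {
                /* ... handle one tx/rx work item ... */

                if (++counter == RESCHED) {
                        counter = 0;
                        sched_yield();  /* give other threads a turn */
                }
        }
        printf("done\n");
        return 0;
}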
+
+
+lib_nal_t kibnal_lib = {
+        libnal_data:        &kibnal_data,      /* NAL private data */
+        libnal_send:         kibnal_send,
+        libnal_send_pages:   kibnal_send_pages,
+        libnal_recv:         kibnal_recv,
+        libnal_recv_pages:   kibnal_recv_pages,
+        libnal_dist:         kibnal_dist
+};
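
The dispatch table above uses the old GCC 'field:' initializer spelling; the standard C99 designated-initializer form is equivalent. A self-contained analogue of the same idea, a struct of function pointers filled by name:

#include <stdio.h>

struct nal_ops {
        int (*send)(const char *msg);
        int (*recv)(char *buf, int len);
};

static int my_send(const char *msg)    { return printf("send: %s\n", msg); }
static int my_recv(char *buf, int len) { (void)buf; (void)len; return 0; }

static const struct nal_ops ops = {
        .send = my_send,                /* C99 spelling of 'send: my_send' */
        .recv = my_recv,
};

int main(void)
{
        ops.send("hello");
        return 0;
}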
diff --git a/lnet/klnds/openiblnd/.cvsignore b/lnet/klnds/openiblnd/.cvsignore
new file mode 100644
index 0000000..5ed596b
--- /dev/null
@@ -0,0 +1,10 @@
+.deps
+Makefile
+.*.cmd
+autoMakefile.in
+autoMakefile
+*.ko
+*.mod.c
+.*.flags
+.tmp_versions
+.depend
diff --git a/lnet/klnds/openiblnd/openiblnd.c b/lnet/klnds/openiblnd/openiblnd.c
index 6f66143..652eb34 100644
 
 #include "openibnal.h"
 
-nal_t                   koibnal_api;
-ptl_handle_ni_t         koibnal_ni;
-koib_data_t             koibnal_data;
-koib_tunables_t         koibnal_tunables;
+nal_t                   kibnal_api;
+ptl_handle_ni_t         kibnal_ni;
+kib_data_t              kibnal_data;
+kib_tunables_t          kibnal_tunables;
 
 #ifdef CONFIG_SYSCTL
-#define OPENIBNAL_SYSCTL        202
+#define IBNAL_SYSCTL             202
 
-#define OPENIBNAL_SYSCTL_TIMEOUT     1
-#define OPENIBNAL_SYSCTL_ZERO_COPY   2
+#define IBNAL_SYSCTL_TIMEOUT     1
 
-static ctl_table koibnal_ctl_table[] = {
-        {OPENIBNAL_SYSCTL_TIMEOUT, "timeout", 
-         &koibnal_tunables.koib_io_timeout, sizeof (int),
+static ctl_table kibnal_ctl_table[] = {
+        {IBNAL_SYSCTL_TIMEOUT, "timeout", 
+         &kibnal_tunables.kib_io_timeout, sizeof (int),
          0644, NULL, &proc_dointvec},
         { 0 }
 };
 
-static ctl_table koibnal_top_ctl_table[] = {
-        {OPENIBNAL_SYSCTL, "openibnal", NULL, 0, 0555, koibnal_ctl_table},
+static ctl_table kibnal_top_ctl_table[] = {
+        {IBNAL_SYSCTL, "openibnal", NULL, 0, 0555, kibnal_ctl_table},
         { 0 }
 };
 #endif
@@ -66,167 +65,183 @@ print_service(struct ib_common_attrib_service *service, char *tag, int rc)
               "service id: "LPX64"\n"
               "name      : %s\n"
               "NID       : "LPX64"\n", tag, rc,
-              service->service_id, name, service->service_data64[0]);
+              service->service_id, name, 
+              *kibnal_service_nid_field(service));
 }
 
 void
-koibnal_service_setunset_done (tTS_IB_CLIENT_QUERY_TID tid, int status,
+kibnal_service_setunset_done (tTS_IB_CLIENT_QUERY_TID tid, int status,
                                struct ib_common_attrib_service *service, void *arg)
 {
         *(int *)arg = status;
-        up (&koibnal_data.koib_nid_signal);
+        up (&kibnal_data.kib_nid_signal);
 }
 
+#if IBNAL_CHECK_ADVERT
+void
+kibnal_check_advert (void)
+{
+        struct ib_common_attrib_service *svc;
+        __u64   tid;
+        int     rc;
+        int     rc2;
+
+        PORTAL_ALLOC(svc, sizeof(*svc));
+        if (svc == NULL)
+                return;
+
+        memset (svc, 0, sizeof (*svc));
+        kibnal_set_service_keys(svc, kibnal_data.kib_nid);
+
+        rc = ib_service_get (kibnal_data.kib_device, 
+                             kibnal_data.kib_port,
+                             svc,
+                             KIBNAL_SERVICE_KEY_MASK,
+                             kibnal_tunables.kib_io_timeout * HZ,
+                             kibnal_service_setunset_done, &rc2, 
+                             &tid);
+
+        if (rc != 0) {
+                CERROR ("Immediate error %d checking SM service\n", rc);
+        } else {
+                down (&kibnal_data.kib_nid_signal);
+                rc = rc2;
+
+                if (rc != 0)
+                        CERROR ("Error %d checking SM service\n", rc);
+        }
+
+        PORTAL_FREE(svc, sizeof(*svc));
+}
+#endif
+
 int
-koibnal_advertise (void)
+kibnal_advertise (void)
 {
+        struct ib_common_attrib_service *svc;
         __u64   tid;
         int     rc;
         int     rc2;
 
-        LASSERT (koibnal_data.koib_nid != PTL_NID_ANY);
+        LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+
+        PORTAL_ALLOC(svc, sizeof(*svc));
+        if (svc == NULL)
+                return (-ENOMEM);
 
-        memset (&koibnal_data.koib_service, 0, 
-                sizeof (koibnal_data.koib_service));
+        memset (svc, 0, sizeof (*svc));
         
-        koibnal_data.koib_service.service_id
-                = koibnal_data.koib_cm_service_id;
+        svc->service_id = kibnal_data.kib_service_id;
 
-        rc = ib_cached_gid_get(koibnal_data.koib_device,
-                               koibnal_data.koib_port,
+        rc = ib_cached_gid_get(kibnal_data.kib_device,
+                               kibnal_data.kib_port,
                                0,
-                               koibnal_data.koib_service.service_gid);
+                               svc->service_gid);
         if (rc != 0) {
                 CERROR ("Can't get port %d GID: %d\n",
-                        koibnal_data.koib_port, rc);
-                return (rc);
+                        kibnal_data.kib_port, rc);
+                goto out;
         }
         
-        rc = ib_cached_pkey_get(koibnal_data.koib_device,
-                                koibnal_data.koib_port,
+        rc = ib_cached_pkey_get(kibnal_data.kib_device,
+                                kibnal_data.kib_port,
                                 0,
-                                &koibnal_data.koib_service.service_pkey);
+                                &svc->service_pkey);
         if (rc != 0) {
                 CERROR ("Can't get port %d PKEY: %d\n",
-                        koibnal_data.koib_port, rc);
-                return (rc);
+                        kibnal_data.kib_port, rc);
+                goto out;
         }
         
-        koibnal_data.koib_service.service_lease = 0xffffffff;
+        svc->service_lease = 0xffffffff;
 
-        koibnal_set_service_keys(&koibnal_data.koib_service, koibnal_data.koib_nid);
+        kibnal_set_service_keys(svc, kibnal_data.kib_nid);
 
         CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n", 
-               koibnal_data.koib_service.service_id,
-               koibnal_data.koib_service.service_name, 
-               *koibnal_service_nid_field(&koibnal_data.koib_service));
+               svc->service_id, 
+               svc->service_name, *kibnal_service_nid_field(svc));
 
-        rc = ib_service_set (koibnal_data.koib_device,
-                             koibnal_data.koib_port,
-                             &koibnal_data.koib_service,
+        rc = ib_service_set (kibnal_data.kib_device,
+                             kibnal_data.kib_port,
+                             svc,
                              IB_SA_SERVICE_COMP_MASK_ID |
                              IB_SA_SERVICE_COMP_MASK_GID |
                              IB_SA_SERVICE_COMP_MASK_PKEY |
                              IB_SA_SERVICE_COMP_MASK_LEASE |
-                             KOIBNAL_SERVICE_KEY_MASK,
-                             koibnal_tunables.koib_io_timeout * HZ,
-                             koibnal_service_setunset_done, &rc2, &tid);
+                             KIBNAL_SERVICE_KEY_MASK,
+                             kibnal_tunables.kib_io_timeout * HZ,
+                             kibnal_service_setunset_done, &rc2, &tid);
 
-        if (rc == 0) {
-                down (&koibnal_data.koib_nid_signal);
-                rc = rc2;
+        if (rc != 0) {
+                CERROR ("Immediate error %d advertising NID "LPX64"\n",
+                        rc, kibnal_data.kib_nid);
+                goto out;
         }
-        
-        if (rc != 0)
-                CERROR ("Error %d advertising SM service\n", rc);
 
+        down (&kibnal_data.kib_nid_signal);
+
+        rc = rc2;
+        if (rc != 0)
+                CERROR ("Error %d advertising NID "LPX64"\n", 
+                        rc, kibnal_data.kib_nid);
+ out:
+        PORTAL_FREE(svc, sizeof(*svc));
         return (rc);
 }
 
-int
-koibnal_unadvertise (int expect_success)
+void
+kibnal_unadvertise (int expect_success)
 {
+        struct ib_common_attrib_service *svc;
         __u64   tid;
         int     rc;
         int     rc2;
 
-        LASSERT (koibnal_data.koib_nid != PTL_NID_ANY);
+        LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
 
-        memset (&koibnal_data.koib_service, 0,
-                sizeof (koibnal_data.koib_service));
+        PORTAL_ALLOC(svc, sizeof(*svc));
+        if (svc == NULL)
+                return;
 
-        koibnal_set_service_keys(&koibnal_data.koib_service, koibnal_data.koib_nid);
+        memset (svc, 0, sizeof(*svc));
+
+        kibnal_set_service_keys(svc, kibnal_data.kib_nid);
 
         CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n",
-               koibnal_data.koib_service.service_name,
-               *koibnal_service_nid_field(&koibnal_data.koib_service));
-
-        rc = ib_service_delete (koibnal_data.koib_device,
-                                koibnal_data.koib_port,
-                                &koibnal_data.koib_service,
-                                KOIBNAL_SERVICE_KEY_MASK,
-                                koibnal_tunables.koib_io_timeout * HZ,
-                                koibnal_service_setunset_done, &rc2, &tid);
+               svc->service_name, *kibnal_service_nid_field(svc));
+
+        rc = ib_service_delete (kibnal_data.kib_device,
+                                kibnal_data.kib_port,
+                                svc,
+                                KIBNAL_SERVICE_KEY_MASK,
+                                kibnal_tunables.kib_io_timeout * HZ,
+                                kibnal_service_setunset_done, &rc2, &tid);
         if (rc != 0) {
                 CERROR ("Immediate error %d unadvertising NID "LPX64"\n",
-                        rc, koibnal_data.koib_nid);
-                return (rc);
+                        rc, kibnal_data.kib_nid);
+                goto out;
         }
 
-        down (&koibnal_data.koib_nid_signal);
+        down (&kibnal_data.kib_nid_signal);
         
         if ((rc2 == 0) == !!expect_success)
-                return (0);
+                goto out;                       /* success: rc == 0 */
 
         if (expect_success)
                 CERROR("Error %d unadvertising NID "LPX64"\n",
-                        rc, koibnal_data.koib_nid);
+                       rc, kibnal_data.kib_nid);
         else
                 CWARN("Removed conflicting NID "LPX64"\n",
-                      koibnal_data.koib_nid);
-
-        return (rc);
-}
-
-int
-koibnal_check_advert (void)
-{
-        __u64   tid;
-        int     rc;
-        int     rc2;
-
-        static struct ib_common_attrib_service srv;
-
-        memset (&srv, 0, sizeof (srv));
-
-        koibnal_set_service_keys(&srv, koibnal_data.koib_nid);
-
-        rc = ib_service_get (koibnal_data.koib_device, 
-                             koibnal_data.koib_port,
-                             &srv,
-                             KOIBNAL_SERVICE_KEY_MASK,
-                             koibnal_tunables.koib_io_timeout * HZ,
-                             koibnal_service_setunset_done, &rc2, 
-                             &tid);
-
-        if (rc != 0) {
-                CERROR ("Immediate error %d checking SM service\n", rc);
-        } else {
-                down (&koibnal_data.koib_nid_signal);
-                rc = rc2;
-
-                if (rc != 0)
-                        CERROR ("Error %d checking SM service\n", rc);
-        }
-
-        return (rc);
+                      kibnal_data.kib_nid);
+ out:
+        PORTAL_FREE(svc, sizeof(*svc));
 }
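
The success test '(rc2 == 0) == !!expect_success' reads backwards at first glance: unadvertising is fine either when the delete worked and we expected an advert to exist, or when it failed and we expected none to be there. A two-variable truth table makes the predicate concrete:

#include <stdio.h>

int main(void)
{
        for (int rc2 = 0; rc2 <= 1; rc2++)      /* 0 = delete worked */
                for (int expect = 0; expect <= 1; expect++)
                        printf("rc2=%d expect_success=%d -> ok=%d\n",
                               rc2, expect, (rc2 == 0) == !!expect);
        return 0;
}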
 
 int
-koibnal_set_mynid(ptl_nid_t nid)
+kibnal_set_mynid(ptl_nid_t nid)
 {
         struct timeval tv;
-        lib_ni_t      *ni = &koibnal_lib.libnal_ni;
+        lib_ni_t      *ni = &kibnal_lib.libnal_ni;
         int            rc;
 
         CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
@@ -234,75 +249,76 @@ koibnal_set_mynid(ptl_nid_t nid)
 
         do_gettimeofday(&tv);
 
-        down (&koibnal_data.koib_nid_mutex);
+        down (&kibnal_data.kib_nid_mutex);
 
-        if (nid == koibnal_data.koib_nid) {
+        if (nid == kibnal_data.kib_nid) {
                 /* no change of NID */
-                up (&koibnal_data.koib_nid_mutex);
+                up (&kibnal_data.kib_nid_mutex);
                 return (0);
         }
 
         CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
-               koibnal_data.koib_nid, nid);
+               kibnal_data.kib_nid, nid);
         
-        if (koibnal_data.koib_nid != PTL_NID_ANY) {
+        if (kibnal_data.kib_nid != PTL_NID_ANY) {
 
-                koibnal_unadvertise (1);
+                kibnal_unadvertise (1);
 
-                rc = ib_cm_listen_stop (koibnal_data.koib_listen_handle);
+                rc = ib_cm_listen_stop (kibnal_data.kib_listen_handle);
                 if (rc != 0)
                         CERROR ("Error %d stopping listener\n", rc);
         }
         
-        koibnal_data.koib_nid = ni->ni_pid.nid = nid;
-        koibnal_data.koib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+        kibnal_data.kib_nid = ni->ni_pid.nid = nid;
+        kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
         
         /* Delete all existing peers and their connections after new
          * NID/incarnation set to ensure no old connections in our brave
          * new world. */
-        koibnal_del_peer (PTL_NID_ANY, 0);
-
-        rc = 0;
-        if (koibnal_data.koib_nid != PTL_NID_ANY) {
-                /* New NID installed */
+        kibnal_del_peer (PTL_NID_ANY, 0);
 
-                /* remove any previous advert (crashed node etc) */
-                koibnal_unadvertise(0);
+        if (kibnal_data.kib_nid == PTL_NID_ANY) {
+                /* No new NID to install */
+                up (&kibnal_data.kib_nid_mutex);
+                return (0);
+        }
+        
+        /* remove any previous advert (crashed node etc) */
+        kibnal_unadvertise(0);
 
-                /* Assign new service number */
-                koibnal_data.koib_cm_service_id = ib_cm_service_assign();
-                CDEBUG(D_NET, "service_id "LPX64"\n", koibnal_data.koib_cm_service_id);
+        /* Assign new service number */
+        kibnal_data.kib_service_id = ib_cm_service_assign();
+        CDEBUG(D_NET, "service_id "LPX64"\n", kibnal_data.kib_service_id);
         
-                rc = ib_cm_listen(koibnal_data.koib_cm_service_id,
-                                  TS_IB_CM_SERVICE_EXACT_MASK,
-                                  koibnal_passive_conn_callback, NULL,
-                                  &koibnal_data.koib_listen_handle);
-                if (rc != 0) {
-                        CERROR ("ib_cm_listen error: %d\n", rc);
-                        goto out;
+        rc = ib_cm_listen(kibnal_data.kib_service_id,
+                          TS_IB_CM_SERVICE_EXACT_MASK,
+                          kibnal_passive_conn_callback, NULL,
+                          &kibnal_data.kib_listen_handle);
+        if (rc == 0) {
+                rc = kibnal_advertise();
+                if (rc == 0) {
+#if IBNAL_CHECK_ADVERT
+                        kibnal_check_advert();
+#endif
+                        up (&kibnal_data.kib_nid_mutex);
+                        return (0);
                 }
 
-                rc = koibnal_advertise();
-
-                koibnal_check_advert();
-        }
-        
- out:
-        if (rc != 0) {
-                koibnal_data.koib_nid = PTL_NID_ANY;
+                ib_cm_listen_stop(kibnal_data.kib_listen_handle);
                 /* remove any peers that sprung up while I failed to
                  * advertise myself */
-                koibnal_del_peer (PTL_NID_ANY, 0);
+                kibnal_del_peer (PTL_NID_ANY, 0);
         }
-
-        up (&koibnal_data.koib_nid_mutex);
-        return (0);
+        
+        kibnal_data.kib_nid = PTL_NID_ANY;
+        up (&kibnal_data.kib_nid_mutex);
+        return (rc);
 }
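
The restructured kibnal_set_mynid() now reads as a strict bring-up sequence with rollback: start the listener, then advertise, and tear the listener back down if advertising fails, leaving the NID unset. A skeletal sketch of that ordering (hypothetical helpers, with advertise() pretending to fail):

#include <stdio.h>

static int  listen_start(void) { return 0; }
static void listen_stop(void)  { }
static int  advertise(void)    { return -1; /* pretend failure */ }

int main(void)
{
        int rc = listen_start();

        if (rc == 0) {
                rc = advertise();
                if (rc == 0)
                        return 0;       /* fully up */
                listen_stop();          /* roll back the listener */
        }
        fprintf(stderr, "bring-up failed: %d\n", rc);
        return 1;
}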
 
-koib_peer_t *
-koibnal_create_peer (ptl_nid_t nid)
+kib_peer_t *
+kibnal_create_peer (ptl_nid_t nid)
 {
-        koib_peer_t *peer;
+        kib_peer_t *peer;
 
         LASSERT (nid != PTL_NID_ANY);
 
@@ -320,20 +336,20 @@ koibnal_create_peer (ptl_nid_t nid)
         INIT_LIST_HEAD (&peer->ibp_tx_queue);
 
         peer->ibp_reconnect_time = jiffies;
-        peer->ibp_reconnect_interval = OPENIBNAL_MIN_RECONNECT_INTERVAL;
+        peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
 
-        atomic_inc (&koibnal_data.koib_npeers);
+        atomic_inc (&kibnal_data.kib_npeers);
         return (peer);
 }
 
 void
-koibnal_destroy_peer (koib_peer_t *peer)
+kibnal_destroy_peer (kib_peer_t *peer)
 {
         CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ibp_nid, peer);
 
         LASSERT (atomic_read (&peer->ibp_refcount) == 0);
         LASSERT (peer->ibp_persistence == 0);
-        LASSERT (!koibnal_peer_active(peer));
+        LASSERT (!kibnal_peer_active(peer));
         LASSERT (peer->ibp_connecting == 0);
         LASSERT (list_empty (&peer->ibp_conns));
         LASSERT (list_empty (&peer->ibp_tx_queue));
@@ -344,11 +360,11 @@ koibnal_destroy_peer (koib_peer_t *peer)
          * they are destroyed, so we can be assured that _all_ state to do
          * with this peer has been cleaned up when its refcount drops to
          * zero. */
-        atomic_dec (&koibnal_data.koib_npeers);
+        atomic_dec (&kibnal_data.kib_npeers);
 }
 
 void
-koibnal_put_peer (koib_peer_t *peer)
+kibnal_put_peer (kib_peer_t *peer)
 {
         CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n",
                 peer, peer->ibp_nid,
@@ -358,19 +374,19 @@ koibnal_put_peer (koib_peer_t *peer)
         if (!atomic_dec_and_test (&peer->ibp_refcount))
                 return;
 
-        koibnal_destroy_peer (peer);
+        kibnal_destroy_peer (peer);
 }
 
-koib_peer_t *
-koibnal_find_peer_locked (ptl_nid_t nid)
+kib_peer_t *
+kibnal_find_peer_locked (ptl_nid_t nid)
 {
-        struct list_head *peer_list = koibnal_nid2peerlist (nid);
+        struct list_head *peer_list = kibnal_nid2peerlist (nid);
         struct list_head *tmp;
-        koib_peer_t      *peer;
+        kib_peer_t       *peer;
 
         list_for_each (tmp, peer_list) {
 
-                peer = list_entry (tmp, koib_peer_t, ibp_list);
+                peer = list_entry (tmp, kib_peer_t, ibp_list);
 
                 LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
                          peer->ibp_connecting != 0 || /* creating conns */
@@ -386,46 +402,46 @@ koibnal_find_peer_locked (ptl_nid_t nid)
         return (NULL);
 }
 
-koib_peer_t *
-koibnal_get_peer (ptl_nid_t nid)
+kib_peer_t *
+kibnal_get_peer (ptl_nid_t nid)
 {
-        koib_peer_t     *peer;
+        kib_peer_t     *peer;
 
-        read_lock (&koibnal_data.koib_global_lock);
-        peer = koibnal_find_peer_locked (nid);
+        read_lock (&kibnal_data.kib_global_lock);
+        peer = kibnal_find_peer_locked (nid);
         if (peer != NULL)                       /* +1 ref for caller? */
                 atomic_inc (&peer->ibp_refcount);
-        read_unlock (&koibnal_data.koib_global_lock);
+        read_unlock (&kibnal_data.kib_global_lock);
 
         return (peer);
 }
 
 void
-koibnal_unlink_peer_locked (koib_peer_t *peer)
+kibnal_unlink_peer_locked (kib_peer_t *peer)
 {
         LASSERT (peer->ibp_persistence == 0);
         LASSERT (list_empty(&peer->ibp_conns));
 
-        LASSERT (koibnal_peer_active(peer));
+        LASSERT (kibnal_peer_active(peer));
         list_del_init (&peer->ibp_list);
         /* lose peerlist's ref */
-        koibnal_put_peer (peer);
+        kibnal_put_peer (peer);
 }
 
 int
-koibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
+kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
 {
-        koib_peer_t       *peer;
+        kib_peer_t        *peer;
         struct list_head  *ptmp;
         int                i;
 
-        read_lock (&koibnal_data.koib_global_lock);
+        read_lock (&kibnal_data.kib_global_lock);
 
-        for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) {
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
 
-                list_for_each (ptmp, &koibnal_data.koib_peers[i]) {
+                list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
                         
-                        peer = list_entry (ptmp, koib_peer_t, ibp_list);
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
                                  peer->ibp_connecting != 0 ||
                                  !list_empty (&peer->ibp_conns));
@@ -436,53 +452,53 @@ koibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
                         *nidp = peer->ibp_nid;
                         *persistencep = peer->ibp_persistence;
                         
-                        read_unlock (&koibnal_data.koib_global_lock);
+                        read_unlock (&kibnal_data.kib_global_lock);
                         return (0);
                 }
         }
 
-        read_unlock (&koibnal_data.koib_global_lock);
+        read_unlock (&kibnal_data.kib_global_lock);
         return (-ENOENT);
 }
 
 int
-koibnal_add_persistent_peer (ptl_nid_t nid)
+kibnal_add_persistent_peer (ptl_nid_t nid)
 {
         unsigned long      flags;
-        koib_peer_t       *peer;
-        koib_peer_t       *peer2;
+        kib_peer_t        *peer;
+        kib_peer_t        *peer2;
         
         if (nid == PTL_NID_ANY)
                 return (-EINVAL);
 
-        peer = koibnal_create_peer (nid);
+        peer = kibnal_create_peer (nid);
         if (peer == NULL)
                 return (-ENOMEM);
 
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
-        peer2 = koibnal_find_peer_locked (nid);
+        peer2 = kibnal_find_peer_locked (nid);
         if (peer2 != NULL) {
-                koibnal_put_peer (peer);
+                kibnal_put_peer (peer);
                 peer = peer2;
         } else {
                 /* peer table takes existing ref on peer */
                 list_add_tail (&peer->ibp_list,
-                               koibnal_nid2peerlist (nid));
+                               kibnal_nid2peerlist (nid));
         }
 
         peer->ibp_persistence++;
         
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
         return (0);
 }
 
 void
-koibnal_del_peer_locked (koib_peer_t *peer, int single_share)
+kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
 {
         struct list_head *ctmp;
         struct list_head *cnxt;
-        koib_conn_t      *conn;
+        kib_conn_t       *conn;
 
         if (!single_share)
                 peer->ibp_persistence = 0;
@@ -493,38 +509,38 @@ koibnal_del_peer_locked (koib_peer_t *peer, int single_share)
                 return;
 
         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
-                conn = list_entry(ctmp, koib_conn_t, ibc_list);
+                conn = list_entry(ctmp, kib_conn_t, ibc_list);
 
-                koibnal_close_conn_locked (conn, 0);
+                kibnal_close_conn_locked (conn, 0);
         }
 
         /* NB peer unlinks itself when last conn is closed */
 }
 
 int
-koibnal_del_peer (ptl_nid_t nid, int single_share)
+kibnal_del_peer (ptl_nid_t nid, int single_share)
 {
         unsigned long      flags;
         struct list_head  *ptmp;
         struct list_head  *pnxt;
-        koib_peer_t      *peer;
+        kib_peer_t        *peer;
         int                lo;
         int                hi;
         int                i;
         int                rc = -ENOENT;
 
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
         if (nid != PTL_NID_ANY)
-                lo = hi = koibnal_nid2peerlist(nid) - koibnal_data.koib_peers;
+                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
         else {
                 lo = 0;
-                hi = koibnal_data.koib_peer_hash_size - 1;
+                hi = kibnal_data.kib_peer_hash_size - 1;
         }
 
         for (i = lo; i <= hi; i++) {
-                list_for_each_safe (ptmp, pnxt, &koibnal_data.koib_peers[i]) {
-                        peer = list_entry (ptmp, koib_peer_t, ibp_list);
+                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
                                  peer->ibp_connecting != 0 ||
                                  !list_empty (&peer->ibp_conns));
@@ -532,7 +548,7 @@ koibnal_del_peer (ptl_nid_t nid, int single_share)
                         if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
                                 continue;
 
-                        koibnal_del_peer_locked (peer, single_share);
+                        kibnal_del_peer_locked (peer, single_share);
                         rc = 0;         /* matched something */
 
                         if (single_share)
@@ -540,26 +556,26 @@ koibnal_del_peer (ptl_nid_t nid, int single_share)
                 }
         }
  out:
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
         return (rc);
 }
 
-koib_conn_t *
-koibnal_get_conn_by_idx (int index)
+kib_conn_t *
+kibnal_get_conn_by_idx (int index)
 {
-        koib_peer_t       *peer;
+        kib_peer_t        *peer;
         struct list_head  *ptmp;
-        koib_conn_t       *conn;
+        kib_conn_t        *conn;
         struct list_head  *ctmp;
         int                i;
 
-        read_lock (&koibnal_data.koib_global_lock);
+        read_lock (&kibnal_data.kib_global_lock);
 
-        for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) {
-                list_for_each (ptmp, &koibnal_data.koib_peers[i]) {
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+                list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
 
-                        peer = list_entry (ptmp, koib_peer_t, ibp_list);
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence > 0 ||
                                  peer->ibp_connecting != 0 ||
                                  !list_empty (&peer->ibp_conns));
@@ -568,25 +584,25 @@ koibnal_get_conn_by_idx (int index)
                                 if (index-- > 0)
                                         continue;
 
-                                conn = list_entry (ctmp, koib_conn_t, ibc_list);
+                                conn = list_entry (ctmp, kib_conn_t, ibc_list);
                                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                                        conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
                                        atomic_read (&conn->ibc_refcount));
                                 atomic_inc (&conn->ibc_refcount);
-                                read_unlock (&koibnal_data.koib_global_lock);
+                                read_unlock (&kibnal_data.kib_global_lock);
                                 return (conn);
                         }
                 }
         }
 
-        read_unlock (&koibnal_data.koib_global_lock);
+        read_unlock (&kibnal_data.kib_global_lock);
         return (NULL);
 }
 
-koib_conn_t *
-koibnal_create_conn (void)
+kib_conn_t *
+kibnal_create_conn (void)
 {
-        koib_conn_t *conn;
+        kib_conn_t  *conn;
         int          i;
         __u64        vaddr = 0;
         __u64        vaddr_base;
@@ -608,57 +624,57 @@ koibnal_create_conn (void)
         memset (conn, 0, sizeof (*conn));
 
         INIT_LIST_HEAD (&conn->ibc_tx_queue);
-        INIT_LIST_HEAD (&conn->ibc_rdma_queue);
+        INIT_LIST_HEAD (&conn->ibc_active_txs);
         spin_lock_init (&conn->ibc_lock);
         
-        atomic_inc (&koibnal_data.koib_nconns);
+        atomic_inc (&kibnal_data.kib_nconns);
         /* well not really, but I call destroy() on failure, which decrements */
 
-        PORTAL_ALLOC (conn->ibc_rxs, OPENIBNAL_RX_MSGS * sizeof (koib_rx_t));
+        PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
         if (conn->ibc_rxs == NULL)
                 goto failed;
-        memset (conn->ibc_rxs, 0, OPENIBNAL_RX_MSGS * sizeof(koib_rx_t));
+        memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
 
-        rc = koibnal_alloc_pages(&conn->ibc_rx_pages,
-                                 OPENIBNAL_RX_MSG_PAGES,
-                                 IB_ACCESS_LOCAL_WRITE);
+        rc = kibnal_alloc_pages(&conn->ibc_rx_pages,
+                                IBNAL_RX_MSG_PAGES,
+                                IB_ACCESS_LOCAL_WRITE);
         if (rc != 0)
                 goto failed;
 
-        vaddr_base = vaddr = conn->ibc_rx_pages->oibp_vaddr;
+        vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
 
-        for (i = ipage = page_offset = 0; i < OPENIBNAL_RX_MSGS; i++) {
-                struct page *page = conn->ibc_rx_pages->oibp_pages[ipage];
-                koib_rx_t   *rx = &conn->ibc_rxs[i];
+        for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
+                struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
+                kib_rx_t   *rx = &conn->ibc_rxs[i];
 
                 rx->rx_conn = conn;
                 rx->rx_vaddr = vaddr;
-                rx->rx_msg = (koib_msg_t *)(((char *)page_address(page)) + page_offset);
+                rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
                 
-                vaddr += OPENIBNAL_MSG_SIZE;
-                LASSERT (vaddr <= vaddr_base + OPENIBNAL_RX_MSG_BYTES);
+                vaddr += IBNAL_MSG_SIZE;
+                LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
                 
-                page_offset += OPENIBNAL_MSG_SIZE;
+                page_offset += IBNAL_MSG_SIZE;
                 LASSERT (page_offset <= PAGE_SIZE);
 
                 if (page_offset == PAGE_SIZE) {
                         page_offset = 0;
                         ipage++;
-                        LASSERT (ipage <= OPENIBNAL_RX_MSG_PAGES);
+                        LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
                 }
         }
 
         params.qp_create = (struct ib_qp_create_param) {
                 .limit = {
                         /* Sends have an optional RDMA */
-                        .max_outstanding_send_request    = 2 * OPENIBNAL_MSG_QUEUE_SIZE,
-                        .max_outstanding_receive_request = OPENIBNAL_MSG_QUEUE_SIZE,
+                        .max_outstanding_send_request    = 2 * IBNAL_MSG_QUEUE_SIZE,
+                        .max_outstanding_receive_request = IBNAL_MSG_QUEUE_SIZE,
                         .max_send_gather_element         = 1,
                         .max_receive_scatter_element     = 1,
                 },
-                .pd              = koibnal_data.koib_pd,
-                .send_queue      = koibnal_data.koib_tx_cq,
-                .receive_queue   = koibnal_data.koib_rx_cq,
+                .pd              = kibnal_data.kib_pd,
+                .send_queue      = kibnal_data.kib_cq,
+                .receive_queue   = kibnal_data.kib_cq,
                 .send_policy     = IB_WQ_SIGNAL_SELECTABLE,
                 .receive_policy  = IB_WQ_SIGNAL_SELECTABLE,
                 .rd_domain       = 0,
@@ -673,11 +689,11 @@ koibnal_create_conn (void)
         }
         
         /* Mark QP created */
-        conn->ibc_state = OPENIBNAL_CONN_INIT_QP;
+        conn->ibc_state = IBNAL_CONN_INIT_QP;
 
         params.qp_attr = (struct ib_qp_attribute) {
                 .state             = IB_QP_STATE_INIT,
-                .port              = koibnal_data.koib_port,
+                .port              = kibnal_data.kib_port,
                 .enable_rdma_read  = 1,
                 .enable_rdma_write = 1,
                 .valid_fields      = (IB_QP_ATTRIBUTE_STATE |
@@ -696,12 +712,12 @@ koibnal_create_conn (void)
         return (conn);
         
  failed:
-        koibnal_destroy_conn (conn);
+        kibnal_destroy_conn (conn);
         return (NULL);
 }
 
 void
-koibnal_destroy_conn (koib_conn_t *conn)
+kibnal_destroy_conn (kib_conn_t *conn)
 {
         int    rc;
         
@@ -709,21 +725,21 @@ koibnal_destroy_conn (koib_conn_t *conn)
 
         LASSERT (atomic_read (&conn->ibc_refcount) == 0);
         LASSERT (list_empty(&conn->ibc_tx_queue));
-        LASSERT (list_empty(&conn->ibc_rdma_queue));
+        LASSERT (list_empty(&conn->ibc_active_txs));
         LASSERT (conn->ibc_nsends_posted == 0);
         LASSERT (conn->ibc_connreq == NULL);
 
         switch (conn->ibc_state) {
-        case OPENIBNAL_CONN_ZOMBIE:
+        case IBNAL_CONN_ZOMBIE:
                 /* called after connection sequence initiated */
 
-        case OPENIBNAL_CONN_INIT_QP:
+        case IBNAL_CONN_INIT_QP:
                 rc = ib_qp_destroy(conn->ibc_qp);
                 if (rc != 0)
                         CERROR("Can't destroy QP: %d\n", rc);
                 /* fall through */
                 
-        case OPENIBNAL_CONN_INIT_NOTHING:
+        case IBNAL_CONN_INIT_NOTHING:
                 break;
 
         default:
@@ -731,30 +747,30 @@ koibnal_destroy_conn (koib_conn_t *conn)
         }
 
         if (conn->ibc_rx_pages != NULL) 
-                koibnal_free_pages(conn->ibc_rx_pages);
+                kibnal_free_pages(conn->ibc_rx_pages);
         
         if (conn->ibc_rxs != NULL)
                 PORTAL_FREE(conn->ibc_rxs, 
-                            OPENIBNAL_RX_MSGS * sizeof(koib_rx_t));
+                            IBNAL_RX_MSGS * sizeof(kib_rx_t));
 
         if (conn->ibc_peer != NULL)
-                koibnal_put_peer(conn->ibc_peer);
+                kibnal_put_peer(conn->ibc_peer);
 
         PORTAL_FREE(conn, sizeof (*conn));
 
-        atomic_dec(&koibnal_data.koib_nconns);
+        atomic_dec(&kibnal_data.kib_nconns);
         
-        if (atomic_read (&koibnal_data.koib_nconns) == 0 &&
-            koibnal_data.koib_shutdown) {
+        if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
+            kibnal_data.kib_shutdown) {
                 /* I just nuked the last connection on shutdown; wake up
                  * everyone so they can exit. */
-                wake_up_all(&koibnal_data.koib_sched_waitq);
-                wake_up_all(&koibnal_data.koib_connd_waitq);
+                wake_up_all(&kibnal_data.kib_sched_waitq);
+                wake_up_all(&kibnal_data.kib_connd_waitq);
         }
 }
 
 void
-koibnal_put_conn (koib_conn_t *conn)
+kibnal_put_conn (kib_conn_t *conn)
 {
         unsigned long flags;
 
@@ -767,44 +783,44 @@ koibnal_put_conn (koib_conn_t *conn)
                 return;
 
         /* last ref only goes on zombies */
-        LASSERT (conn->ibc_state == OPENIBNAL_CONN_ZOMBIE);
+        LASSERT (conn->ibc_state == IBNAL_CONN_ZOMBIE);
 
-        spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
 
-        list_add (&conn->ibc_list, &koibnal_data.koib_connd_conns);
-        wake_up (&koibnal_data.koib_connd_waitq);
+        list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+        wake_up (&kibnal_data.kib_connd_waitq);
 
-        spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 }
 
 int
-koibnal_close_peer_conns_locked (koib_peer_t *peer, int why)
+kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
 {
-        koib_conn_t        *conn;
+        kib_conn_t         *conn;
         struct list_head   *ctmp;
         struct list_head   *cnxt;
         int                 count = 0;
 
         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
-                conn = list_entry (ctmp, koib_conn_t, ibc_list);
+                conn = list_entry (ctmp, kib_conn_t, ibc_list);
 
                 count++;
-                koibnal_close_conn_locked (conn, why);
+                kibnal_close_conn_locked (conn, why);
         }
 
         return (count);
 }
 
 int
-koibnal_close_stale_conns_locked (koib_peer_t *peer, __u64 incarnation)
+kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
 {
-        koib_conn_t        *conn;
+        kib_conn_t         *conn;
         struct list_head   *ctmp;
         struct list_head   *cnxt;
         int                 count = 0;
 
         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
-                conn = list_entry (ctmp, koib_conn_t, ibc_list);
+                conn = list_entry (ctmp, kib_conn_t, ibc_list);
 
                 if (conn->ibc_incarnation == incarnation)
                         continue;
@@ -813,17 +829,17 @@ koibnal_close_stale_conns_locked (koib_peer_t *peer, __u64 incarnation)
                        peer->ibp_nid, conn->ibc_incarnation, incarnation);
                 
                 count++;
-                koibnal_close_conn_locked (conn, -ESTALE);
+                kibnal_close_conn_locked (conn, -ESTALE);
         }
 
         return (count);
 }
 
 int
-koibnal_close_matching_conns (ptl_nid_t nid)
+kibnal_close_matching_conns (ptl_nid_t nid)
 {
         unsigned long       flags;
-        koib_peer_t        *peer;
+        kib_peer_t         *peer;
         struct list_head   *ptmp;
         struct list_head   *pnxt;
         int                 lo;
@@ -831,19 +847,19 @@ koibnal_close_matching_conns (ptl_nid_t nid)
         int                 i;
         int                 count = 0;
 
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
         if (nid != PTL_NID_ANY)
-                lo = hi = koibnal_nid2peerlist(nid) - koibnal_data.koib_peers;
+                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
         else {
                 lo = 0;
-                hi = koibnal_data.koib_peer_hash_size - 1;
+                hi = kibnal_data.kib_peer_hash_size - 1;
         }
 
         for (i = lo; i <= hi; i++) {
-                list_for_each_safe (ptmp, pnxt, &koibnal_data.koib_peers[i]) {
+                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
 
-                        peer = list_entry (ptmp, koib_peer_t, ibp_list);
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
                                  peer->ibp_connecting != 0 ||
                                  !list_empty (&peer->ibp_conns));
@@ -851,11 +867,11 @@ koibnal_close_matching_conns (ptl_nid_t nid)
                         if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
                                 continue;
 
-                        count += koibnal_close_peer_conns_locked (peer, 0);
+                        count += kibnal_close_peer_conns_locked (peer, 0);
                 }
         }
 
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
         /* wildcards always succeed */
         if (nid == PTL_NID_ANY)
@@ -865,7 +881,7 @@ koibnal_close_matching_conns (ptl_nid_t nid)
 }
 
 int
-koibnal_cmd(struct portals_cfg *pcfg, void * private)
+kibnal_cmd(struct portals_cfg *pcfg, void * private)
 {
         int rc = -EINVAL;
 
@@ -876,8 +892,8 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private)
                 ptl_nid_t   nid = 0;
                 int         share_count = 0;
 
-                rc = koibnal_get_peer_info(pcfg->pcfg_count,
-                                           &nid, &share_count);
+                rc = kibnal_get_peer_info(pcfg->pcfg_count,
+                                          &nid, &share_count);
                 pcfg->pcfg_nid   = nid;
                 pcfg->pcfg_size  = 0;
                 pcfg->pcfg_id    = 0;
@@ -887,17 +903,17 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private)
                 break;
         }
         case NAL_CMD_ADD_PEER: {
-                rc = koibnal_add_persistent_peer (pcfg->pcfg_nid);
+                rc = kibnal_add_persistent_peer (pcfg->pcfg_nid);
                 break;
         }
         case NAL_CMD_DEL_PEER: {
-                rc = koibnal_del_peer (pcfg->pcfg_nid, 
+                rc = kibnal_del_peer (pcfg->pcfg_nid, 
                                        /* flags == single_share */
                                        pcfg->pcfg_flags != 0);
                 break;
         }
         case NAL_CMD_GET_CONN: {
-                koib_conn_t *conn = koibnal_get_conn_by_idx (pcfg->pcfg_count);
+                kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
 
                 if (conn == NULL)
                         rc = -ENOENT;
@@ -907,19 +923,19 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private)
                         pcfg->pcfg_id    = 0;
                         pcfg->pcfg_misc  = 0;
                         pcfg->pcfg_flags = 0;
-                        koibnal_put_conn (conn);
+                        kibnal_put_conn (conn);
                 }
                 break;
         }
         case NAL_CMD_CLOSE_CONNECTION: {
-                rc = koibnal_close_matching_conns (pcfg->pcfg_nid);
+                rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
                 break;
         }
         case NAL_CMD_REGISTER_MYNID: {
                 if (pcfg->pcfg_nid == PTL_NID_ANY)
                         rc = -EINVAL;
                 else
-                        rc = koibnal_set_mynid (pcfg->pcfg_nid);
+                        rc = kibnal_set_mynid (pcfg->pcfg_nid);
                 break;
         }
         }
@@ -928,47 +944,47 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private)
 }
 
 void
-koibnal_free_pages (koib_pages_t *p)
+kibnal_free_pages (kib_pages_t *p)
 {
-        int     npages = p->oibp_npages;
+        int     npages = p->ibp_npages;
         int     rc;
         int     i;
         
-        if (p->oibp_mapped) {
-                rc = ib_memory_deregister(p->oibp_handle);
+        if (p->ibp_mapped) {
+                rc = ib_memory_deregister(p->ibp_handle);
                 if (rc != 0)
                         CERROR ("Deregister error: %d\n", rc);
         }
         
         for (i = 0; i < npages; i++)
-                if (p->oibp_pages[i] != NULL)
-                        __free_page(p->oibp_pages[i]);
+                if (p->ibp_pages[i] != NULL)
+                        __free_page(p->ibp_pages[i]);
         
-        PORTAL_FREE (p, offsetof(koib_pages_t, oibp_pages[npages]));
+        PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
 }
 
 int
-koibnal_alloc_pages (koib_pages_t **pp, int npages, int access)
+kibnal_alloc_pages (kib_pages_t **pp, int npages, int access)
 {
-        koib_pages_t               *p;
+        kib_pages_t                *p;
         struct ib_physical_buffer  *phys_pages;
         int                         i;
         int                         rc;
 
-        PORTAL_ALLOC(p, offsetof(koib_pages_t, oibp_pages[npages]));
+        PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
         if (p == NULL) {
                 CERROR ("Can't allocate buffer %d\n", npages);
                 return (-ENOMEM);
         }
 
-        memset (p, 0, offsetof(koib_pages_t, oibp_pages[npages]));
-        p->oibp_npages = npages;
+        memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
+        p->ibp_npages = npages;
         
         for (i = 0; i < npages; i++) {
-                p->oibp_pages[i] = alloc_page (GFP_KERNEL);
-                if (p->oibp_pages[i] == NULL) {
+                p->ibp_pages[i] = alloc_page (GFP_KERNEL);
+                if (p->ibp_pages[i] == NULL) {
                         CERROR ("Can't allocate page %d of %d\n", i, npages);
-                        koibnal_free_pages(p);
+                        kibnal_free_pages(p);
                         return (-ENOMEM);
                 }
         }
@@ -976,96 +992,96 @@ koibnal_alloc_pages (koib_pages_t **pp, int npages, int access)
         PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
         if (phys_pages == NULL) {
                 CERROR ("Can't allocate physarray for %d pages\n", npages);
-                koibnal_free_pages(p);
+                kibnal_free_pages(p);
                 return (-ENOMEM);
         }
 
         for (i = 0; i < npages; i++) {
                 phys_pages[i].size = PAGE_SIZE;
                 phys_pages[i].address =
-                        koibnal_page2phys(p->oibp_pages[i]);
+                        kibnal_page2phys(p->ibp_pages[i]);
         }
 
-        p->oibp_vaddr = 0;
-        rc = ib_memory_register_physical(koibnal_data.koib_pd,
+        p->ibp_vaddr = 0;
+        rc = ib_memory_register_physical(kibnal_data.kib_pd,
                                          phys_pages, npages,
-                                         &p->oibp_vaddr,
+                                         &p->ibp_vaddr,
                                          npages * PAGE_SIZE, 0,
                                          access,
-                                         &p->oibp_handle,
-                                         &p->oibp_lkey,
-                                         &p->oibp_rkey);
+                                         &p->ibp_handle,
+                                         &p->ibp_lkey,
+                                         &p->ibp_rkey);
         
         PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
         
         if (rc != 0) {
                 CERROR ("Error %d mapping %d pages\n", rc, npages);
-                koibnal_free_pages(p);
+                kibnal_free_pages(p);
                 return (rc);
         }
         
-        p->oibp_mapped = 1;
+        p->ibp_mapped = 1;
         *pp = p;
         return (0);
 }
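
kibnal_alloc_pages() leans on the fact that kibnal_free_pages() tolerates a partially built object, so every failure path can simply call it and return. A self-contained userspace sketch of that allocate-then-unwind pattern, with plain malloc standing in for alloc_page() and the IB registration:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct pages {
        int   npages;
        void *page[];                   /* C99 flexible array member */
};

static void free_pages_obj(struct pages *p)
{
        for (int i = 0; i < p->npages; i++)
                free(p->page[i]);       /* free(NULL) is a no-op */
        free(p);
}

static struct pages *alloc_pages_obj(int npages)
{
        struct pages *p = calloc(1, offsetof(struct pages, page[npages]));

        if (p == NULL)
                return NULL;
        p->npages = npages;

        for (int i = 0; i < npages; i++) {
                p->page[i] = malloc(4096);
                if (p->page[i] == NULL) {
                        free_pages_obj(p);      /* unwind partial state */
                        return NULL;
                }
        }
        return p;
}

int main(void)
{
        struct pages *p = alloc_pages_obj(8);

        printf("%s\n", p != NULL ? "allocated" : "failed");
        if (p != NULL)
                free_pages_obj(p);
        return 0;
}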
 
 int
-koibnal_setup_tx_descs (void)
+kibnal_setup_tx_descs (void)
 {
         int           ipage = 0;
         int           page_offset = 0;
         __u64         vaddr;
         __u64         vaddr_base;
         struct page  *page;
-        koib_tx_t    *tx;
+        kib_tx_t     *tx;
         int           i;
         int           rc;
 
         /* pre-mapped messages are not bigger than 1 page */
-        LASSERT (OPENIBNAL_MSG_SIZE <= PAGE_SIZE);
+        LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
 
         /* No fancy arithmetic when we do the buffer calculations */
-        LASSERT (PAGE_SIZE % OPENIBNAL_MSG_SIZE == 0);
+        LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
 
-        rc = koibnal_alloc_pages(&koibnal_data.koib_tx_pages,
-                                 OPENIBNAL_TX_MSG_PAGES, 
-                                 0);            /* local read access only */
+        rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
+                                IBNAL_TX_MSG_PAGES, 
+                                0);            /* local read access only */
         if (rc != 0)
                 return (rc);
 
-        vaddr = vaddr_base = koibnal_data.koib_tx_pages->oibp_vaddr;
+        vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
 
-        for (i = 0; i < OPENIBNAL_TX_MSGS; i++) {
-                page = koibnal_data.koib_tx_pages->oibp_pages[ipage];
-                tx = &koibnal_data.koib_tx_descs[i];
+        for (i = 0; i < IBNAL_TX_MSGS; i++) {
+                page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
+                tx = &kibnal_data.kib_tx_descs[i];
 
                 memset (tx, 0, sizeof(*tx));    /* zero flags etc */
                 
-                tx->tx_msg = (koib_msg_t *)(((char *)page_address(page)) + page_offset);
+                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
                 tx->tx_vaddr = vaddr;
-                tx->tx_isnblk = (i >= OPENIBNAL_NTX);
-                tx->tx_mapped = KOIB_TX_UNMAPPED;
+                tx->tx_isnblk = (i >= IBNAL_NTX);
+                tx->tx_mapped = KIB_TX_UNMAPPED;
 
                 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", 
                        i, tx, tx->tx_msg, tx->tx_vaddr);
 
                 if (tx->tx_isnblk)
                         list_add (&tx->tx_list, 
-                                  &koibnal_data.koib_idle_nblk_txs);
+                                  &kibnal_data.kib_idle_nblk_txs);
                 else
                         list_add (&tx->tx_list, 
-                                  &koibnal_data.koib_idle_txs);
+                                  &kibnal_data.kib_idle_txs);
 
-                vaddr += OPENIBNAL_MSG_SIZE;
-                LASSERT (vaddr <= vaddr_base + OPENIBNAL_TX_MSG_BYTES);
+                vaddr += IBNAL_MSG_SIZE;
+                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
 
-                page_offset += OPENIBNAL_MSG_SIZE;
+                page_offset += IBNAL_MSG_SIZE;
                 LASSERT (page_offset <= PAGE_SIZE);
 
                 if (page_offset == PAGE_SIZE) {
                         page_offset = 0;
                         ipage++;
-                        LASSERT (ipage <= OPENIBNAL_TX_MSG_PAGES);
+                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
                 }
         }
         
@@ -1073,7 +1089,7 @@ koibnal_setup_tx_descs (void)
 }
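
The page/offset walk in kibnal_setup_tx_descs() is driven entirely by the
IBNAL_* tunables defined in openibnal.h below. As a sanity check, the
following minimal userland sketch replays the same arithmetic; the 4KiB
PAGE_SIZE is an assumption here, and with the default 64 + 256 descriptors
each 4KiB message fills exactly one page:

    #include <assert.h>
    #include <stdio.h>

    #define PAGE_SIZE          4096                 /* assumption: 4KiB pages */
    #define IBNAL_MSG_SIZE     (4<<10)              /* from openibnal.h */
    #define IBNAL_NTX          64                   /* # tx descs */
    #define IBNAL_NTX_NBLK     256                  /* # reserved tx descs */
    #define IBNAL_TX_MSGS      (IBNAL_NTX + IBNAL_NTX_NBLK)
    #define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
    #define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)

    int main (void)
    {
            int ipage = 0;
            int page_offset = 0;
            int i;

            for (i = 0; i < IBNAL_TX_MSGS; i++) {
                    page_offset += IBNAL_MSG_SIZE;
                    assert (page_offset <= PAGE_SIZE);
                    if (page_offset == PAGE_SIZE) {
                            page_offset = 0;
                            ipage++;
                    }
            }
            /* 320 messages at one per page -> 320 pages, as derived */
            printf ("%d msgs in %d pages (IBNAL_TX_MSG_PAGES = %d)\n",
                    i, ipage, IBNAL_TX_MSG_PAGES);
            assert (ipage <= IBNAL_TX_MSG_PAGES);
            return 0;
    }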
 
 void
-koibnal_api_shutdown (nal_t *nal)
+kibnal_api_shutdown (nal_t *nal)
 {
         int   i;
         int   rc;
@@ -1087,119 +1103,113 @@ koibnal_api_shutdown (nal_t *nal)
         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
                atomic_read (&portal_kmemory));
 
-        LASSERT(nal == &koibnal_api);
+        LASSERT(nal == &kibnal_api);
 
-        switch (koibnal_data.koib_init) {
+        switch (kibnal_data.kib_init) {
         default:
-                CERROR ("Unexpected state %d\n", koibnal_data.koib_init);
+                CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
                 LBUG();
 
-        case OPENIBNAL_INIT_ALL:
+        case IBNAL_INIT_ALL:
                 /* stop calls to nal_cmd */
                 libcfs_nal_cmd_unregister(OPENIBNAL);
                 /* No new peers */
 
                 /* resetting my NID to unadvertises me, removes my
                  * listener and nukes all current peers */
-                koibnal_set_mynid (PTL_NID_ANY);
+                kibnal_set_mynid (PTL_NID_ANY);
 
                 /* Wait for all peer state to clean up */
                 i = 2;
-                while (atomic_read (&koibnal_data.koib_npeers) != 0) {
+                while (atomic_read (&kibnal_data.kib_npeers) != 0) {
                         i++;
                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                                "waiting for %d peers to close down\n",
-                               atomic_read (&koibnal_data.koib_npeers));
+                               atomic_read (&kibnal_data.kib_npeers));
                         set_current_state (TASK_INTERRUPTIBLE);
                         schedule_timeout (HZ);
                 }
                 /* fall through */
 
-        case OPENIBNAL_INIT_TX_CQ:
-                rc = ib_cq_destroy (koibnal_data.koib_tx_cq);
-                if (rc != 0)
-                        CERROR ("Destroy tx CQ error: %d\n", rc);
-                /* fall through */
-
-        case OPENIBNAL_INIT_RX_CQ:
-                rc = ib_cq_destroy (koibnal_data.koib_rx_cq);
+        case IBNAL_INIT_CQ:
+                rc = ib_cq_destroy (kibnal_data.kib_cq);
                 if (rc != 0)
-                        CERROR ("Destroy rx CQ error: %d\n", rc);
+                        CERROR ("Destroy CQ error: %d\n", rc);
                 /* fall through */
 
-        case OPENIBNAL_INIT_TXD:
-                koibnal_free_pages (koibnal_data.koib_tx_pages);
+        case IBNAL_INIT_TXD:
+                kibnal_free_pages (kibnal_data.kib_tx_pages);
                 /* fall through */
-#if OPENIBNAL_FMR
-        case OPENIBNAL_INIT_FMR:
-                rc = ib_fmr_pool_destroy (koibnal_data.koib_fmr_pool);
+#if IBNAL_FMR
+        case IBNAL_INIT_FMR:
+                rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
                 if (rc != 0)
                         CERROR ("Destroy FMR pool error: %d\n", rc);
                 /* fall through */
 #endif
-        case OPENIBNAL_INIT_PD:
-                rc = ib_pd_destroy(koibnal_data.koib_pd);
+        case IBNAL_INIT_PD:
+                rc = ib_pd_destroy(kibnal_data.kib_pd);
                 if (rc != 0)
                         CERROR ("Destroy PD error: %d\n", rc);
                 /* fall through */
 
-        case OPENIBNAL_INIT_LIB:
-                lib_fini(&koibnal_lib);
+        case IBNAL_INIT_LIB:
+                lib_fini(&kibnal_lib);
                 /* fall through */
 
-        case OPENIBNAL_INIT_DATA:
+        case IBNAL_INIT_DATA:
                 /* Module refcount only gets to zero when all peers
                  * have been closed so all lists must be empty */
-                LASSERT (atomic_read (&koibnal_data.koib_npeers) == 0);
-                LASSERT (koibnal_data.koib_peers != NULL);
-                for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) {
-                        LASSERT (list_empty (&koibnal_data.koib_peers[i]));
+                LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
+                LASSERT (kibnal_data.kib_peers != NULL);
+                for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+                        LASSERT (list_empty (&kibnal_data.kib_peers[i]));
                 }
-                LASSERT (atomic_read (&koibnal_data.koib_nconns) == 0);
-                LASSERT (list_empty (&koibnal_data.koib_sched_rxq));
-                LASSERT (list_empty (&koibnal_data.koib_sched_txq));
-                LASSERT (list_empty (&koibnal_data.koib_connd_conns));
-                LASSERT (list_empty (&koibnal_data.koib_connd_peers));
+                LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
+                LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
+                LASSERT (list_empty (&kibnal_data.kib_sched_txq));
+                LASSERT (list_empty (&kibnal_data.kib_connd_conns));
+                LASSERT (list_empty (&kibnal_data.kib_connd_peers));
 
                 /* flag threads to terminate; wake and wait for them to die */
-                koibnal_data.koib_shutdown = 1;
-                wake_up_all (&koibnal_data.koib_sched_waitq);
-                wake_up_all (&koibnal_data.koib_connd_waitq);
+                kibnal_data.kib_shutdown = 1;
+                wake_up_all (&kibnal_data.kib_sched_waitq);
+                wake_up_all (&kibnal_data.kib_connd_waitq);
 
                 i = 2;
-                while (atomic_read (&koibnal_data.koib_nthreads) != 0) {
+                while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
                         i++;
                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                                "Waiting for %d threads to terminate\n",
-                               atomic_read (&koibnal_data.koib_nthreads));
+                               atomic_read (&kibnal_data.kib_nthreads));
                         set_current_state (TASK_INTERRUPTIBLE);
                         schedule_timeout (HZ);
                 }
                 /* fall through */
                 
-        case OPENIBNAL_INIT_NOTHING:
+        case IBNAL_INIT_NOTHING:
                 break;
         }
 
-        if (koibnal_data.koib_tx_descs != NULL)
-                PORTAL_FREE (koibnal_data.koib_tx_descs,
-                             OPENIBNAL_TX_MSGS * sizeof(koib_tx_t));
+        if (kibnal_data.kib_tx_descs != NULL)
+                PORTAL_FREE (kibnal_data.kib_tx_descs,
+                             IBNAL_TX_MSGS * sizeof(kib_tx_t));
 
-        if (koibnal_data.koib_peers != NULL)
-                PORTAL_FREE (koibnal_data.koib_peers,
+        if (kibnal_data.kib_peers != NULL)
+                PORTAL_FREE (kibnal_data.kib_peers,
                              sizeof (struct list_head) * 
-                             koibnal_data.koib_peer_hash_size);
+                             kibnal_data.kib_peer_hash_size);
 
         CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
                atomic_read (&portal_kmemory));
         printk(KERN_INFO "Lustre: OpenIB NAL unloaded (final mem %d)\n",
                atomic_read(&portal_kmemory));
 
-        koibnal_data.koib_init = OPENIBNAL_INIT_NOTHING;
+        kibnal_data.kib_init = IBNAL_INIT_NOTHING;
 }
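
The (i & (-i)) == i test in both wait loops above is the standard
two's-complement trick: i & -i isolates the lowest set bit of i, so the
comparison succeeds exactly when i is a power of two. The effect is
exponential log throttling, with only iterations 2, 4, 8, 16, ... promoted
from D_NET to D_WARNING. A standalone illustration:

    #include <stdio.h>

    int main (void)
    {
            int i;

            for (i = 2; i <= 32; i++)
                    /* i & -i keeps only the lowest set bit of i */
                    if ((i & (-i)) == i)
                            printf ("i = %2d -> D_WARNING\n", i);
            return 0;
    }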
 
 int
-koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
+kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                      ptl_ni_limits_t *requested_limits,
                      ptl_ni_limits_t *actual_limits)
 {
@@ -1208,65 +1218,66 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         int               rc;
         int               i;
 
-        LASSERT (nal == &koibnal_api);
+        LASSERT (nal == &kibnal_api);
 
         if (nal->nal_refct != 0) {
                 if (actual_limits != NULL)
-                        *actual_limits = koibnal_lib.libnal_ni.ni_actual_limits;
+                        *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
                 /* This module got the first ref */
                 PORTAL_MODULE_USE;
                 return (PTL_OK);
         }
 
-        LASSERT (koibnal_data.koib_init == OPENIBNAL_INIT_NOTHING);
+        LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
 
-        memset (&koibnal_data, 0, sizeof (koibnal_data)); /* zero pointers, flags etc */
+        memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */
 
-        init_MUTEX (&koibnal_data.koib_nid_mutex);
-        init_MUTEX_LOCKED (&koibnal_data.koib_nid_signal);
-        koibnal_data.koib_nid = PTL_NID_ANY;
+        init_MUTEX (&kibnal_data.kib_nid_mutex);
+        init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal);
+        kibnal_data.kib_nid = PTL_NID_ANY;
 
-        rwlock_init(&koibnal_data.koib_global_lock);
+        rwlock_init(&kibnal_data.kib_global_lock);
 
-        koibnal_data.koib_peer_hash_size = OPENIBNAL_PEER_HASH_SIZE;
-        PORTAL_ALLOC (koibnal_data.koib_peers,
-                      sizeof (struct list_head) * koibnal_data.koib_peer_hash_size);
-        if (koibnal_data.koib_peers == NULL) {
+        kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
+        PORTAL_ALLOC (kibnal_data.kib_peers,
+                      sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
+        if (kibnal_data.kib_peers == NULL) {
                 goto failed;
         }
-        for (i = 0; i < koibnal_data.koib_peer_hash_size; i++)
-                INIT_LIST_HEAD(&koibnal_data.koib_peers[i]);
-
-        spin_lock_init (&koibnal_data.koib_connd_lock);
-        INIT_LIST_HEAD (&koibnal_data.koib_connd_peers);
-        INIT_LIST_HEAD (&koibnal_data.koib_connd_conns);
-        init_waitqueue_head (&koibnal_data.koib_connd_waitq);
-
-        spin_lock_init (&koibnal_data.koib_sched_lock);
-        INIT_LIST_HEAD (&koibnal_data.koib_sched_txq);
-        INIT_LIST_HEAD (&koibnal_data.koib_sched_rxq);
-        init_waitqueue_head (&koibnal_data.koib_sched_waitq);
-
-        spin_lock_init (&koibnal_data.koib_tx_lock);
-        INIT_LIST_HEAD (&koibnal_data.koib_idle_txs);
-        INIT_LIST_HEAD (&koibnal_data.koib_idle_nblk_txs);
-        init_waitqueue_head(&koibnal_data.koib_idle_tx_waitq);
-
-        PORTAL_ALLOC (koibnal_data.koib_tx_descs,
-                      OPENIBNAL_TX_MSGS * sizeof(koib_tx_t));
-        if (koibnal_data.koib_tx_descs == NULL) {
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
+                INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
+
+        spin_lock_init (&kibnal_data.kib_connd_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
+        INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
+        init_waitqueue_head (&kibnal_data.kib_connd_waitq);
+
+        spin_lock_init (&kibnal_data.kib_sched_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
+        INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
+        init_waitqueue_head (&kibnal_data.kib_sched_waitq);
+
+        spin_lock_init (&kibnal_data.kib_tx_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
+        INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
+        init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
+
+        PORTAL_ALLOC (kibnal_data.kib_tx_descs,
+                      IBNAL_TX_MSGS * sizeof(kib_tx_t));
+        if (kibnal_data.kib_tx_descs == NULL) {
                 CERROR ("Can't allocate tx descs\n");
                 goto failed;
         }
 
         /* lists/ptrs/locks initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_DATA;
+        kibnal_data.kib_init = IBNAL_INIT_DATA;
         /*****************************************************/
 
+
         process_id.pid = requested_pid;
-        process_id.nid = koibnal_data.koib_nid;
+        process_id.nid = kibnal_data.kib_nid;
         
-        rc = lib_init(&koibnal_lib, nal, process_id,
+        rc = lib_init(&kibnal_lib, nal, process_id,
                       requested_limits, actual_limits);
         if (rc != PTL_OK) {
                 CERROR("lib_init failed: error %d\n", rc);
@@ -1274,11 +1285,11 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         }
 
         /* lib interface initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_LIB;
+        kibnal_data.kib_init = IBNAL_INIT_LIB;
         /*****************************************************/
 
-        for (i = 0; i < OPENIBNAL_N_SCHED; i++) {
-                rc = koibnal_thread_start (koibnal_scheduler, (void *)i);
+        for (i = 0; i < IBNAL_N_SCHED; i++) {
+                rc = kibnal_thread_start (kibnal_scheduler, (void *)i);
                 if (rc != 0) {
                         CERROR("Can't spawn openibnal scheduler[%d]: %d\n",
                                i, rc);
@@ -1286,56 +1297,56 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                 }
         }
 
-        rc = koibnal_thread_start (koibnal_connd, NULL);
+        rc = kibnal_thread_start (kibnal_connd, NULL);
         if (rc != 0) {
                 CERROR ("Can't spawn openibnal connd: %d\n", rc);
                 goto failed;
         }
 
-        koibnal_data.koib_device = ib_device_get_by_index(0);
-        if (koibnal_data.koib_device == NULL) {
+        kibnal_data.kib_device = ib_device_get_by_index(0);
+        if (kibnal_data.kib_device == NULL) {
                 CERROR ("Can't open ib device 0\n");
                 goto failed;
         }
         
-        rc = ib_device_properties_get(koibnal_data.koib_device,
-                                      &koibnal_data.koib_device_props);
+        rc = ib_device_properties_get(kibnal_data.kib_device,
+                                      &kibnal_data.kib_device_props);
         if (rc != 0) {
                 CERROR ("Can't get device props: %d\n", rc);
                 goto failed;
         }
 
         CDEBUG(D_NET, "Max Initiator: %d Max Responder %d\n", 
-               koibnal_data.koib_device_props.max_initiator_per_qp,
-               koibnal_data.koib_device_props.max_responder_per_qp);
+               kibnal_data.kib_device_props.max_initiator_per_qp,
+               kibnal_data.kib_device_props.max_responder_per_qp);
 
-        koibnal_data.koib_port = 0;
+        kibnal_data.kib_port = 0;
         for (i = 1; i <= 2; i++) {
-                rc = ib_port_properties_get(koibnal_data.koib_device, i,
-                                            &koibnal_data.koib_port_props);
+                rc = ib_port_properties_get(kibnal_data.kib_device, i,
+                                            &kibnal_data.kib_port_props);
                 if (rc == 0) {
-                        koibnal_data.koib_port = i;
+                        kibnal_data.kib_port = i;
                         break;
                 }
         }
-        if (koibnal_data.koib_port == 0) {
+        if (kibnal_data.kib_port == 0) {
                 CERROR ("Can't find a port\n");
                 goto failed;
         }
 
-        rc = ib_pd_create(koibnal_data.koib_device,
-                          NULL, &koibnal_data.koib_pd);
+        rc = ib_pd_create(kibnal_data.kib_device,
+                          NULL, &kibnal_data.kib_pd);
         if (rc != 0) {
                 CERROR ("Can't create PD: %d\n", rc);
                 goto failed;
         }
         
         /* flag PD initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_PD;
+        kibnal_data.kib_init = IBNAL_INIT_PD;
         /*****************************************************/
-#if OPENIBNAL_FMR
+#if IBNAL_FMR
         {
-                const int pool_size = OPENIBNAL_NTX + OPENIBNAL_NTX_NBLK;
+                const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
                 struct ib_fmr_pool_param params = {
                         .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
                         .access            = (IB_ACCESS_LOCAL_WRITE |
@@ -1347,8 +1358,8 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                         .flush_arg         = NULL,
                         .cache             = 1,
                 };
-                rc = ib_fmr_pool_create(koibnal_data.koib_pd, &params,
-                                        &koibnal_data.koib_fmr_pool);
+                rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params,
+                                        &kibnal_data.kib_fmr_pool);
                 if (rc != 0) {
                         CERROR ("Can't create FMR pool size %d: %d\n", 
                                 pool_size, rc);
@@ -1357,84 +1368,56 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         }
 
         /* flag FMR pool initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_FMR;
+        kibnal_data.kib_init = IBNAL_INIT_FMR;
 #endif
         /*****************************************************/
 
-        rc = koibnal_setup_tx_descs();
+        rc = kibnal_setup_tx_descs();
         if (rc != 0) {
                 CERROR ("Can't register tx descs: %d\n", rc);
                 goto failed;
         }
         
         /* flag TX descs initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_TXD;
+        kibnal_data.kib_init = IBNAL_INIT_TXD;
         /*****************************************************/
         
         {
                 struct ib_cq_callback callback = {
-                        .context        = OPENIBNAL_CALLBACK_CTXT,
+                        .context        = IBNAL_CALLBACK_CTXT,
                         .policy         = IB_CQ_PROVIDER_REARM,
                         .function       = {
-                                .entry  = koibnal_rx_callback,
+                                .entry  = kibnal_callback,
                         },
                         .arg            = NULL,
                 };
-                int  nentries = OPENIBNAL_RX_CQ_ENTRIES;
+                int  nentries = IBNAL_CQ_ENTRIES;
                 
-                rc = ib_cq_create (koibnal_data.koib_device, 
+                rc = ib_cq_create (kibnal_data.kib_device, 
                                    &nentries, &callback, NULL,
-                                   &koibnal_data.koib_rx_cq);
+                                   &kibnal_data.kib_cq);
                 if (rc != 0) {
-                        CERROR ("Can't create RX CQ: %d\n", rc);
+                        CERROR ("Can't create CQ: %d\n", rc);
                         goto failed;
                 }
 
                 /* I only want solicited events */
-                rc = ib_cq_request_notification(koibnal_data.koib_rx_cq, 1);
+                rc = ib_cq_request_notification(kibnal_data.kib_cq, 1);
                 LASSERT (rc == 0);
         }
         
-        /* flag RX CQ initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_RX_CQ;
-        /*****************************************************/
-
-        {
-                struct ib_cq_callback callback = {
-                        .context        = OPENIBNAL_CALLBACK_CTXT,
-                        .policy         = IB_CQ_PROVIDER_REARM,
-                        .function       = {
-                                .entry  = koibnal_tx_callback,
-                        },
-                        .arg            = NULL,
-                };
-                int  nentries = OPENIBNAL_TX_CQ_ENTRIES;
-                
-                rc = ib_cq_create (koibnal_data.koib_device, 
-                                   &nentries, &callback, NULL,
-                                   &koibnal_data.koib_tx_cq);
-                if (rc != 0) {
-                        CERROR ("Can't create RX CQ: %d\n", rc);
-                        goto failed;
-                }
-
-                /* I only want solicited events */
-                rc = ib_cq_request_notification(koibnal_data.koib_tx_cq, 1);
-                LASSERT (rc == 0);
-        }
-                                   
-        /* flag TX CQ initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_TX_CQ;
+        /* flag CQ initialised */
+        kibnal_data.kib_init = IBNAL_INIT_CQ;
         /*****************************************************/
         
-        rc = libcfs_nal_cmd_register(OPENIBNAL, &koibnal_cmd, NULL);
+        rc = libcfs_nal_cmd_register(OPENIBNAL, &kibnal_cmd, NULL);
         if (rc != 0) {
                 CERROR ("Can't initialise command interface (rc = %d)\n", rc);
                 goto failed;
         }
 
         /* flag everything initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_ALL;
+        kibnal_data.kib_init = IBNAL_INIT_ALL;
         /*****************************************************/
 
         printk(KERN_INFO "Lustre: OpenIB NAL loaded "
@@ -1443,44 +1426,44 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         return (PTL_OK);
 
  failed:
-        koibnal_api_shutdown (&koibnal_api);    
+        kibnal_api_shutdown (&kibnal_api);    
         return (PTL_FAIL);
 }
 
 void __exit
-koibnal_module_fini (void)
+kibnal_module_fini (void)
 {
 #ifdef CONFIG_SYSCTL
-        if (koibnal_tunables.koib_sysctl != NULL)
-                unregister_sysctl_table (koibnal_tunables.koib_sysctl);
+        if (kibnal_tunables.kib_sysctl != NULL)
+                unregister_sysctl_table (kibnal_tunables.kib_sysctl);
 #endif
-        PtlNIFini(koibnal_ni);
+        PtlNIFini(kibnal_ni);
 
         ptl_unregister_nal(OPENIBNAL);
 }
 
 int __init
-koibnal_module_init (void)
+kibnal_module_init (void)
 {
         int    rc;
 
         /* the following must be sizeof(int) for proc_dointvec() */
-        LASSERT(sizeof (koibnal_tunables.koib_io_timeout) == sizeof (int));
+        LASSERT(sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int));
 
-        koibnal_api.nal_ni_init = koibnal_api_startup;
-        koibnal_api.nal_ni_fini = koibnal_api_shutdown;
+        kibnal_api.nal_ni_init = kibnal_api_startup;
+        kibnal_api.nal_ni_fini = kibnal_api_shutdown;
 
         /* Initialise dynamic tunables to defaults once only */
-        koibnal_tunables.koib_io_timeout = OPENIBNAL_IO_TIMEOUT;
+        kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
 
-        rc = ptl_register_nal(OPENIBNAL, &koibnal_api);
+        rc = ptl_register_nal(OPENIBNAL, &kibnal_api);
         if (rc != PTL_OK) {
-                CERROR("Can't register OPENIBNAL: %d\n", rc);
+                CERROR("Can't register IBNAL: %d\n", rc);
                 return (-ENOMEM);               /* or something... */
         }
 
         /* Pure gateways want the NAL started up at module load time... */
-        rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &koibnal_ni);
+        rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
         if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
                 ptl_unregister_nal(OPENIBNAL);
                 return (-ENODEV);
@@ -1488,8 +1471,8 @@ koibnal_module_init (void)
         
 #ifdef CONFIG_SYSCTL
         /* Press on regardless even if registering sysctl doesn't work */
-        koibnal_tunables.koib_sysctl = 
-                register_sysctl_table (koibnal_top_ctl_table, 0);
+        kibnal_tunables.kib_sysctl = 
+                register_sysctl_table (kibnal_top_ctl_table, 0);
 #endif
         return (0);
 }
@@ -1498,6 +1481,6 @@ MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
 MODULE_DESCRIPTION("Kernel OpenIB NAL v0.01");
 MODULE_LICENSE("GPL");
 
-module_init(koibnal_module_init);
-module_exit(koibnal_module_fini);
+module_init(kibnal_module_init);
+module_exit(kibnal_module_fini);
 
index 301d3ae..f0610f2 100644
@@ -48,7 +48,7 @@
 #include <linux/kmod.h>
 #include <linux/sysctl.h>
 
-#define DEBUG_SUBSYSTEM S_OPENIBNAL
+#define DEBUG_SUBSYSTEM S_IBNAL
 
 #include <linux/kp30.h>
 #include <portals/p30.h>
 #include <ts_ib_cm.h>
 #include <ts_ib_sa_client.h>
 
-#define OPENIBNAL_SERVICE_NAME   "openibnal"
+#define IBNAL_SERVICE_NAME   "openibnal"
 
 #if CONFIG_SMP
-# define OPENIBNAL_N_SCHED      num_online_cpus() /* # schedulers */
+# define IBNAL_N_SCHED      num_online_cpus()   /* # schedulers */
 #else
-# define OPENIBNAL_N_SCHED      1                 /* # schedulers */
+# define IBNAL_N_SCHED      1                   /* # schedulers */
 #endif
 
-#define OPENIBNAL_MIN_RECONNECT_INTERVAL HZ       /* first failed connection retry... */
-#define OPENIBNAL_MAX_RECONNECT_INTERVAL (60*HZ)  /* ...exponentially increasing to this */
+#define IBNAL_MIN_RECONNECT_INTERVAL HZ         /* first failed connection retry... */
+#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ)    /* ...exponentially increasing to this */
 
-#define OPENIBNAL_MSG_SIZE       (4<<10)          /* max size of queued messages (inc hdr) */
+#define IBNAL_MSG_SIZE       (4<<10)            /* max size of queued messages (inc hdr) */
 
-#define OPENIBNAL_MSG_QUEUE_SIZE   8              /* # messages in-flight */
-#define OPENIBNAL_CREDIT_HIGHWATER 6              /* when to eagerly return credits */
-#define OPENIBNAL_RETRY            7              /* # times to retry */
-#define OPENIBNAL_RNR_RETRY        7              /*  */
-#define OPENIBNAL_CM_RETRY         7              /* # times to retry connection */
-#define OPENIBNAL_FLOW_CONTROL     1
-#define OPENIBNAL_RESPONDER_RESOURCES 8
+#define IBNAL_MSG_QUEUE_SIZE   8                /* # messages/RDMAs in-flight */
+#define IBNAL_CREDIT_HIGHWATER 6                /* when to eagerly return credits */
+#define IBNAL_RETRY            7                /* # times to retry */
+#define IBNAL_RNR_RETRY        7                /*  */
+#define IBNAL_CM_RETRY         7                /* # times to retry connection */
+#define IBNAL_FLOW_CONTROL     1
+#define IBNAL_RESPONDER_RESOURCES 8
 
-#define OPENIBNAL_NTX             64              /* # tx descs */
-#define OPENIBNAL_NTX_NBLK        256             /* # reserved tx descs */
+#define IBNAL_NTX             64                /* # tx descs */
+#define IBNAL_NTX_NBLK        256               /* # reserved tx descs */
 
-#define OPENIBNAL_PEER_HASH_SIZE  101             /* # peer lists */
+#define IBNAL_PEER_HASH_SIZE  101               /* # peer lists */
 
-#define OPENIBNAL_RESCHED         100             /* # scheduler loops before reschedule */
+#define IBNAL_RESCHED         100               /* # scheduler loops before reschedule */
 
-#define OPENIBNAL_CONCURRENT_PEERS 1000           /* # nodes all talking at once to me */
+#define IBNAL_CONCURRENT_PEERS 1000             /* # nodes all talking at once to me */
 
 /* default vals for runtime tunables */
-#define OPENIBNAL_IO_TIMEOUT      50              /* default comms timeout (seconds) */
+#define IBNAL_IO_TIMEOUT      50                /* default comms timeout (seconds) */
 
 /************************/
 /* derived constants... */
 
 /* TX messages (shared by all connections) */
-#define OPENIBNAL_TX_MSGS       (OPENIBNAL_NTX + OPENIBNAL_NTX_NBLK)
-#define OPENIBNAL_TX_MSG_BYTES  (OPENIBNAL_TX_MSGS * OPENIBNAL_MSG_SIZE)
-#define OPENIBNAL_TX_MSG_PAGES  ((OPENIBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
-
-/* we may have up to 2 completions per transmit */
-#define OPENIBNAL_TX_CQ_ENTRIES  (2*OPENIBNAL_TX_MSGS)
+#define IBNAL_TX_MSGS       (IBNAL_NTX + IBNAL_NTX_NBLK)
+#define IBNAL_TX_MSG_BYTES  (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_TX_MSG_PAGES  ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
 
 /* RX messages (per connection) */
-#define OPENIBNAL_RX_MSGS       OPENIBNAL_MSG_QUEUE_SIZE
-#define OPENIBNAL_RX_MSG_BYTES  (OPENIBNAL_RX_MSGS * OPENIBNAL_MSG_SIZE)
-#define OPENIBNAL_RX_MSG_PAGES  ((OPENIBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+#define IBNAL_RX_MSGS       IBNAL_MSG_QUEUE_SIZE
+#define IBNAL_RX_MSG_BYTES  (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_RX_MSG_PAGES  ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
 
-/* 1 completion per receive, per connection */
-#define OPENIBNAL_RX_CQ_ENTRIES (OPENIBNAL_RX_MSGS * OPENIBNAL_CONCURRENT_PEERS)
+/* we may have up to 2 completions per transmit +
+   1 completion per receive, per connection */
+#define IBNAL_CQ_ENTRIES  ((2*IBNAL_TX_MSGS) +                          \
+                           (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS))
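
Plugging in the defaults above gives the worst-case sizing for the single
merged CQ. A back-of-the-envelope check in plain C, mirroring the defines in
this header:

    #include <assert.h>

    #define IBNAL_MSG_QUEUE_SIZE   8
    #define IBNAL_CONCURRENT_PEERS 1000
    #define IBNAL_NTX              64
    #define IBNAL_NTX_NBLK         256
    #define IBNAL_TX_MSGS          (IBNAL_NTX + IBNAL_NTX_NBLK)
    #define IBNAL_RX_MSGS          IBNAL_MSG_QUEUE_SIZE
    #define IBNAL_CQ_ENTRIES       ((2*IBNAL_TX_MSGS) + \
                                    (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS))

    int main (void)
    {
            /* 2*320 tx completions + 8*1000 rx completions */
            assert (IBNAL_CQ_ENTRIES == 640 + 8000);
            return 0;
    }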
 
-#define OPENIBNAL_RDMA_BASE  0x0eeb0000
-#define OPENIBNAL_FMR        1
-#define OPENIBNAL_CKSUM      0
-//#define OPENIBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_PROCESS
-#define OPENIBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_INTERRUPT
+#define IBNAL_RDMA_BASE  0x0eeb0000
+#define IBNAL_FMR        1
+#define IBNAL_CKSUM      0
+//#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_PROCESS
+#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_INTERRUPT
 
 typedef struct 
 {
-        int               koib_io_timeout;      /* comms timeout (seconds) */
-        struct ctl_table_header *koib_sysctl;   /* sysctl interface */
-} koib_tunables_t;
+        int               kib_io_timeout;       /* comms timeout (seconds) */
+        struct ctl_table_header *kib_sysctl;    /* sysctl interface */
+} kib_tunables_t;
 
 typedef struct
 {
-        int               oibp_npages;          /* # pages */
-        int               oibp_mapped;          /* mapped? */
-        __u64             oibp_vaddr;           /* mapped region vaddr */
-        __u32             oibp_lkey;            /* mapped region lkey */
-        __u32             oibp_rkey;            /* mapped region rkey */
-        struct ib_mr     *oibp_handle;          /* mapped region handle */
-        struct page      *oibp_pages[0];
-} koib_pages_t;
+        int               ibp_npages;           /* # pages */
+        int               ibp_mapped;           /* mapped? */
+        __u64             ibp_vaddr;            /* mapped region vaddr */
+        __u32             ibp_lkey;             /* mapped region lkey */
+        __u32             ibp_rkey;             /* mapped region rkey */
+        struct ib_mr     *ibp_handle;           /* mapped region handle */
+        struct page      *ibp_pages[0];
+} kib_pages_t;
         
 typedef struct 
 {
-        int               koib_init;            /* initialisation state */
-        __u64             koib_incarnation;     /* which one am I */
-        int               koib_shutdown;        /* shut down? */
-        atomic_t          koib_nthreads;        /* # live threads */
-
-        __u64             koib_cm_service_id;   /* service number I listen on */
-        ptl_nid_t         koib_nid;             /* my NID */
-        struct semaphore  koib_nid_mutex;       /* serialise NID ops */
-        struct semaphore  koib_nid_signal;      /* signal completion */
-
-        rwlock_t          koib_global_lock;     /* stabilize peer/conn ops */
-
-        struct list_head *koib_peers;           /* hash table of all my known peers */
-        int               koib_peer_hash_size;  /* size of koib_peers */
-        atomic_t          koib_npeers;          /* # peers extant */
-        atomic_t          koib_nconns;          /* # connections extant */
-
-        struct list_head  koib_connd_conns;     /* connections to progress */
-        struct list_head  koib_connd_peers;     /* peers waiting for a connection */
-        wait_queue_head_t koib_connd_waitq;     /* connection daemons sleep here */
-        unsigned long     koib_connd_waketime;  /* when connd will wake */
-        spinlock_t        koib_connd_lock;      /* serialise */
-
-        wait_queue_head_t koib_sched_waitq;     /* schedulers sleep here */
-        struct list_head  koib_sched_txq;       /* tx requiring attention */
-        struct list_head  koib_sched_rxq;       /* rx requiring attention */
-        spinlock_t        koib_sched_lock;      /* serialise */
+        int               kib_init;             /* initialisation state */
+        __u64             kib_incarnation;      /* which one am I */
+        int               kib_shutdown;         /* shut down? */
+        atomic_t          kib_nthreads;         /* # live threads */
+
+        __u64             kib_service_id;       /* service number I listen on */
+        ptl_nid_t         kib_nid;              /* my NID */
+        struct semaphore  kib_nid_mutex;        /* serialise NID ops */
+        struct semaphore  kib_nid_signal;       /* signal completion */
+
+        rwlock_t          kib_global_lock;      /* stabilize peer/conn ops */
+
+        struct list_head *kib_peers;            /* hash table of all my known peers */
+        int               kib_peer_hash_size;   /* size of kib_peers */
+        atomic_t          kib_npeers;           /* # peers extant */
+        atomic_t          kib_nconns;           /* # connections extant */
+
+        struct list_head  kib_connd_conns;      /* connections to progress */
+        struct list_head  kib_connd_peers;      /* peers waiting for a connection */
+        wait_queue_head_t kib_connd_waitq;      /* connection daemons sleep here */
+        unsigned long     kib_connd_waketime;   /* when connd will wake */
+        spinlock_t        kib_connd_lock;       /* serialise */
+
+        wait_queue_head_t kib_sched_waitq;      /* schedulers sleep here */
+        struct list_head  kib_sched_txq;        /* tx requiring attention */
+        struct list_head  kib_sched_rxq;        /* rx requiring attention */
+        spinlock_t        kib_sched_lock;       /* serialise */
         
-        struct koib_tx   *koib_tx_descs;        /* all the tx descriptors */
-        koib_pages_t     *koib_tx_pages;        /* premapped tx msg pages */
-
-        struct list_head  koib_idle_txs;        /* idle tx descriptors */
-        struct list_head  koib_idle_nblk_txs;   /* idle reserved tx descriptors */
-        wait_queue_head_t koib_idle_tx_waitq;   /* block here for tx descriptor */
-        __u64             koib_next_tx_cookie;  /* RDMA completion cookie */
-        spinlock_t        koib_tx_lock;         /* serialise */
+        struct kib_tx    *kib_tx_descs;         /* all the tx descriptors */
+        kib_pages_t      *kib_tx_pages;         /* premapped tx msg pages */
+
+        struct list_head  kib_idle_txs;         /* idle tx descriptors */
+        struct list_head  kib_idle_nblk_txs;    /* idle reserved tx descriptors */
+        wait_queue_head_t kib_idle_tx_waitq;    /* block here for tx descriptor */
+        __u64             kib_next_tx_cookie;   /* RDMA completion cookie */
+        spinlock_t        kib_tx_lock;          /* serialise */
         
-        struct ib_device *koib_device;          /* "the" device */
-        struct ib_device_properties koib_device_props; /* its properties */
-        int               koib_port;            /* port on the device */
-        struct ib_port_properties koib_port_props; /* its properties */
-        struct ib_pd     *koib_pd;              /* protection domain */
-#if OPENIBNAL_FMR
-        struct ib_fmr_pool *koib_fmr_pool;      /* fast memory region pool */
+        struct ib_device *kib_device;           /* "the" device */
+        struct ib_device_properties kib_device_props; /* its properties */
+        int               kib_port;             /* port on the device */
+        struct ib_port_properties kib_port_props; /* its properties */
+        struct ib_pd     *kib_pd;               /* protection domain */
+#if IBNAL_FMR
+        struct ib_fmr_pool *kib_fmr_pool;       /* fast memory region pool */
 #endif
-        struct ib_cq     *koib_rx_cq;           /* receive completion queue */
-        struct ib_cq     *koib_tx_cq;           /* transmit completion queue */
-        void             *koib_listen_handle;   /* where I listen for connections */
-        struct ib_common_attrib_service koib_service; /* SM service */
+        struct ib_cq     *kib_cq;               /* completion queue */
+        void             *kib_listen_handle;    /* where I listen for connections */
         
-} koib_data_t;
-
-#define OPENIBNAL_INIT_NOTHING         0
-#define OPENIBNAL_INIT_DATA            1
-#define OPENIBNAL_INIT_LIB             2
-#define OPENIBNAL_INIT_PD              3
-#define OPENIBNAL_INIT_FMR             4
-#define OPENIBNAL_INIT_TXD             5
-#define OPENIBNAL_INIT_RX_CQ           6
-#define OPENIBNAL_INIT_TX_CQ           7
-#define OPENIBNAL_INIT_ALL             8
+} kib_data_t;
+
+#define IBNAL_INIT_NOTHING         0
+#define IBNAL_INIT_DATA            1
+#define IBNAL_INIT_LIB             2
+#define IBNAL_INIT_PD              3
+#define IBNAL_INIT_FMR             4
+#define IBNAL_INIT_TXD             5
+#define IBNAL_INIT_CQ              6
+#define IBNAL_INIT_ALL             7
 
 /************************************************************************
  * Wire message structs.
@@ -214,125 +210,125 @@ typedef struct
         __u32             md_lkey;
         __u32             md_rkey;
         __u64             md_addr;
-} koib_md_t;
+} kib_md_t;
 
 typedef struct
 {
         __u32                 rd_key;           /* remote key */
         __u32                 rd_nob;           /* # of bytes */
         __u64                 rd_addr;          /* remote io vaddr */
-} koib_rdma_desc_t;
+} kib_rdma_desc_t;
 
 
 typedef struct
 {
-        ptl_hdr_t         oibim_hdr;            /* portals header */
-        char              oibim_payload[0];     /* piggy-backed payload */
-} koib_immediate_msg_t;
+        ptl_hdr_t         ibim_hdr;             /* portals header */
+        char              ibim_payload[0];      /* piggy-backed payload */
+} kib_immediate_msg_t;
 
 typedef struct
 {
-        ptl_hdr_t         oibrm_hdr;            /* portals header */
-        __u64             oibrm_cookie;         /* opaque completion cookie */
-        koib_rdma_desc_t  oibrm_desc;           /* where to suck/blow */
-} koib_rdma_msg_t;
+        ptl_hdr_t         ibrm_hdr;             /* portals header */
+        __u64             ibrm_cookie;          /* opaque completion cookie */
+        kib_rdma_desc_t   ibrm_desc;            /* where to suck/blow */
+} kib_rdma_msg_t;
 
 typedef struct
 {
-        __u64             oibcm_cookie;         /* opaque completion cookie */
-        __u32             oibcm_status;         /* completion status */
-} koib_completion_msg_t;
+        __u64             ibcm_cookie;          /* opaque completion cookie */
+        __u32             ibcm_status;          /* completion status */
+} kib_completion_msg_t;
 
 typedef struct
 {
-        __u32              oibm_magic;          /* I'm an openibnal message */
-        __u16              oibm_version;        /* this is my version number */
-        __u8               oibm_type;           /* msg type */
-        __u8               oibm_credits;        /* returned credits */
-#if OPENIBNAL_CKSUM
-        __u32              oibm_nob;
-        __u32              oibm_cksum;
+        __u32              ibm_magic;           /* I'm an openibnal message */
+        __u16              ibm_version;         /* this is my version number */
+        __u8               ibm_type;            /* msg type */
+        __u8               ibm_credits;         /* returned credits */
+#if IBNAL_CKSUM
+        __u32              ibm_nob;
+        __u32              ibm_cksum;
 #endif
         union {
-                koib_immediate_msg_t   immediate;
-                koib_rdma_msg_t        rdma;
-                koib_completion_msg_t  completion;
-        }                    oibm_u;
-} koib_msg_t;
-
-#define OPENIBNAL_MSG_MAGIC       0x0be91b91    /* unique magic */
-#define OPENIBNAL_MSG_VERSION              1    /* current protocol version */
-
-#define OPENIBNAL_MSG_NOOP              0xd0    /* nothing (just credits) */
-#define OPENIBNAL_MSG_IMMEDIATE         0xd1    /* portals hdr + payload */
-#define OPENIBNAL_MSG_PUT_RDMA          0xd2    /* portals PUT hdr + source rdma desc */
-#define OPENIBNAL_MSG_PUT_DONE          0xd3    /* signal PUT rdma completion */
-#define OPENIBNAL_MSG_GET_RDMA          0xd4    /* portals GET hdr + sink rdma desc */
-#define OPENIBNAL_MSG_GET_DONE          0xd5    /* signal GET rdma completion */
+                kib_immediate_msg_t   immediate;
+                kib_rdma_msg_t        rdma;
+                kib_completion_msg_t  completion;
+        }                    ibm_u;
+} kib_msg_t;
+
+#define IBNAL_MSG_MAGIC       0x0be91b91        /* unique magic */
+#define IBNAL_MSG_VERSION              1        /* current protocol version */
+
+#define IBNAL_MSG_NOOP              0xd0        /* nothing (just credits) */
+#define IBNAL_MSG_IMMEDIATE         0xd1        /* portals hdr + payload */
+#define IBNAL_MSG_PUT_RDMA          0xd2        /* portals PUT hdr + source rdma desc */
+#define IBNAL_MSG_PUT_DONE          0xd3        /* signal PUT rdma completion */
+#define IBNAL_MSG_GET_RDMA          0xd4        /* portals GET hdr + sink rdma desc */
+#define IBNAL_MSG_GET_DONE          0xd5        /* signal GET rdma completion */
 
 /***********************************************************************/
 
-typedef struct koib_rx                          /* receive message */
+typedef struct kib_rx                           /* receive message */
 {
         struct list_head          rx_list;      /* queue for attention */
-        struct koib_conn         *rx_conn;      /* owning conn */
+        struct kib_conn          *rx_conn;      /* owning conn */
         int                       rx_rdma;      /* RDMA completion posted? */
         int                       rx_posted;    /* posted? */
         __u64                     rx_vaddr;     /* pre-mapped buffer (hca vaddr) */
-        koib_msg_t               *rx_msg;       /* pre-mapped buffer (host vaddr) */
+        kib_msg_t                *rx_msg;       /* pre-mapped buffer (host vaddr) */
         struct ib_receive_param   rx_sp;        /* receive work item */
         struct ib_gather_scatter  rx_gl;        /* and its memory */
-} koib_rx_t;
+} kib_rx_t;
 
-typedef struct koib_tx                          /* transmit message */
+typedef struct kib_tx                           /* transmit message */
 {
         struct list_head          tx_list;      /* queue on idle_txs, ibc_tx_queue etc. */
         int                       tx_isnblk;    /* I'm reserved for non-blocking sends */
-        struct koib_conn         *tx_conn;      /* owning conn */
+        struct kib_conn          *tx_conn;      /* owning conn */
         int                       tx_mapped;    /* mapped for RDMA? */
         int                       tx_sending;   /* # tx callbacks outstanding */
         int                       tx_status;    /* completion status */
-        int                       tx_passive_rdma; /* waiting for peer to RDMA? */
-        int                       tx_passive_rdma_wait; /* on ibc_rdma_queue */
-        unsigned long             tx_passive_rdma_deadline; /* completion deadline */
+        unsigned long             tx_deadline;  /* completion deadline */
+        int                       tx_passive_rdma; /* peer sucks/blows */
+        int                       tx_passive_rdma_wait; /* waiting for peer to complete */
         __u64                     tx_passive_rdma_cookie; /* completion cookie */
         lib_msg_t                *tx_libmsg[2]; /* lib msgs to finalize on completion */
-        koib_md_t                 tx_md;        /* RDMA mapping (active/passive) */
+        kib_md_t                  tx_md;        /* RDMA mapping (active/passive) */
         __u64                     tx_vaddr;     /* pre-mapped buffer (hca vaddr) */
-        koib_msg_t               *tx_msg;       /* pre-mapped buffer (host vaddr) */
+        kib_msg_t                *tx_msg;       /* pre-mapped buffer (host vaddr) */
         int                       tx_nsp;       /* # send work items */
         struct ib_send_param      tx_sp[2];     /* send work items... */
         struct ib_gather_scatter  tx_gl[2];     /* ...and their memory */
-} koib_tx_t;
+} kib_tx_t;
 
-#define KOIB_TX_UNMAPPED       0
-#define KOIB_TX_MAPPED         1
-#define KOIB_TX_MAPPED_FMR     2
+#define KIB_TX_UNMAPPED       0
+#define KIB_TX_MAPPED         1
+#define KIB_TX_MAPPED_FMR     2
 
-typedef struct koib_wire_connreq
+typedef struct kib_wire_connreq
 {
         __u32        wcr_magic;                 /* I'm an openibnal connreq */
         __u16        wcr_version;               /* this is my version number */
         __u16        wcr_queue_depth;           /* this is my receive queue size */
         __u64        wcr_nid;                   /* peer's NID */
         __u64        wcr_incarnation;           /* peer's incarnation */
-} koib_wire_connreq_t;
+} kib_wire_connreq_t;
 
-typedef struct koib_connreq
+typedef struct kib_connreq
 {
         /* connection-in-progress */
-        struct koib_conn                   *cr_conn;
-        koib_wire_connreq_t                 cr_wcr;
+        struct kib_conn                    *cr_conn;
+        kib_wire_connreq_t                  cr_wcr;
         __u64                               cr_tid;
         struct ib_common_attrib_service     cr_service;
         tTS_IB_GID                          cr_gid;
         struct ib_path_record               cr_path;
         struct ib_cm_active_param           cr_connparam;
-} koib_connreq_t;
+} kib_connreq_t;
 
-typedef struct koib_conn
+typedef struct kib_conn
 { 
-        struct koib_peer   *ibc_peer;           /* owning peer */
+        struct kib_peer    *ibc_peer;           /* owning peer */
         struct list_head    ibc_list;           /* stash on peer's conn list */
         __u64               ibc_incarnation;    /* which instance of the peer */
         atomic_t            ibc_refcount;       /* # users */
@@ -342,27 +338,27 @@ typedef struct koib_conn
         int                 ibc_credits;        /* # credits I have */
         int                 ibc_outstanding_credits; /* # credits to return */
         struct list_head    ibc_tx_queue;       /* send queue */
-        struct list_head    ibc_rdma_queue;     /* tx awaiting RDMA completion */
+        struct list_head    ibc_active_txs;     /* active tx awaiting completion */
         spinlock_t          ibc_lock;           /* serialise */
-        koib_rx_t          *ibc_rxs;            /* the rx descs */
-        koib_pages_t       *ibc_rx_pages;       /* premapped rx msg pages */
+        kib_rx_t           *ibc_rxs;            /* the rx descs */
+        kib_pages_t        *ibc_rx_pages;       /* premapped rx msg pages */
         struct ib_qp       *ibc_qp;             /* queue pair */
         __u32               ibc_qpn;            /* queue pair number */
         tTS_IB_CM_COMM_ID   ibc_comm_id;        /* connection ID? */
-        koib_connreq_t     *ibc_connreq;        /* connection request state */
-} koib_conn_t;
+        kib_connreq_t      *ibc_connreq;        /* connection request state */
+} kib_conn_t;
 
-#define OPENIBNAL_CONN_INIT_NOTHING      0      /* initial state */
-#define OPENIBNAL_CONN_INIT_QP           1      /* ibc_qp set up */
-#define OPENIBNAL_CONN_CONNECTING        2      /* started to connect */
-#define OPENIBNAL_CONN_ESTABLISHED       3      /* connection established */
-#define OPENIBNAL_CONN_DEATHROW          4      /* waiting to be closed */
-#define OPENIBNAL_CONN_ZOMBIE            5      /* waiting to be freed */
+#define IBNAL_CONN_INIT_NOTHING      0          /* initial state */
+#define IBNAL_CONN_INIT_QP           1          /* ibc_qp set up */
+#define IBNAL_CONN_CONNECTING        2          /* started to connect */
+#define IBNAL_CONN_ESTABLISHED       3          /* connection established */
+#define IBNAL_CONN_DEATHROW          4          /* waiting to be closed */
+#define IBNAL_CONN_ZOMBIE            5          /* waiting to be freed */
 
-typedef struct koib_peer
+typedef struct kib_peer
 {
         struct list_head    ibp_list;           /* stash on global peer list */
-        struct list_head    ibp_connd_list;     /* schedule on koib_connd_peers */
+        struct list_head    ibp_connd_list;     /* schedule on kib_connd_peers */
         ptl_nid_t           ibp_nid;            /* who's on the other end(s) */
         atomic_t            ibp_refcount;       /* # users */
         int                 ibp_persistence;    /* "known" peer refs */
@@ -371,30 +367,30 @@ typedef struct koib_peer
         int                 ibp_connecting;     /* connecting+accepting */
         unsigned long       ibp_reconnect_time; /* when reconnect may be attempted */
         unsigned long       ibp_reconnect_interval; /* exponential backoff */
-} koib_peer_t;
+} kib_peer_t;
 
 
-extern lib_nal_t        koibnal_lib;
-extern koib_data_t      koibnal_data;
-extern koib_tunables_t  koibnal_tunables;
+extern lib_nal_t       kibnal_lib;
+extern kib_data_t      kibnal_data;
+extern kib_tunables_t  kibnal_tunables;
 
 static inline struct list_head *
-koibnal_nid2peerlist (ptl_nid_t nid) 
+kibnal_nid2peerlist (ptl_nid_t nid) 
 {
-        unsigned int hash = ((unsigned int)nid) % koibnal_data.koib_peer_hash_size;
+        unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size;
         
-        return (&koibnal_data.koib_peers [hash]);
+        return (&kibnal_data.kib_peers [hash]);
 }
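
Note the (unsigned int) cast: only the low 32 bits of the NID feed the
modulo, and the 101-bucket table size from IBNAL_PEER_HASH_SIZE is prime,
which avoids clustering when NIDs share a common stride. A trivial userland
sketch of the same bucket selection (the example NID is made up):

    #include <stdio.h>
    #include <stdint.h>

    #define IBNAL_PEER_HASH_SIZE 101     /* # peer lists, from this header */

    int main (void)
    {
            uint64_t nid = 0x100000badULL;       /* hypothetical NID */
            unsigned int hash = ((unsigned int)nid) % IBNAL_PEER_HASH_SIZE;

            printf ("nid %#llx -> bucket %u\n",
                    (unsigned long long)nid, hash);
            return 0;
    }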
 
 static inline int
-koibnal_peer_active(koib_peer_t *peer)
+kibnal_peer_active(kib_peer_t *peer)
 {
         /* Am I in the peer hash table? */
         return (!list_empty(&peer->ibp_list));
 }
 
 static inline void
-koibnal_queue_tx_locked (koib_tx_t *tx, koib_conn_t *conn)
+kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
 {
         /* CAVEAT EMPTOR: tx takes caller's ref on conn */
 
@@ -402,40 +398,41 @@ koibnal_queue_tx_locked (koib_tx_t *tx, koib_conn_t *conn)
         LASSERT (tx->tx_conn == NULL);          /* only set here */
 
         tx->tx_conn = conn;
+        tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
         list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
 }
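
The tx_deadline set above is jiffies-based: with the default
IBNAL_IO_TIMEOUT of 50 seconds, a tx queued now must complete within 50*HZ
ticks. A reaper would then test it with the kernel's wraparound-safe jiffies
helper; the function below is a hypothetical illustration, not part of this
patch:

    /* hypothetical helper, assuming the declarations in this header;
     * time_after() handles jiffies wraparound correctly */
    static inline int
    kibnal_tx_timed_out (kib_tx_t *tx)
    {
            return time_after (jiffies, tx->tx_deadline);
    }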
 
-#define KOIBNAL_SERVICE_KEY_MASK  (IB_SA_SERVICE_COMP_MASK_NAME |       \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_1 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_2 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_3 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_4 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_5 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_6 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_7 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_8)
+#define KIBNAL_SERVICE_KEY_MASK  (IB_SA_SERVICE_COMP_MASK_NAME |        \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_1 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_2 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_3 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_4 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_5 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_6 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_7 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_8)
 
 static inline __u64*
-koibnal_service_nid_field(struct ib_common_attrib_service *srv)
+kibnal_service_nid_field(struct ib_common_attrib_service *srv)
 {
-        /* must be consistent with KOIBNAL_SERVICE_KEY_MASK */
+        /* must be consistent with KIBNAL_SERVICE_KEY_MASK */
         return (__u64 *)srv->service_data8;
 }
 
 
 static inline void
-koibnal_set_service_keys(struct ib_common_attrib_service *srv, ptl_nid_t nid)
+kibnal_set_service_keys(struct ib_common_attrib_service *srv, ptl_nid_t nid)
 {
-        LASSERT (strlen (OPENIBNAL_SERVICE_NAME) < sizeof(srv->service_name));
+        LASSERT (strlen (IBNAL_SERVICE_NAME) < sizeof(srv->service_name));
         memset (srv->service_name, 0, sizeof(srv->service_name));
-        strcpy (srv->service_name, OPENIBNAL_SERVICE_NAME);
+        strcpy (srv->service_name, IBNAL_SERVICE_NAME);
 
-        *koibnal_service_nid_field(srv) = cpu_to_le64(nid);
+        *kibnal_service_nid_field(srv) = cpu_to_le64(nid);
 }
 
 #if 0
 static inline void
-koibnal_show_rdma_attr (koib_conn_t *conn)
+kibnal_show_rdma_attr (kib_conn_t *conn)
 {
         struct ib_qp_attribute qp_attr;
         int                    rc;
@@ -457,7 +454,7 @@ koibnal_show_rdma_attr (koib_conn_t *conn)
 
 #if CONFIG_X86
 static inline __u64
-koibnal_page2phys (struct page *p)
+kibnal_page2phys (struct page *p)
 {
         __u64 page_number = p - mem_map;
         
@@ -467,42 +464,69 @@ koibnal_page2phys (struct page *p)
 # error "no page->phys"
 #endif
 
-extern koib_peer_t *koibnal_create_peer (ptl_nid_t nid);
-extern void koibnal_put_peer (koib_peer_t *peer);
-extern int koibnal_del_peer (ptl_nid_t nid, int single_share);
-extern koib_peer_t *koibnal_find_peer_locked (ptl_nid_t nid);
-extern void koibnal_unlink_peer_locked (koib_peer_t *peer);
-extern int  koibnal_close_stale_conns_locked (koib_peer_t *peer, 
+/* CAVEAT EMPTOR:
+ * We rely on tx/rx descriptor alignment to allow us to use the lowest bit
+ * of the work request id as a flag to determine if the completion is for a
+ * transmit or a receive.  It seems that the CQ entry's 'op' field
+ * isn't always set correctly on completions that occur after QP teardown. */
+
+static inline __u64
+kibnal_ptr2wreqid (void *ptr, int isrx)
+{
+        unsigned long lptr = (unsigned long)ptr;
+
+        LASSERT ((lptr & 1) == 0);
+        return (__u64)(lptr | (isrx ? 1 : 0));
+}
+
+static inline void *
+kibnal_wreqid2ptr (__u64 wreqid)
+{
+        return (void *)(((unsigned long)wreqid) & ~1UL);
+}
+
+static inline int
+kibnal_wreqid_is_rx (__u64 wreqid)
+{
+        return (wreqid & 1) != 0;
+}
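
A quick userland round trip of the tagging scheme above, with LASSERT
swapped for assert and an aligned local standing in for a descriptor:

    #include <assert.h>
    #include <stdint.h>

    typedef uint64_t __u64;

    static inline __u64
    kibnal_ptr2wreqid (void *ptr, int isrx)
    {
            unsigned long lptr = (unsigned long)ptr;

            assert ((lptr & 1) == 0);        /* descriptors are aligned */
            return (__u64)(lptr | (isrx ? 1 : 0));
    }

    static inline void *
    kibnal_wreqid2ptr (__u64 wreqid)
    {
            return (void *)(((unsigned long)wreqid) & ~1UL);
    }

    int main (void)
    {
            long desc;                       /* any aligned object will do */
            __u64 id = kibnal_ptr2wreqid (&desc, 1);

            assert ((id & 1) != 0);          /* flagged as an rx completion */
            assert (kibnal_wreqid2ptr (id) == (void *)&desc);
            return 0;
    }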
+
+extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid);
+extern void kibnal_put_peer (kib_peer_t *peer);
+extern int kibnal_del_peer (ptl_nid_t nid, int single_share);
+extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid);
+extern void kibnal_unlink_peer_locked (kib_peer_t *peer);
+extern int  kibnal_close_stale_conns_locked (kib_peer_t *peer, 
                                               __u64 incarnation);
-extern koib_conn_t *koibnal_create_conn (void);
-extern void koibnal_put_conn (koib_conn_t *conn);
-extern void koibnal_destroy_conn (koib_conn_t *conn);
-extern int koibnal_alloc_pages (koib_pages_t **pp, int npages, int access);
-extern void koibnal_free_pages (koib_pages_t *p);
+extern kib_conn_t *kibnal_create_conn (void);
+extern void kibnal_put_conn (kib_conn_t *conn);
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access);
+extern void kibnal_free_pages (kib_pages_t *p);
 
-extern void koibnal_check_sends (koib_conn_t *conn);
+extern void kibnal_check_sends (kib_conn_t *conn);
 
 extern tTS_IB_CM_CALLBACK_RETURN
-koibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid,
+kibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid,
                        void *param, void *arg);
 extern tTS_IB_CM_CALLBACK_RETURN 
-koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid,
+kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid,
                                void *param, void *arg);
 
-extern void koibnal_close_conn_locked (koib_conn_t *conn, int error);
-extern void koibnal_destroy_conn (koib_conn_t *conn);
-extern int  koibnal_thread_start (int (*fn)(void *arg), void *arg);
-extern int  koibnal_scheduler(void *arg);
-extern int  koibnal_connd (void *arg);
-extern void koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg);
-extern void koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg);
-extern void koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob);
-extern int  koibnal_close_conn (koib_conn_t *conn, int why);
-extern void koibnal_start_active_rdma (int type, int status,
-                                       koib_rx_t *rx, lib_msg_t *libmsg,
-                                       unsigned int niov, 
-                                       struct iovec *iov, ptl_kiov_t *kiov,
-                                       size_t offset, size_t nob);
+extern void kibnal_close_conn_locked (kib_conn_t *conn, int error);
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+extern int  kibnal_thread_start (int (*fn)(void *arg), void *arg);
+extern int  kibnal_scheduler(void *arg);
+extern int  kibnal_connd (void *arg);
+extern void kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg);
+extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob);
+extern int  kibnal_close_conn (kib_conn_t *conn, int why);
+extern void kibnal_start_active_rdma (int type, int status, 
+                                      kib_rx_t *rx, lib_msg_t *libmsg,
+                                      unsigned int niov,
+                                      struct iovec *iov, ptl_kiov_t *kiov,
+                                      size_t offset, size_t nob);
+
 
 
 
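
The helpers added above stash a transmit/receive flag in bit 0 of the 64-bit work request id, relying on the natural alignment of the tx/rx descriptors noted in the CAVEAT EMPTOR comment. A minimal standalone sketch of the same round-trip (userspace C; all names here are illustrative, not the driver code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Bit-0 tagging as in kibnal_ptr2wreqid()/kibnal_wreqid2ptr(): any
 * naturally aligned descriptor has a zero low bit, so that bit can
 * carry the rx/tx flag through the completion queue. */
static uint64_t ptr2wreqid(void *ptr, int isrx)
{
        uintptr_t lptr = (uintptr_t)ptr;

        assert((lptr & 1) == 0);        /* alignment assumption */
        return (uint64_t)(lptr | (isrx ? 1 : 0));
}

static void *wreqid2ptr(uint64_t wreqid)
{
        return (void *)((uintptr_t)wreqid & ~(uintptr_t)1);
}

int main(void)
{
        int desc;                       /* stands in for a kib_rx_t */
        uint64_t id = ptr2wreqid(&desc, 1);

        printf("is_rx=%d ptr_ok=%d\n", (int)(id & 1),
               wreqid2ptr(id) == (void *)&desc);
        return 0;
}
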
index 79bf37a..d774853 100644
  *
  */
 void
-koibnal_schedule_tx_done (koib_tx_t *tx)
+kibnal_schedule_tx_done (kib_tx_t *tx)
 {
         unsigned long flags;
 
-        spin_lock_irqsave (&koibnal_data.koib_sched_lock, flags);
+        spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);
 
-        list_add_tail(&tx->tx_list, &koibnal_data.koib_sched_txq);
-        wake_up (&koibnal_data.koib_sched_waitq);
+        list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
+        wake_up (&kibnal_data.kib_sched_waitq);
 
-        spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags);
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
 }
 
 void
-koibnal_tx_done (koib_tx_t *tx)
+kibnal_tx_done (kib_tx_t *tx)
 {
         ptl_err_t        ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
         unsigned long    flags;
@@ -49,31 +49,31 @@ koibnal_tx_done (koib_tx_t *tx)
         int              rc;
 
         LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting callback */
-        LASSERT (!tx->tx_passive_rdma_wait);    /* mustn't be on ibc_rdma_queue */
+        LASSERT (!tx->tx_passive_rdma_wait);    /* mustn't be awaiting RDMA */
 
         switch (tx->tx_mapped) {
         default:
                 LBUG();
 
-        case KOIB_TX_UNMAPPED:
+        case KIB_TX_UNMAPPED:
                 break;
                 
-        case KOIB_TX_MAPPED:
+        case KIB_TX_MAPPED:
                 if (in_interrupt()) {
                         /* can't deregister memory in IRQ context... */
-                        koibnal_schedule_tx_done(tx);
+                        kibnal_schedule_tx_done(tx);
                         return;
                 }
                 rc = ib_memory_deregister(tx->tx_md.md_handle.mr);
                 LASSERT (rc == 0);
-                tx->tx_mapped = KOIB_TX_UNMAPPED;
+                tx->tx_mapped = KIB_TX_UNMAPPED;
                 break;
 
-#if OPENIBNAL_FMR
-        case KOIB_TX_MAPPED_FMR:
+#if IBNAL_FMR
+        case KIB_TX_MAPPED_FMR:
                 if (in_interrupt() && tx->tx_status != 0) {
                         /* can't flush FMRs in IRQ context... */
-                        koibnal_schedule_tx_done(tx);
+                        kibnal_schedule_tx_done(tx);
                         return;
                 }              
 
@@ -81,8 +81,8 @@ koibnal_tx_done (koib_tx_t *tx)
                 LASSERT (rc == 0);
 
                 if (tx->tx_status != 0)
-                        ib_fmr_pool_force_flush(koibnal_data.koib_fmr_pool);
-                tx->tx_mapped = KOIB_TX_UNMAPPED;
+                        ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
+                tx->tx_mapped = KIB_TX_UNMAPPED;
                 break;
 #endif
         }
@@ -92,12 +92,12 @@ koibnal_tx_done (koib_tx_t *tx)
                 if (tx->tx_libmsg[i] == NULL)
                         continue;
 
-                lib_finalize (&koibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
+                lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
                 tx->tx_libmsg[i] = NULL;
         }
         
         if (tx->tx_conn != NULL) {
-                koibnal_put_conn (tx->tx_conn);
+                kibnal_put_conn (tx->tx_conn);
                 tx->tx_conn = NULL;
         }
 
@@ -105,52 +105,52 @@ koibnal_tx_done (koib_tx_t *tx)
         tx->tx_passive_rdma = 0;
         tx->tx_status = 0;
 
-        spin_lock_irqsave (&koibnal_data.koib_tx_lock, flags);
+        spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
 
         if (tx->tx_isnblk) {
-                list_add_tail (&tx->tx_list, &koibnal_data.koib_idle_nblk_txs);
+                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
         } else {
-                list_add_tail (&tx->tx_list, &koibnal_data.koib_idle_txs);
-                wake_up (&koibnal_data.koib_idle_tx_waitq);
+                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
+                wake_up (&kibnal_data.kib_idle_tx_waitq);
         }
 
-        spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags);
+        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
 }
 
-koib_tx_t *
-koibnal_get_idle_tx (int may_block) 
+kib_tx_t *
+kibnal_get_idle_tx (int may_block) 
 {
-        unsigned long    flags;
-        koib_tx_t    *tx = NULL;
+        unsigned long  flags;
+        kib_tx_t      *tx = NULL;
         
         for (;;) {
-                spin_lock_irqsave (&koibnal_data.koib_tx_lock, flags);
+                spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
 
                 /* "normal" descriptor is free */
-                if (!list_empty (&koibnal_data.koib_idle_txs)) {
-                        tx = list_entry (koibnal_data.koib_idle_txs.next,
-                                         koib_tx_t, tx_list);
+                if (!list_empty (&kibnal_data.kib_idle_txs)) {
+                        tx = list_entry (kibnal_data.kib_idle_txs.next,
+                                         kib_tx_t, tx_list);
                         break;
                 }
 
                 if (!may_block) {
                         /* may dip into reserve pool */
-                        if (list_empty (&koibnal_data.koib_idle_nblk_txs)) {
+                        if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
                                 CERROR ("reserved tx desc pool exhausted\n");
                                 break;
                         }
 
-                        tx = list_entry (koibnal_data.koib_idle_nblk_txs.next,
-                                         koib_tx_t, tx_list);
+                        tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
+                                         kib_tx_t, tx_list);
                         break;
                 }
 
                 /* block for idle tx */
-                spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags);
+                spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
 
-                wait_event (koibnal_data.koib_idle_tx_waitq,
-                            !list_empty (&koibnal_data.koib_idle_txs) ||
-                            koibnal_data.koib_shutdown);
+                wait_event (kibnal_data.kib_idle_tx_waitq,
+                            !list_empty (&kibnal_data.kib_idle_txs) ||
+                            kibnal_data.kib_shutdown);
         }
 
         if (tx != NULL) {
@@ -159,9 +159,9 @@ koibnal_get_idle_tx (int may_block)
                 /* Allocate a new passive RDMA completion cookie.  It might
                  * not be needed, but we've got a lock right now and we're
                  * unlikely to wrap... */
-                tx->tx_passive_rdma_cookie = koibnal_data.koib_next_tx_cookie++;
+                tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
 
-                LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED);
+                LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
                 LASSERT (tx->tx_nsp == 0);
                 LASSERT (tx->tx_sending == 0);
                 LASSERT (tx->tx_status == 0);
@@ -172,15 +172,15 @@ koibnal_get_idle_tx (int may_block)
                 LASSERT (tx->tx_libmsg[1] == NULL);
         }
 
-        spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags);
+        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
         
         return (tx);
 }
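
kibnal_get_idle_tx() above keeps two free lists: blocking callers wait on the normal pool, while callers that must not sleep may dip into a small reserve. A cut-down sketch of that policy (hypothetical types; no locking or wait queue):

#include <stddef.h>

struct tx  { struct tx *next; };
struct txq { struct tx *head; };  /* models kib_idle_txs / kib_idle_nblk_txs */

static struct tx *pop(struct txq *q)
{
        struct tx *t = q->head;

        if (t != NULL)
                q->head = t->next;
        return t;
}

/* Non-blocking callers fall back to the reserve pool; blocking callers
 * would instead sleep until the normal pool is refilled. */
static struct tx *get_idle_tx(struct txq *normal, struct txq *reserve,
                              int may_block)
{
        struct tx *t = pop(normal);

        if (t != NULL || may_block)
                return t;               /* real code waits here on NULL */
        return pop(reserve);
}
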
 
 int
-koibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
+kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
 {
-        /* I would guess that if koibnal_get_peer (nid) == NULL,
+        /* I would guess that if kibnal_get_peer (nid) == NULL,
            and we're not routing, then 'nid' is very distant :) */
         if ( nal->libnal_ni.ni_pid.nid == nid ) {
                 *dist = 0;
@@ -192,7 +192,7 @@ koibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
 }
 
 void
-koibnal_complete_passive_rdma(koib_conn_t *conn, __u64 cookie, int status)
+kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
 {
         struct list_head *ttmp;
         unsigned long     flags;
@@ -200,30 +200,34 @@ koibnal_complete_passive_rdma(koib_conn_t *conn, __u64 cookie, int status)
 
         spin_lock_irqsave (&conn->ibc_lock, flags);
 
-        list_for_each (ttmp, &conn->ibc_rdma_queue) {
-                koib_tx_t *tx = list_entry(ttmp, koib_tx_t, tx_list);
-                
-                LASSERT (tx->tx_passive_rdma);
-                LASSERT (tx->tx_passive_rdma_wait);
+        list_for_each (ttmp, &conn->ibc_active_txs) {
+                kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);
 
-                if (tx->tx_passive_rdma_cookie != cookie)
-                        continue;
+                LASSERT (tx->tx_passive_rdma ||
+                         !tx->tx_passive_rdma_wait);
 
-                CDEBUG(D_NET, "Complete %p "LPD64"\n", tx, cookie);
+                LASSERT (tx->tx_passive_rdma_wait ||
+                         tx->tx_sending != 0);
 
-                list_del (&tx->tx_list);
+                if (!tx->tx_passive_rdma_wait ||
+                    tx->tx_passive_rdma_cookie != cookie)
+                        continue;
+
+                CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);
 
+                tx->tx_status = status;
                 tx->tx_passive_rdma_wait = 0;
                 idle = (tx->tx_sending == 0);
 
-                tx->tx_status = status;
+                if (idle)
+                        list_del (&tx->tx_list);
 
                 spin_unlock_irqrestore (&conn->ibc_lock, flags);
 
                 /* I could be racing with tx callbacks.  It's whoever
                  * _makes_ tx idle that frees it */
                 if (idle)
-                        koibnal_tx_done (tx);
+                        kibnal_tx_done (tx);
                 return;
         }
                 
@@ -234,32 +238,32 @@ koibnal_complete_passive_rdma(koib_conn_t *conn, __u64 cookie, int status)
 }
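
Note the ownership rule in the comment above: both the RDMA completion path and the send completion callback can observe a tx under conn->ibc_lock, and whichever of them makes it fully idle (no sends in flight, no RDMA awaited) unlinks and frees it, so the tx needs no reference count of its own. A userspace sketch of the idiom (illustrative names):

#include <pthread.h>
#include <stdlib.h>

struct tx {
        pthread_mutex_t *lock;      /* models conn->ibc_lock */
        int              sending;   /* send completions still expected */
        int              rdma_wait; /* RDMA completion still expected */
};

/* Clear my condition under the lock; free only if I made it idle. */
static void complete_rdma(struct tx *tx)
{
        int idle;

        pthread_mutex_lock(tx->lock);
        tx->rdma_wait = 0;
        idle = (tx->sending == 0);
        pthread_mutex_unlock(tx->lock);

        if (idle)
                free(tx);           /* whoever makes it idle frees it */
}
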
 
 void
-koibnal_post_rx (koib_rx_t *rx, int do_credits)
+kibnal_post_rx (kib_rx_t *rx, int do_credits)
 {
-        koib_conn_t  *conn = rx->rx_conn;
+        kib_conn_t   *conn = rx->rx_conn;
         int           rc;
         unsigned long flags;
 
         rx->rx_gl = (struct ib_gather_scatter) {
                 .address = rx->rx_vaddr,
-                .length  = OPENIBNAL_MSG_SIZE,
-                .key     = conn->ibc_rx_pages->oibp_lkey,
+                .length  = IBNAL_MSG_SIZE,
+                .key     = conn->ibc_rx_pages->ibp_lkey,
         };
-        
+
         rx->rx_sp = (struct ib_receive_param) {
-                .work_request_id        = (__u64)(unsigned long)rx,
+                .work_request_id        = kibnal_ptr2wreqid(rx, 1),
                 .scatter_list           = &rx->rx_gl,
                 .num_scatter_entries    = 1,
                 .device_specific        = NULL,
                 .signaled               = 1,
         };
 
-        LASSERT (conn->ibc_state >= OPENIBNAL_CONN_ESTABLISHED);
+        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
         LASSERT (!rx->rx_posted);
         rx->rx_posted = 1;
         mb();
 
-        if (conn->ibc_state != OPENIBNAL_CONN_ESTABLISHED)
+        if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
                 rc = -ECONNABORTED;
         else
                 rc = ib_receive (conn->ibc_qp, &rx->rx_sp, 1);
@@ -270,26 +274,26 @@ koibnal_post_rx (koib_rx_t *rx, int do_credits)
                         conn->ibc_outstanding_credits++;
                         spin_unlock_irqrestore(&conn->ibc_lock, flags);
 
-                        koibnal_check_sends(conn);
+                        kibnal_check_sends(conn);
                 }
                 return;
         }
 
-        if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) {
+        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
                 CERROR ("Error posting receive -> "LPX64": %d\n",
                         conn->ibc_peer->ibp_nid, rc);
-                koibnal_close_conn (rx->rx_conn, rc);
+                kibnal_close_conn (rx->rx_conn, rc);
         } else {
                 CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n",
                         conn->ibc_peer->ibp_nid, rc);
         }
 
         /* Drop rx's ref */
-        koibnal_put_conn (conn);
+        kibnal_put_conn (conn);
 }
 
-#if OPENIBNAL_CKSUM
-__u32 koibnal_cksum (void *ptr, int nob)
+#if IBNAL_CKSUM
+__u32 kibnal_cksum (void *ptr, int nob)
 {
         char  *c  = ptr;
         __u32  sum = 0;
@@ -302,17 +306,17 @@ __u32 koibnal_cksum (void *ptr, int nob)
 #endif
 
 void
-koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
+kibnal_rx_callback (struct ib_cq_entry *e)
 {
-        koib_rx_t    *rx = (koib_rx_t *)((unsigned long)e->work_request_id);
-        koib_msg_t   *msg = rx->rx_msg;
-        koib_conn_t  *conn = rx->rx_conn;
+        kib_rx_t     *rx = (kib_rx_t *)kibnal_wreqid2ptr(e->work_request_id);
+        kib_msg_t    *msg = rx->rx_msg;
+        kib_conn_t   *conn = rx->rx_conn;
         int           nob = e->bytes_transferred;
-        const int     base_nob = offsetof(koib_msg_t, oibm_u);
+        const int     base_nob = offsetof(kib_msg_t, ibm_u);
         int           credits;
         int           flipped;
         unsigned long flags;
-#if OPENIBNAL_CKSUM
+#if IBNAL_CKSUM
         __u32         msg_cksum;
         __u32         computed_cksum;
 #endif
@@ -324,11 +328,11 @@ koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
 
         /* receives complete with error in any case after we've started
          * closing the QP */
-        if (conn->ibc_state >= OPENIBNAL_CONN_DEATHROW)
+        if (conn->ibc_state >= IBNAL_CONN_DEATHROW)
                 goto failed;
 
         /* We don't post receives until the conn is established */
-        LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED);
+        LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
 
         if (e->status != IB_COMPLETION_STATUS_SUCCESS) {
                 CERROR("Rx from "LPX64" failed: %d\n", 
@@ -344,35 +348,35 @@ koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
 
         /* Receiver does any byte flipping if necessary... */
 
-        if (msg->oibm_magic == OPENIBNAL_MSG_MAGIC) {
+        if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
                 flipped = 0;
         } else {
-                if (msg->oibm_magic != __swab32(OPENIBNAL_MSG_MAGIC)) {
+                if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
                         CERROR ("Unrecognised magic: %08x from "LPX64"\n", 
-                                msg->oibm_magic, conn->ibc_peer->ibp_nid);
+                                msg->ibm_magic, conn->ibc_peer->ibp_nid);
                         goto failed;
                 }
                 flipped = 1;
-                __swab16s (&msg->oibm_version);
-                LASSERT (sizeof(msg->oibm_type) == 1);
-                LASSERT (sizeof(msg->oibm_credits) == 1);
+                __swab16s (&msg->ibm_version);
+                LASSERT (sizeof(msg->ibm_type) == 1);
+                LASSERT (sizeof(msg->ibm_credits) == 1);
         }
 
-        if (msg->oibm_version != OPENIBNAL_MSG_VERSION) {
+        if (msg->ibm_version != IBNAL_MSG_VERSION) {
                 CERROR ("Incompatible msg version %d (%d expected)\n",
-                        msg->oibm_version, OPENIBNAL_MSG_VERSION);
+                        msg->ibm_version, IBNAL_MSG_VERSION);
                 goto failed;
         }
 
-#if OPENIBNAL_CKSUM
-        if (nob != msg->oibm_nob) {
-                CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->oibm_nob);
+#if IBNAL_CKSUM
+        if (nob != msg->ibm_nob) {
+                CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob);
                 goto failed;
         }
 
-        msg_cksum = le32_to_cpu(msg->oibm_cksum);
-        msg->oibm_cksum = 0;
-        computed_cksum = koibnal_cksum (msg, nob);
+        msg_cksum = le32_to_cpu(msg->ibm_cksum);
+        msg->ibm_cksum = 0;
+        computed_cksum = kibnal_cksum (msg, nob);
         
         if (msg_cksum != computed_cksum) {
                 CERROR ("Checksum failure %d: (%d expected)\n",
@@ -383,101 +387,101 @@ koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
 #endif
 
         /* Have I received credits that will let me send? */
-        credits = msg->oibm_credits;
+        credits = msg->ibm_credits;
         if (credits != 0) {
                 spin_lock_irqsave(&conn->ibc_lock, flags);
                 conn->ibc_credits += credits;
                 spin_unlock_irqrestore(&conn->ibc_lock, flags);
                 
-                koibnal_check_sends(conn);
+                kibnal_check_sends(conn);
         }
 
-        switch (msg->oibm_type) {
-        case OPENIBNAL_MSG_NOOP:
-                koibnal_post_rx (rx, 1);
+        switch (msg->ibm_type) {
+        case IBNAL_MSG_NOOP:
+                kibnal_post_rx (rx, 1);
                 return;
 
-        case OPENIBNAL_MSG_IMMEDIATE:
-                if (nob < base_nob + sizeof (koib_immediate_msg_t)) {
+        case IBNAL_MSG_IMMEDIATE:
+                if (nob < base_nob + sizeof (kib_immediate_msg_t)) {
                         CERROR ("Short IMMEDIATE from "LPX64": %d\n",
                                 conn->ibc_peer->ibp_nid, nob);
                         goto failed;
                 }
                 break;
                 
-        case OPENIBNAL_MSG_PUT_RDMA:
-        case OPENIBNAL_MSG_GET_RDMA:
-                if (nob < base_nob + sizeof (koib_rdma_msg_t)) {
+        case IBNAL_MSG_PUT_RDMA:
+        case IBNAL_MSG_GET_RDMA:
+                if (nob < base_nob + sizeof (kib_rdma_msg_t)) {
                         CERROR ("Short RDMA msg from "LPX64": %d\n",
                                 conn->ibc_peer->ibp_nid, nob);
                         goto failed;
                 }
                 if (flipped) {
-                        __swab32s(&msg->oibm_u.rdma.oibrm_desc.rd_key);
-                        __swab32s(&msg->oibm_u.rdma.oibrm_desc.rd_nob);
-                        __swab64s(&msg->oibm_u.rdma.oibrm_desc.rd_addr);
+                        __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_key);
+                        __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_nob);
+                        __swab64s(&msg->ibm_u.rdma.ibrm_desc.rd_addr);
                 }
                 CDEBUG(D_NET, "%d RDMA: cookie "LPX64", key %x, addr "LPX64", nob %d\n",
-                       msg->oibm_type, msg->oibm_u.rdma.oibrm_cookie,
-                       msg->oibm_u.rdma.oibrm_desc.rd_key,
-                       msg->oibm_u.rdma.oibrm_desc.rd_addr,
-                       msg->oibm_u.rdma.oibrm_desc.rd_nob);
+                       msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie,
+                       msg->ibm_u.rdma.ibrm_desc.rd_key,
+                       msg->ibm_u.rdma.ibrm_desc.rd_addr,
+                       msg->ibm_u.rdma.ibrm_desc.rd_nob);
                 break;
                 
-        case OPENIBNAL_MSG_PUT_DONE:
-        case OPENIBNAL_MSG_GET_DONE:
-                if (nob < base_nob + sizeof (koib_completion_msg_t)) {
+        case IBNAL_MSG_PUT_DONE:
+        case IBNAL_MSG_GET_DONE:
+                if (nob < base_nob + sizeof (kib_completion_msg_t)) {
                         CERROR ("Short COMPLETION msg from "LPX64": %d\n",
                                 conn->ibc_peer->ibp_nid, nob);
                         goto failed;
                 }
                 if (flipped)
-                        __swab32s(&msg->oibm_u.completion.oibcm_status);
+                        __swab32s(&msg->ibm_u.completion.ibcm_status);
                 
                 CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
-                       msg->oibm_type, msg->oibm_u.completion.oibcm_cookie,
-                       msg->oibm_u.completion.oibcm_status);
+                       msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
+                       msg->ibm_u.completion.ibcm_status);
 
-                koibnal_complete_passive_rdma (conn, 
-                                               msg->oibm_u.completion.oibcm_cookie,
-                                               msg->oibm_u.completion.oibcm_status);
-                koibnal_post_rx (rx, 1);
+                kibnal_complete_passive_rdma (conn, 
+                                              msg->ibm_u.completion.ibcm_cookie,
+                                              msg->ibm_u.completion.ibcm_status);
+                kibnal_post_rx (rx, 1);
                 return;
                         
         default:
                 CERROR ("Can't parse type from "LPX64": %d\n",
-                        conn->ibc_peer->ibp_nid, msg->oibm_type);
+                        conn->ibc_peer->ibp_nid, msg->ibm_type);
                 goto failed;
         }
 
-        /* schedule for koibnal_rx() in thread context */
-        spin_lock_irqsave(&koibnal_data.koib_sched_lock, flags);
+        /* schedule for kibnal_rx() in thread context */
+        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
         
-        list_add_tail (&rx->rx_list, &koibnal_data.koib_sched_rxq);
-        wake_up (&koibnal_data.koib_sched_waitq);
+        list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
+        wake_up (&kibnal_data.kib_sched_waitq);
         
-        spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags);
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
         return;
         
  failed:
         CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
-        koibnal_close_conn(conn, -ECONNABORTED);
+        kibnal_close_conn(conn, -ECONNABORTED);
 
         /* Don't re-post rx & drop its ref on conn */
-        koibnal_put_conn(conn);
+        kibnal_put_conn(conn);
 }
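
The receive path above implements the credit-based flow control: every message piggybacks the receive credits its sender has accumulated (ibm_credits), and the receiver adds them to ibc_credits before trying to drain its send queue. A toy single-threaded model of the exchange (illustrative names):

/* One credit == one receive buffer the peer has reposted for me. */
struct conn {
        int credits;        /* sends I may post   (ibc_credits) */
        int outstanding;    /* credits I owe back (ibc_outstanding_credits) */
};

static int try_send(struct conn *c, int *piggyback)
{
        if (c->credits == 0)
                return 0;            /* stall until the peer returns credits */
        c->credits--;
        *piggyback = c->outstanding; /* owed credits ride on this message */
        c->outstanding = 0;
        return 1;
}

static void on_recv(struct conn *c, int piggyback)
{
        c->credits += piggyback;     /* buffers freed up at the peer */
        c->outstanding++;            /* owe one back once this rx is reposted */
}
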
 
 void
-koibnal_rx (koib_rx_t *rx)
+kibnal_rx (kib_rx_t *rx)
 {
-        koib_msg_t   *msg = rx->rx_msg;
+        kib_msg_t   *msg = rx->rx_msg;
 
         /* Clear flag so I can detect if I've sent an RDMA completion */
         rx->rx_rdma = 0;
 
-        switch (msg->oibm_type) {
-        case OPENIBNAL_MSG_GET_RDMA:
-                lib_parse(&koibnal_lib, &msg->oibm_u.rdma.oibrm_hdr, rx);
+        switch (msg->ibm_type) {
+        case IBNAL_MSG_GET_RDMA:
+                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
                 /* If the incoming get was matched, I'll have initiated the
                  * RDMA and the completion message... */
                 if (rx->rx_rdma)
@@ -487,12 +491,12 @@ koibnal_rx (koib_rx_t *rx)
                  * the peer's GET blocking for the full timeout. */
                 CERROR ("Completing unmatched RDMA GET from "LPX64"\n",
                         rx->rx_conn->ibc_peer->ibp_nid);
-                koibnal_start_active_rdma (OPENIBNAL_MSG_GET_DONE, -EIO,
-                                           rx, NULL, 0, NULL, NULL, 0, 0);
+                kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO,
+                                          rx, NULL, 0, NULL, NULL, 0, 0);
                 break;
                 
-        case OPENIBNAL_MSG_PUT_RDMA:
-                lib_parse(&koibnal_lib, &msg->oibm_u.rdma.oibrm_hdr, rx);
+        case IBNAL_MSG_PUT_RDMA:
+                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
                 if (rx->rx_rdma)
                         break;
                 /* This is most unusual, since even if lib_parse() didn't
@@ -505,8 +509,8 @@ koibnal_rx (koib_rx_t *rx)
                         rx->rx_conn->ibc_peer->ibp_nid);
                 break;
 
-        case OPENIBNAL_MSG_IMMEDIATE:
-                lib_parse(&koibnal_lib, &msg->oibm_u.immediate.oibim_hdr, rx);
+        case IBNAL_MSG_IMMEDIATE:
+                lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
                 LASSERT (!rx->rx_rdma);
                 break;
                 
@@ -515,12 +519,12 @@ koibnal_rx (koib_rx_t *rx)
                 break;
         }
 
-        koibnal_post_rx (rx, 1);
+        kibnal_post_rx (rx, 1);
 }
 
 #if 0
 int
-koibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)
+kibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)
 {
         struct page *page;
 
@@ -531,7 +535,7 @@ koibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)
         else if (vaddr >= PKMAP_BASE &&
                  vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
                 page = vmalloc_to_page ((void *)vaddr);
-                /* in 2.4 ^ just walks the page tables */
+        /* in 2.4 ^ just walks the page tables */
 #endif
         else
                 page = virt_to_page (vaddr);
@@ -540,13 +544,13 @@ koibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)
             !VALID_PAGE (page))
                 return (-EFAULT);
 
-        *physp = koibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1));
+        *physp = kibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1));
         return (0);
 }
 #endif
 
 int
-koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access,
+kibnal_map_iov (kib_tx_t *tx, enum ib_memory_access access,
                  int niov, struct iovec *iov, int offset, int nob)
                  
 {
@@ -555,7 +559,7 @@ koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access,
 
         LASSERT (nob > 0);
         LASSERT (niov > 0);
-        LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED);
+        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
 
         while (offset >= iov->iov_len) {
                 offset -= iov->iov_len;
@@ -572,7 +576,7 @@ koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access,
         vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
         tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
 
-        rc = ib_memory_register (koibnal_data.koib_pd,
+        rc = ib_memory_register (kibnal_data.kib_pd,
                                  vaddr, nob,
                                  access,
                                  &tx->tx_md.md_handle.mr,
@@ -584,21 +588,21 @@ koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access,
                 return (rc);
         }
 
-        tx->tx_mapped = KOIB_TX_MAPPED;
+        tx->tx_mapped = KIB_TX_MAPPED;
         return (0);
 }
 
 int
-koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
+kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access,
                   int nkiov, ptl_kiov_t *kiov,
                   int offset, int nob)
 {
-#if OPENIBNAL_FMR
+#if IBNAL_FMR
         __u64                      *phys;
-        const int                   mapped = KOIB_TX_MAPPED_FMR;
+        const int                   mapped = KIB_TX_MAPPED_FMR;
 #else
         struct ib_physical_buffer  *phys;
-        const int                   mapped = KOIB_TX_MAPPED;
+        const int                   mapped = KIB_TX_MAPPED;
 #endif
         int                         page_offset;
         int                         nphys;
@@ -610,7 +614,7 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
 
         LASSERT (nob > 0);
         LASSERT (nkiov > 0);
-        LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED);
+        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
 
         while (offset >= kiov->kiov_len) {
                 offset -= kiov->kiov_len;
@@ -627,10 +631,10 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
         }
 
         page_offset = kiov->kiov_offset + offset;
-#if OPENIBNAL_FMR
-        phys[0] = koibnal_page2phys(kiov->kiov_page);
+#if IBNAL_FMR
+        phys[0] = kibnal_page2phys(kiov->kiov_page);
 #else
-        phys[0].address = koibnal_page2phys(kiov->kiov_page);
+        phys[0].address = kibnal_page2phys(kiov->kiov_page);
         phys[0].size = PAGE_SIZE;
 #endif
         nphys = 1;
@@ -667,10 +671,10 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
                 }
 
                 LASSERT (nphys * sizeof (*phys) < phys_size);
-#if OPENIBNAL_FMR
-                phys[nphys] = koibnal_page2phys(kiov->kiov_page);
+#if IBNAL_FMR
+                phys[nphys] = kibnal_page2phys(kiov->kiov_page);
 #else
-                phys[nphys].address = koibnal_page2phys(kiov->kiov_page);
+                phys[nphys].address = kibnal_page2phys(kiov->kiov_page);
                 phys[nphys].size = PAGE_SIZE;
 #endif
                 nphys++;
@@ -683,10 +687,10 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
         for (rc = 0; rc < nphys; rc++)
                 CWARN ("   [%d] "LPX64" / %d\n", rc, phys[rc].address, phys[rc].size);
 #endif
-        tx->tx_md.md_addr = OPENIBNAL_RDMA_BASE;
+        tx->tx_md.md_addr = IBNAL_RDMA_BASE;
 
-#if OPENIBNAL_FMR
-        rc = ib_fmr_register_physical (koibnal_data.koib_fmr_pool,
+#if IBNAL_FMR
+        rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
                                        phys, nphys,
                                        &tx->tx_md.md_addr,
                                        page_offset,
@@ -694,7 +698,7 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
                                        &tx->tx_md.md_lkey,
                                        &tx->tx_md.md_rkey);
 #else
-        rc = ib_memory_register_physical (koibnal_data.koib_pd,
+        rc = ib_memory_register_physical (kibnal_data.kib_pd,
                                           phys, nphys,
                                           &tx->tx_md.md_addr,
                                           nob, page_offset,
@@ -717,24 +721,24 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
         return (rc);
 }
 
-koib_conn_t *
-koibnal_find_conn_locked (koib_peer_t *peer)
+kib_conn_t *
+kibnal_find_conn_locked (kib_peer_t *peer)
 {
         struct list_head *tmp;
 
         /* just return the first connection */
         list_for_each (tmp, &peer->ibp_conns) {
-                return (list_entry(tmp, koib_conn_t, ibc_list));
+                return (list_entry(tmp, kib_conn_t, ibc_list));
         }
 
         return (NULL);
 }
 
 void
-koibnal_check_sends (koib_conn_t *conn)
+kibnal_check_sends (kib_conn_t *conn)
 {
         unsigned long   flags;
-        koib_tx_t      *tx;
+        kib_tx_t       *tx;
         int             rc;
         int             i;
         int             done;
@@ -742,39 +746,39 @@ koibnal_check_sends (koib_conn_t *conn)
 
         spin_lock_irqsave (&conn->ibc_lock, flags);
 
+        LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
+
         if (list_empty(&conn->ibc_tx_queue) &&
-            conn->ibc_outstanding_credits >= OPENIBNAL_CREDIT_HIGHWATER) {
+            conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
                 spin_unlock_irqrestore(&conn->ibc_lock, flags);
-
-                tx = koibnal_get_idle_tx(0);     /* don't block */
+                
+                tx = kibnal_get_idle_tx(0);     /* don't block */
                 if (tx != NULL)
-                        koibnal_init_tx_msg(tx, OPENIBNAL_MSG_NOOP, 0);
+                        kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
 
                 spin_lock_irqsave(&conn->ibc_lock, flags);
-
+                
                 if (tx != NULL) {
                         atomic_inc(&conn->ibc_refcount);
-                        koibnal_queue_tx_locked(tx, conn);
+                        kibnal_queue_tx_locked(tx, conn);
                 }
         }
 
-        LASSERT (conn->ibc_nsends_posted <= OPENIBNAL_MSG_QUEUE_SIZE);
-
         while (!list_empty (&conn->ibc_tx_queue)) {
-                tx = list_entry (conn->ibc_tx_queue.next, koib_tx_t, tx_list);
+                tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
 
                 /* We rely on this for QP sizing */
                 LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= 2);
 
                 LASSERT (conn->ibc_outstanding_credits >= 0);
-                LASSERT (conn->ibc_outstanding_credits <= OPENIBNAL_MSG_QUEUE_SIZE);
+                LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
                 LASSERT (conn->ibc_credits >= 0);
-                LASSERT (conn->ibc_credits <= OPENIBNAL_MSG_QUEUE_SIZE);
+                LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
 
                 /* Not on ibc_rdma_queue */
                 LASSERT (!tx->tx_passive_rdma_wait);
 
-                if (conn->ibc_nsends_posted == OPENIBNAL_MSG_QUEUE_SIZE)
+                if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
                         break;
 
                 if (conn->ibc_credits == 0)     /* no credits */
@@ -786,37 +790,29 @@ koibnal_check_sends (koib_conn_t *conn)
 
                 list_del (&tx->tx_list);
 
-                if (tx->tx_msg->oibm_type == OPENIBNAL_MSG_NOOP &&
+                if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
                     (!list_empty(&conn->ibc_tx_queue) ||
-                     conn->ibc_outstanding_credits < OPENIBNAL_CREDIT_HIGHWATER)) {
-                        /* Redundant NOOP */
+                     conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
+                        /* redundant NOOP */
                         spin_unlock_irqrestore(&conn->ibc_lock, flags);
-                        koibnal_tx_done(tx);
+                        kibnal_tx_done(tx);
                         spin_lock_irqsave(&conn->ibc_lock, flags);
                         continue;
                 }
-                
-                /* incoming RDMA completion can find this one now */
-                if (tx->tx_passive_rdma) {
-                        list_add (&tx->tx_list, &conn->ibc_rdma_queue);
-                        tx->tx_passive_rdma_wait = 1;
-                        tx->tx_passive_rdma_deadline = 
-                                jiffies + koibnal_tunables.koib_io_timeout * HZ;
-                }
 
-                tx->tx_msg->oibm_credits = conn->ibc_outstanding_credits;
+                tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits;
                 conn->ibc_outstanding_credits = 0;
 
-                /* use the free memory barrier when we unlock to ensure
-                 * sending set before we can get the tx callback. */
                 conn->ibc_nsends_posted++;
                 conn->ibc_credits--;
-                tx->tx_sending = tx->tx_nsp;
 
-#if OPENIBNAL_CKSUM
-                tx->tx_msg->oibm_cksum = 0;
-                tx->tx_msg->oibm_cksum = koibnal_cksum(tx->tx_msg, tx->tx_msg->oibm_nob);
-                CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->oibm_cksum, tx->tx_msg->oibm_nob);
+                tx->tx_sending = tx->tx_nsp;
+                tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
+                list_add (&tx->tx_list, &conn->ibc_active_txs);
+#if IBNAL_CKSUM
+                tx->tx_msg->ibm_cksum = 0;
+                tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob);
+                CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob);
 #endif
                 spin_unlock_irqrestore (&conn->ibc_lock, flags);
 
@@ -827,7 +823,7 @@ koibnal_check_sends (koib_conn_t *conn)
 
                 rc = -ECONNABORTED;
                 nwork = 0;
-                if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) {
+                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
                         tx->tx_status = 0;
                         /* Driver only accepts 1 item at a time */
                         for (i = 0; i < tx->tx_nsp; i++) {
@@ -842,31 +838,31 @@ koibnal_check_sends (koib_conn_t *conn)
                 if (rc != 0) {
                         /* NB credits are transferred in the actual
                          * message, which can only be the last work item */
-                        conn->ibc_outstanding_credits += tx->tx_msg->oibm_credits;
+                        conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
                         conn->ibc_credits++;
                         conn->ibc_nsends_posted--;
-                        tx->tx_sending -= tx->tx_nsp - nwork;
+
                         tx->tx_status = rc;
+                        tx->tx_passive_rdma_wait = 0;
+                        tx->tx_sending -= tx->tx_nsp - nwork;
+
                         done = (tx->tx_sending == 0);
-                        
-                        if (tx->tx_passive_rdma) {
-                                tx->tx_passive_rdma_wait = 0;
+                        if (done)
                                 list_del (&tx->tx_list);
-                        }
                         
                         spin_unlock_irqrestore (&conn->ibc_lock, flags);
                         
-                        if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED)
+                        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
                                 CERROR ("Error %d posting transmit to "LPX64"\n", 
                                         rc, conn->ibc_peer->ibp_nid);
                         else
                                 CDEBUG (D_NET, "Error %d posting transmit to "
                                         LPX64"\n", rc, conn->ibc_peer->ibp_nid);
 
-                        koibnal_close_conn (conn, rc);
+                        kibnal_close_conn (conn, rc);
 
                         if (done)
-                                koibnal_tx_done (tx);
+                                kibnal_tx_done (tx);
                         return;
                 }
                 
@@ -876,10 +872,10 @@ koibnal_check_sends (koib_conn_t *conn)
 }
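
When kibnal_check_sends() above finds nothing queued but a backlog of credits owed, it sends an explicit NOOP just to return them, and drops any NOOP that has become redundant by the time it reaches the head of the queue. The trigger condition reduces to something like this sketch (the threshold value is illustrative; see IBNAL_CREDIT_HIGHWATER):

#define CREDIT_HIGHWATER 7      /* illustrative threshold */

/* A NOOP is worthwhile only when no queued message would carry the
 * owed credits anyway and the backlog has crossed the highwater mark. */
static int need_noop(int queue_empty, int outstanding_credits)
{
        return queue_empty && outstanding_credits >= CREDIT_HIGHWATER;
}
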
 
 void
-koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
+kibnal_tx_callback (struct ib_cq_entry *e)
 {
-        koib_tx_t    *tx = (koib_tx_t *)((unsigned long)e->work_request_id);
-        koib_conn_t  *conn;
+        kib_tx_t     *tx = (kib_tx_t *)kibnal_wreqid2ptr(e->work_request_id);
+        kib_conn_t   *conn;
         unsigned long flags;
         int           idle;
 
@@ -901,6 +897,8 @@ koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
         tx->tx_sending--;
         idle = (tx->tx_sending == 0) &&         /* This is the final callback */
                (!tx->tx_passive_rdma_wait);     /* Not waiting for RDMA completion */
+        if (idle)
+                list_del(&tx->tx_list);
 
         CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
@@ -917,53 +915,62 @@ koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
         spin_unlock_irqrestore(&conn->ibc_lock, flags);
 
         if (idle)
-                koibnal_tx_done (tx);
+                kibnal_tx_done (tx);
 
         if (e->status != IB_COMPLETION_STATUS_SUCCESS) {
                 CERROR ("Tx completion to "LPX64" failed: %d\n", 
                         conn->ibc_peer->ibp_nid, e->status);
-                koibnal_close_conn (conn, -ENETDOWN);
+                kibnal_close_conn (conn, -ENETDOWN);
         } else {
                 /* can I shovel some more sends out the door? */
-                koibnal_check_sends(conn);
+                kibnal_check_sends(conn);
         }
 
-        koibnal_put_conn (conn);
+        kibnal_put_conn (conn);
 }
 
 void
-koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob)
+kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
+{
+        if (kibnal_wreqid_is_rx(e->work_request_id))
+                kibnal_rx_callback (e);
+        else
+                kibnal_tx_callback (e);
+}
+
+void
+kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
 {
         struct ib_gather_scatter *gl = &tx->tx_gl[tx->tx_nsp];
         struct ib_send_param     *sp = &tx->tx_sp[tx->tx_nsp];
         int                       fence;
-        int                       nob = offsetof (koib_msg_t, oibm_u) + body_nob;
+        int                       nob = offsetof (kib_msg_t, ibm_u) + body_nob;
 
         LASSERT (tx->tx_nsp >= 0 && 
                  tx->tx_nsp < sizeof(tx->tx_sp)/sizeof(tx->tx_sp[0]));
-        LASSERT (nob <= OPENIBNAL_MSG_SIZE);
+        LASSERT (nob <= IBNAL_MSG_SIZE);
         
-        tx->tx_msg->oibm_magic = OPENIBNAL_MSG_MAGIC;
-        tx->tx_msg->oibm_version = OPENIBNAL_MSG_VERSION;
-        tx->tx_msg->oibm_type = type;
-#if OPENIBNAL_CKSUM
-        tx->tx_msg->oibm_nob = nob;
+        tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC;
+        tx->tx_msg->ibm_version = IBNAL_MSG_VERSION;
+        tx->tx_msg->ibm_type = type;
+#if IBNAL_CKSUM
+        tx->tx_msg->ibm_nob = nob;
 #endif
         /* Fence the message if it's bundled with an RDMA read */
         fence = (tx->tx_nsp > 0) &&
-                (type == OPENIBNAL_MSG_PUT_DONE);
+                (type == IBNAL_MSG_PUT_DONE);
 
         *gl = (struct ib_gather_scatter) {
                 .address = tx->tx_vaddr,
                 .length  = nob,
-                .key     = koibnal_data.koib_tx_pages->oibp_lkey,
+                .key     = kibnal_data.kib_tx_pages->ibp_lkey,
         };
 
         /* NB If this is an RDMA read, the completion message must wait for
          * the RDMA to complete.  Sends wait for previous RDMA writes
          * anyway... */
         *sp = (struct ib_send_param) {
-                .work_request_id      = (__u64)((unsigned long)tx),
+                .work_request_id      = kibnal_ptr2wreqid(tx, 0),
                 .op                   = IB_OP_SEND,
                 .gather_list          = gl,
                 .num_gather_entries   = 1,
@@ -979,26 +986,26 @@ koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob)
 }
 
 void
-koibnal_queue_tx (koib_tx_t *tx, koib_conn_t *conn)
+kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
 {
         unsigned long         flags;
 
         spin_lock_irqsave(&conn->ibc_lock, flags);
 
-        koibnal_queue_tx_locked (tx, conn);
+        kibnal_queue_tx_locked (tx, conn);
         
         spin_unlock_irqrestore(&conn->ibc_lock, flags);
         
-        koibnal_check_sends(conn);
+        kibnal_check_sends(conn);
 }
 
 void
-koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
+kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
 {
         unsigned long    flags;
-        koib_peer_t     *peer;
-        koib_conn_t     *conn;
-        rwlock_t        *g_lock = &koibnal_data.koib_global_lock;
+        kib_peer_t      *peer;
+        kib_conn_t      *conn;
+        rwlock_t        *g_lock = &kibnal_data.kib_global_lock;
 
         /* If I get here, I've committed to send, so I complete the tx with
          * failure on any problems */
@@ -1008,15 +1015,15 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
 
         read_lock (g_lock);
         
-        peer = koibnal_find_peer_locked (nid);
+        peer = kibnal_find_peer_locked (nid);
         if (peer == NULL) {
                 read_unlock (g_lock);
                 tx->tx_status = -EHOSTUNREACH;
-                koibnal_tx_done (tx);
+                kibnal_tx_done (tx);
                 return;
         }
 
-        conn = koibnal_find_conn_locked (peer);
+        conn = kibnal_find_conn_locked (peer);
         if (conn != NULL) {
                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                        conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
@@ -1024,7 +1031,7 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
                 atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
                 read_unlock (g_lock);
                 
-                koibnal_queue_tx (tx, conn);
+                kibnal_queue_tx (tx, conn);
                 return;
         }
         
@@ -1032,15 +1039,15 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
         read_unlock (g_lock);
         write_lock_irqsave (g_lock, flags);
 
-        peer = koibnal_find_peer_locked (nid);
+        peer = kibnal_find_peer_locked (nid);
         if (peer == NULL) {
                 write_unlock_irqrestore (g_lock, flags);
                 tx->tx_status = -EHOSTUNREACH;
-                koibnal_tx_done (tx);
+                kibnal_tx_done (tx);
                 return;
         }
 
-        conn = koibnal_find_conn_locked (peer);
+        conn = kibnal_find_conn_locked (peer);
         if (conn != NULL) {
                 /* Connection exists; queue message on it */
                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
@@ -1049,7 +1056,7 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
                 atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
                 write_unlock_irqrestore (g_lock, flags);
                 
-                koibnal_queue_tx (tx, conn);
+                kibnal_queue_tx (tx, conn);
                 return;
         }
 
@@ -1057,20 +1064,20 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
                 if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
                         write_unlock_irqrestore (g_lock, flags);
                         tx->tx_status = -EHOSTUNREACH;
-                        koibnal_tx_done (tx);
+                        kibnal_tx_done (tx);
                         return;
                 }
         
                 peer->ibp_connecting = 1;
                 atomic_inc (&peer->ibp_refcount); /* extra ref for connd */
         
-                spin_lock (&koibnal_data.koib_connd_lock);
+                spin_lock (&kibnal_data.kib_connd_lock);
         
                 list_add_tail (&peer->ibp_connd_list,
-                               &koibnal_data.koib_connd_peers);
-                wake_up (&koibnal_data.koib_connd_waitq);
+                               &kibnal_data.kib_connd_peers);
+                wake_up (&kibnal_data.kib_connd_waitq);
         
-                spin_unlock (&koibnal_data.koib_connd_lock);
+                spin_unlock (&kibnal_data.kib_connd_lock);
         }
         
         /* A connection is being established; queue the message... */
@@ -1080,49 +1087,49 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
 }
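
kibnal_launch_tx() above uses the classic read-then-write double-checked lookup: the common case takes only the read lock; on a miss it drops the read lock, takes the write lock, and re-checks, since another thread may have created the peer or connection in the window. A standalone sketch of the pattern (pthread rwlock, stubbed lookup):

#include <pthread.h>
#include <stddef.h>

struct peer { int nid; };

static pthread_rwlock_t g_lock = PTHREAD_RWLOCK_INITIALIZER;

static struct peer *find_peer(int nid)     /* stub for the hash lookup */
{
        (void)nid;
        return NULL;
}

static struct peer *lookup(int nid)
{
        struct peer *p;

        pthread_rwlock_rdlock(&g_lock);    /* fast path: readers only */
        p = find_peer(nid);
        pthread_rwlock_unlock(&g_lock);
        if (p != NULL)
                return p;

        pthread_rwlock_wrlock(&g_lock);    /* slow path: may modify state */
        p = find_peer(nid);                /* re-check: the lock was dropped */
        /* on a genuine miss, queue a connection attempt while still locked */
        pthread_rwlock_unlock(&g_lock);
        return p;
}
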
 
 ptl_err_t
-koibnal_start_passive_rdma (int type, ptl_nid_t nid,
+kibnal_start_passive_rdma (int type, ptl_nid_t nid,
                             lib_msg_t *libmsg, ptl_hdr_t *hdr)
 {
         int         nob = libmsg->md->length;
-        koib_tx_t  *tx;
-        koib_msg_t *oibmsg;
+        kib_tx_t   *tx;
+        kib_msg_t  *ibmsg;
         int         rc;
         int         access;
         
-        LASSERT (type == OPENIBNAL_MSG_PUT_RDMA || 
-                 type == OPENIBNAL_MSG_GET_RDMA);
+        LASSERT (type == IBNAL_MSG_PUT_RDMA || 
+                 type == IBNAL_MSG_GET_RDMA);
         LASSERT (nob > 0);
         LASSERT (!in_interrupt());              /* Mapping could block */
 
-        if (type == OPENIBNAL_MSG_PUT_RDMA) {
+        if (type == IBNAL_MSG_PUT_RDMA) {
                 access = IB_ACCESS_REMOTE_READ;
         } else {
                 access = IB_ACCESS_REMOTE_WRITE |
                          IB_ACCESS_LOCAL_WRITE;
         }
 
-        tx = koibnal_get_idle_tx (1);           /* May block; caller is an app thread */
+        tx = kibnal_get_idle_tx (1);           /* May block; caller is an app thread */
         LASSERT (tx != NULL);
 
         if ((libmsg->md->options & PTL_MD_KIOV) == 0) 
-                rc = koibnal_map_iov (tx, access,
-                                      libmsg->md->md_niov,
-                                      libmsg->md->md_iov.iov,
-                                      0, nob);
+                rc = kibnal_map_iov (tx, access,
+                                     libmsg->md->md_niov,
+                                     libmsg->md->md_iov.iov,
+                                     0, nob);
         else
-                rc = koibnal_map_kiov (tx, access,
-                                       libmsg->md->md_niov, 
-                                       libmsg->md->md_iov.kiov,
-                                       0, nob);
+                rc = kibnal_map_kiov (tx, access,
+                                      libmsg->md->md_niov, 
+                                      libmsg->md->md_iov.kiov,
+                                      0, nob);
 
         if (rc != 0) {
                 CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc);
                 goto failed;
         }
         
-        if (type == OPENIBNAL_MSG_GET_RDMA) {
+        if (type == IBNAL_MSG_GET_RDMA) {
                 /* reply gets finalized when tx completes */
-                tx->tx_libmsg[1] = lib_create_reply_msg(&koibnal_lib, 
+                tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, 
                                                         nid, libmsg);
                 if (tx->tx_libmsg[1] == NULL) {
                         CERROR ("Can't create reply for GET -> "LPX64"\n",
@@ -1134,15 +1141,15 @@ koibnal_start_passive_rdma (int type, ptl_nid_t nid,
         
         tx->tx_passive_rdma = 1;
 
-        oibmsg = tx->tx_msg;
+        ibmsg = tx->tx_msg;
 
-        oibmsg->oibm_u.rdma.oibrm_hdr = *hdr;
-        oibmsg->oibm_u.rdma.oibrm_cookie = tx->tx_passive_rdma_cookie;
-        oibmsg->oibm_u.rdma.oibrm_desc.rd_key = tx->tx_md.md_rkey;
-        oibmsg->oibm_u.rdma.oibrm_desc.rd_addr = tx->tx_md.md_addr;
-        oibmsg->oibm_u.rdma.oibrm_desc.rd_nob = nob;
+        ibmsg->ibm_u.rdma.ibrm_hdr = *hdr;
+        ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
+        ibmsg->ibm_u.rdma.ibrm_desc.rd_key = tx->tx_md.md_rkey;
+        ibmsg->ibm_u.rdma.ibrm_desc.rd_addr = tx->tx_md.md_addr;
+        ibmsg->ibm_u.rdma.ibrm_desc.rd_nob = nob;
 
-        koibnal_init_tx_msg (tx, type, sizeof (koib_rdma_msg_t));
+        kibnal_init_tx_msg (tx, type, sizeof (kib_rdma_msg_t));
 
         CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
                LPX64", nob %d\n",
@@ -1152,25 +1159,25 @@ koibnal_start_passive_rdma (int type, ptl_nid_t nid,
         /* libmsg gets finalized when tx completes. */
         tx->tx_libmsg[0] = libmsg;
 
-        koibnal_launch_tx(tx, nid);
+        kibnal_launch_tx(tx, nid);
         return (PTL_OK);
 
  failed:
         tx->tx_status = rc;
-        koibnal_tx_done (tx);
+        kibnal_tx_done (tx);
         return (PTL_FAIL);
 }
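
The message built above advertises everything the peer needs to drive the transfer itself: the rkey, remote address, and length of the freshly mapped region, plus a cookie that the eventual PUT_DONE/GET_DONE completion echoes back so kibnal_complete_passive_rdma() can match it. Roughly this shape on the wire (field names mirror the code; the exact layout here is illustrative):

#include <stdint.h>

struct rdma_desc {
        uint32_t rd_key;            /* rkey of the mapped MD */
        uint32_t rd_nob;            /* number of bytes to move */
        uint64_t rd_addr;           /* remote virtual address */
};

struct rdma_msg {
        uint64_t         ibrm_cookie;   /* echoed in the DONE message */
        struct rdma_desc ibrm_desc;     /* where and how much to RDMA */
};
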
 
 void
-koibnal_start_active_rdma (int type, int status,
-                           koib_rx_t *rx, lib_msg_t *libmsg, 
+kibnal_start_active_rdma (int type, int status,
+                           kib_rx_t *rx, lib_msg_t *libmsg, 
                            unsigned int niov,
                            struct iovec *iov, ptl_kiov_t *kiov,
                            size_t offset, size_t nob)
 {
-        koib_msg_t   *rxmsg = rx->rx_msg;
-        koib_msg_t   *txmsg;
-        koib_tx_t    *tx;
+        kib_msg_t    *rxmsg = rx->rx_msg;
+        kib_msg_t    *txmsg;
+        kib_tx_t     *tx;
         int           access;
         int           rdma_op;
         int           rc;
@@ -1187,8 +1194,8 @@ koibnal_start_active_rdma (int type, int status,
         /* No data if we're completing with failure */
         LASSERT (status == 0 || nob == 0);
 
-        LASSERT (type == OPENIBNAL_MSG_GET_DONE ||
-                 type == OPENIBNAL_MSG_PUT_DONE);
+        LASSERT (type == IBNAL_MSG_GET_DONE ||
+                 type == IBNAL_MSG_PUT_DONE);
 
         /* Flag I'm completing the RDMA.  Even if I fail to send the
          * completion message, I will have tried my best so further
@@ -1196,22 +1203,22 @@ koibnal_start_active_rdma (int type, int status,
         LASSERT (!rx->rx_rdma);
         rx->rx_rdma = 1;
 
-        if (type == OPENIBNAL_MSG_GET_DONE) {
+        if (type == IBNAL_MSG_GET_DONE) {
                 access   = 0;
                 rdma_op  = IB_OP_RDMA_WRITE;
-                LASSERT (rxmsg->oibm_type == OPENIBNAL_MSG_GET_RDMA);
+                LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);
         } else {
                 access   = IB_ACCESS_LOCAL_WRITE;
                 rdma_op  = IB_OP_RDMA_READ;
-                LASSERT (rxmsg->oibm_type == OPENIBNAL_MSG_PUT_RDMA);
+                LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
         }
 
-        tx = koibnal_get_idle_tx (0);           /* Mustn't block */
+        tx = kibnal_get_idle_tx (0);           /* Mustn't block */
         if (tx == NULL) {
                 CERROR ("tx descs exhausted on RDMA from "LPX64
                         " completing locally with failure\n",
-                         rx->rx_conn->ibc_peer->ibp_nid);
-                lib_finalize (&koibnal_lib, NULL, libmsg, PTL_NO_SPACE);
+                        rx->rx_conn->ibc_peer->ibp_nid);
+                lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE);
                 return;
         }
         LASSERT (tx->tx_nsp == 0);
@@ -1222,11 +1229,11 @@ koibnal_start_active_rdma (int type, int status,
                  * message is matched) */
 
                 if (kiov != NULL)
-                        rc = koibnal_map_kiov (tx, access,
-                                               niov, kiov, offset, nob);
+                        rc = kibnal_map_kiov (tx, access,
+                                              niov, kiov, offset, nob);
                 else
-                        rc = koibnal_map_iov (tx, access,
-                                              niov, iov, offset, nob);
+                        rc = kibnal_map_iov (tx, access,
+                                             niov, iov, offset, nob);
                 
                 if (rc != 0) {
                         CERROR ("Can't map RDMA -> "LPX64": %d\n", 
@@ -1242,12 +1249,12 @@ koibnal_start_active_rdma (int type, int status,
                         };
                 
                         tx->tx_sp[0] = (struct ib_send_param) {
-                                .work_request_id      = (__u64)((unsigned long)tx),
+                                .work_request_id      = kibnal_ptr2wreqid(tx, 0),
                                 .op                   = rdma_op,
                                 .gather_list          = &tx->tx_gl[0],
                                 .num_gather_entries   = 1,
-                                .remote_address       = rxmsg->oibm_u.rdma.oibrm_desc.rd_addr,
-                                .rkey                 = rxmsg->oibm_u.rdma.oibrm_desc.rd_key,
+                                .remote_address       = rxmsg->ibm_u.rdma.ibrm_desc.rd_addr,
+                                .rkey                 = rxmsg->ibm_u.rdma.ibrm_desc.rd_key,
                                 .device_specific      = NULL,
                                 .solicited_event      = 0,
                                 .signaled             = 1,
@@ -1262,10 +1269,10 @@ koibnal_start_active_rdma (int type, int status,
 
         txmsg = tx->tx_msg;
 
-        txmsg->oibm_u.completion.oibcm_cookie = rxmsg->oibm_u.rdma.oibrm_cookie;
-        txmsg->oibm_u.completion.oibcm_status = status;
+        txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
+        txmsg->ibm_u.completion.ibcm_status = status;
         
-        koibnal_init_tx_msg(tx, type, sizeof (koib_completion_msg_t));
+        kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
 
         if (status == 0 && nob != 0) {
                 LASSERT (tx->tx_nsp > 1);
@@ -1277,7 +1284,7 @@ koibnal_start_active_rdma (int type, int status,
                 LASSERT (tx->tx_nsp == 1);
                 /* No RDMA: local completion happens now! */
                 CDEBUG(D_WARNING,"No data: immediate completion\n");
-                lib_finalize (&koibnal_lib, NULL, libmsg,
+                lib_finalize (&kibnal_lib, NULL, libmsg,
                               status == 0 ? PTL_OK : PTL_FAIL);
         }
 
@@ -1288,11 +1295,11 @@ koibnal_start_active_rdma (int type, int status,
                atomic_read (&rx->rx_conn->ibc_refcount));
         atomic_inc (&rx->rx_conn->ibc_refcount);
         /* ...and queue it up */
-        koibnal_queue_tx(tx, rx->rx_conn);
+        kibnal_queue_tx(tx, rx->rx_conn);
 }
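
The `.work_request_id` assignment above no longer stores a raw pointer cast;
it goes through kibnal_ptr2wreqid(tx, 0). The helper itself is defined in the
NAL header, outside this hunk. A minimal sketch of the packing it presumably
performs, assuming the second argument discriminates tx from rx work requests
for the completion handler and rides in the pointer's (free) low bit:

        /* sketch only: the authoritative definition lives in the NAL header */
        static inline __u64
        kibnal_ptr2wreqid (void *ptr, int isrx)
        {
                unsigned long lptr = (unsigned long)ptr;

                LASSERT ((lptr & 1) == 0);      /* descs are word-aligned */
                return (__u64)(lptr | (isrx ? 1 : 0));
        }

        static inline void *
        kibnal_wreqid2ptr (__u64 wreqid)
        {
                return (void *)(unsigned long)(wreqid & ~1UL);
        }
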
 
 ptl_err_t
-koibnal_sendmsg(lib_nal_t    *nal, 
+kibnal_sendmsg(lib_nal_t    *nal, 
                 void         *private,
                 lib_msg_t    *libmsg,
                 ptl_hdr_t    *hdr, 
@@ -1305,8 +1312,8 @@ koibnal_sendmsg(lib_nal_t    *nal,
                 size_t        payload_offset,
                 size_t        payload_nob)
 {
-        koib_msg_t *oibmsg;
-        koib_tx_t  *tx;
+        kib_msg_t  *ibmsg;
+        kib_tx_t   *tx;
         int         nob;
 
         /* NB 'private' is different depending on what we're sending.... */
@@ -1329,27 +1336,27 @@ koibnal_sendmsg(lib_nal_t    *nal,
                 
         case PTL_MSG_REPLY: {
                 /* reply's 'private' is the incoming receive */
-                koib_rx_t *rx = private;
+                kib_rx_t *rx = private;
 
                 /* RDMA reply expected? */
-                if (rx->rx_msg->oibm_type == OPENIBNAL_MSG_GET_RDMA) {
-                        koibnal_start_active_rdma(OPENIBNAL_MSG_GET_DONE, 0,
-                                                  rx, libmsg, payload_niov, 
-                                                  payload_iov, payload_kiov,
-                                                  payload_offset, payload_nob);
+                if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
+                        kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
+                                                 rx, libmsg, payload_niov, 
+                                                 payload_iov, payload_kiov,
+                                                 payload_offset, payload_nob);
                         return (PTL_OK);
                 }
                 
                 /* Incoming message consistent with immediate reply? */
-                if (rx->rx_msg->oibm_type != OPENIBNAL_MSG_IMMEDIATE) {
+                if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
                         CERROR ("REPLY to "LPX64" bad ibm type %d!!!\n",
-                                nid, rx->rx_msg->oibm_type);
+                                nid, rx->rx_msg->ibm_type);
                         return (PTL_FAIL);
                 }
 
                 /* Will it fit in a message? */
-                nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[payload_nob]);
-                if (nob >= OPENIBNAL_MSG_SIZE) {
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+                if (nob >= IBNAL_MSG_SIZE) {
                         CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n", 
                                nid, payload_nob);
                         return (PTL_FAIL);
@@ -1359,10 +1366,10 @@ koibnal_sendmsg(lib_nal_t    *nal,
 
         case PTL_MSG_GET:
                 /* might the REPLY message be big enough to need RDMA? */
-                nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[libmsg->md->length]);
-                if (nob > OPENIBNAL_MSG_SIZE)
-                        return (koibnal_start_passive_rdma(OPENIBNAL_MSG_GET_RDMA, 
-                                                           nid, libmsg, hdr));
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
+                if (nob > IBNAL_MSG_SIZE)
+                        return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, 
+                                                          nid, libmsg, hdr));
                 break;
 
         case PTL_MSG_ACK:
@@ -1371,181 +1378,181 @@ koibnal_sendmsg(lib_nal_t    *nal,
 
         case PTL_MSG_PUT:
                 /* Is the payload big enough to need RDMA? */
-                nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[payload_nob]);
-                if (nob > OPENIBNAL_MSG_SIZE)
-                        return (koibnal_start_passive_rdma(OPENIBNAL_MSG_PUT_RDMA,
-                                                           nid, libmsg, hdr));
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+                if (nob > IBNAL_MSG_SIZE)
+                        return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
+                                                          nid, libmsg, hdr));
                 
                 break;
         }
 
-        tx = koibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
-                                   type == PTL_MSG_REPLY ||
-                                   in_interrupt()));
+        tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
+                                  type == PTL_MSG_REPLY ||
+                                  in_interrupt()));
         if (tx == NULL) {
                 CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", 
                         type, nid, in_interrupt() ? " (intr)" : "");
                 return (PTL_NO_SPACE);
         }
 
-        oibmsg = tx->tx_msg;
-        oibmsg->oibm_u.immediate.oibim_hdr = *hdr;
+        ibmsg = tx->tx_msg;
+        ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
 
         if (payload_nob > 0) {
                 if (payload_kiov != NULL)
-                        lib_copy_kiov2buf(oibmsg->oibm_u.immediate.oibim_payload,
+                        lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
                                           payload_niov, payload_kiov,
                                           payload_offset, payload_nob);
                 else
-                        lib_copy_iov2buf(oibmsg->oibm_u.immediate.oibim_payload,
+                        lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
                                          payload_niov, payload_iov,
                                          payload_offset, payload_nob);
         }
 
-        koibnal_init_tx_msg (tx, OPENIBNAL_MSG_IMMEDIATE,
-                             offsetof(koib_immediate_msg_t, 
-                                      oibim_payload[payload_nob]));
+        kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
+                            offsetof(kib_immediate_msg_t, 
+                                     ibim_payload[payload_nob]));
 
         /* libmsg gets finalized when tx completes */
         tx->tx_libmsg[0] = libmsg;
 
-        koibnal_launch_tx(tx, nid);
+        kibnal_launch_tx(tx, nid);
         return (PTL_OK);
 }
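
The immediate-vs-RDMA decisions in kibnal_sendmsg() all use the same sizing
idiom: offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[nob]) is the size of
the whole wire message once nob payload bytes follow the headers (offsetof
with a variable index is a GNU C idiom used throughout this code). A toy
illustration with a hypothetical struct, not the real kib_msg_t:

        #include <stddef.h>

        typedef struct {
                unsigned int hdr_a;        /* stand-ins for the real headers */
                unsigned int hdr_b;
                char         payload[0];   /* variable-length payload */
        } demo_msg_t;

        /* wire size of a message carrying 'nob' payload bytes */
        #define DEMO_MSG_NOB(nob)  offsetof(demo_msg_t, payload[nob])

        /* DEMO_MSG_NOB(0) == 8 here; when the analogous value exceeds
         * IBNAL_MSG_SIZE, the payload is moved by RDMA instead */
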
 
 ptl_err_t
-koibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
+kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
                ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
                unsigned int payload_niov, struct iovec *payload_iov,
                size_t payload_offset, size_t payload_len)
 {
-        return (koibnal_sendmsg(nal, private, cookie,
-                                 hdr, type, nid, pid,
-                                 payload_niov, payload_iov, NULL,
-                                 payload_offset, payload_len));
+        return (kibnal_sendmsg(nal, private, cookie,
+                               hdr, type, nid, pid,
+                               payload_niov, payload_iov, NULL,
+                               payload_offset, payload_len));
 }
 
 ptl_err_t
-koibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
+kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
                      ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
                      unsigned int payload_niov, ptl_kiov_t *payload_kiov, 
                      size_t payload_offset, size_t payload_len)
 {
-        return (koibnal_sendmsg(nal, private, cookie,
-                                 hdr, type, nid, pid,
-                                 payload_niov, NULL, payload_kiov,
-                                 payload_offset, payload_len));
+        return (kibnal_sendmsg(nal, private, cookie,
+                               hdr, type, nid, pid,
+                               payload_niov, NULL, payload_kiov,
+                               payload_offset, payload_len));
 }
 
 ptl_err_t
-koibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
+kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
                  unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
                  size_t offset, size_t mlen, size_t rlen)
 {
-        koib_rx_t                *rx = private;
-        koib_msg_t               *rxmsg = rx->rx_msg;
-        int                       msg_nob;
+        kib_rx_t    *rx = private;
+        kib_msg_t   *rxmsg = rx->rx_msg;
+        int          msg_nob;
         
         LASSERT (mlen <= rlen);
         LASSERT (!in_interrupt ());
         /* Either all pages or all vaddrs */
         LASSERT (!(kiov != NULL && iov != NULL));
 
-        switch (rxmsg->oibm_type) {
+        switch (rxmsg->ibm_type) {
         default:
                 LBUG();
                 return (PTL_FAIL);
                 
-        case OPENIBNAL_MSG_IMMEDIATE:
-                msg_nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[rlen]);
-                if (msg_nob > OPENIBNAL_MSG_SIZE) {
+        case IBNAL_MSG_IMMEDIATE:
+                msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+                if (msg_nob > IBNAL_MSG_SIZE) {
                         CERROR ("Immediate message from "LPX64" too big: %d\n",
-                                rxmsg->oibm_u.immediate.oibim_hdr.src_nid, rlen);
+                                rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
                         return (PTL_FAIL);
                 }
 
                 if (kiov != NULL)
                         lib_copy_buf2kiov(niov, kiov, offset,
-                                          rxmsg->oibm_u.immediate.oibim_payload,
+                                          rxmsg->ibm_u.immediate.ibim_payload,
                                           mlen);
                 else
                         lib_copy_buf2iov(niov, iov, offset,
-                                         rxmsg->oibm_u.immediate.oibim_payload,
+                                         rxmsg->ibm_u.immediate.ibim_payload,
                                          mlen);
 
                 lib_finalize (nal, NULL, libmsg, PTL_OK);
                 return (PTL_OK);
 
-        case OPENIBNAL_MSG_GET_RDMA:
+        case IBNAL_MSG_GET_RDMA:
                 /* We get called here just to discard any junk after the
                  * GET hdr. */
                 LASSERT (libmsg == NULL);
                 lib_finalize (nal, NULL, libmsg, PTL_OK);
                 return (PTL_OK);
 
-        case OPENIBNAL_MSG_PUT_RDMA:
-                koibnal_start_active_rdma (OPENIBNAL_MSG_PUT_DONE, 0,
-                                           rx, libmsg, 
-                                           niov, iov, kiov, offset, mlen);
+        case IBNAL_MSG_PUT_RDMA:
+                kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0,
+                                          rx, libmsg, 
+                                          niov, iov, kiov, offset, mlen);
                 return (PTL_OK);
         }
 }
 
 ptl_err_t
-koibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
+kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
               unsigned int niov, struct iovec *iov, 
               size_t offset, size_t mlen, size_t rlen)
 {
-        return (koibnal_recvmsg (nal, private, msg, niov, iov, NULL,
-                                 offset, mlen, rlen));
+        return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
+                                offset, mlen, rlen));
 }
 
 ptl_err_t
-koibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
+kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
                      unsigned int niov, ptl_kiov_t *kiov, 
                      size_t offset, size_t mlen, size_t rlen)
 {
-        return (koibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
-                                 offset, mlen, rlen));
+        return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
+                                offset, mlen, rlen));
 }
 
 int
-koibnal_thread_start (int (*fn)(void *arg), void *arg)
+kibnal_thread_start (int (*fn)(void *arg), void *arg)
 {
         long    pid = kernel_thread (fn, arg, 0);
 
         if (pid < 0)
                 return ((int)pid);
 
-        atomic_inc (&koibnal_data.koib_nthreads);
+        atomic_inc (&kibnal_data.kib_nthreads);
         return (0);
 }
 
 void
-koibnal_thread_fini (void)
+kibnal_thread_fini (void)
 {
-        atomic_dec (&koibnal_data.koib_nthreads);
+        atomic_dec (&kibnal_data.kib_nthreads);
 }
 
 void
-koibnal_close_conn_locked (koib_conn_t *conn, int error)
+kibnal_close_conn_locked (kib_conn_t *conn, int error)
 {
         /* This just does the immediate housekeeping, and schedules the
          * connection for the connd to finish off.
-         * Caller holds koib_global_lock exclusively in irq context */
-        koib_peer_t   *peer = conn->ibc_peer;
+         * Caller holds kib_global_lock exclusively in irq context */
+        kib_peer_t   *peer = conn->ibc_peer;
 
         CDEBUG (error == 0 ? D_NET : D_ERROR,
                 "closing conn to "LPX64": error %d\n", peer->ibp_nid, error);
         
-        LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED ||
-                 conn->ibc_state == OPENIBNAL_CONN_CONNECTING);
+        LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED ||
+                 conn->ibc_state == IBNAL_CONN_CONNECTING);
 
-        if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) {
-                /* koib_connd_conns takes ibc_list's ref */
+        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+                /* kib_connd_conns takes ibc_list's ref */
                 list_del (&conn->ibc_list);
         } else {
-                /* new ref for koib_connd_conns */
+                /* new ref for kib_connd_conns */
                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                        conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
                        atomic_read (&conn->ibc_refcount));
@@ -1555,57 +1562,57 @@ koibnal_close_conn_locked (koib_conn_t *conn, int error)
         if (list_empty (&peer->ibp_conns) &&
             peer->ibp_persistence == 0) {
                 /* Non-persistent peer with no more conns... */
-                koibnal_unlink_peer_locked (peer);
+                kibnal_unlink_peer_locked (peer);
         }
 
-        conn->ibc_state = OPENIBNAL_CONN_DEATHROW;
+        conn->ibc_state = IBNAL_CONN_DEATHROW;
 
         /* Schedule conn for closing/destruction */
-        spin_lock (&koibnal_data.koib_connd_lock);
+        spin_lock (&kibnal_data.kib_connd_lock);
 
-        list_add_tail (&conn->ibc_list, &koibnal_data.koib_connd_conns);
-        wake_up (&koibnal_data.koib_connd_waitq);
+        list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+        wake_up (&kibnal_data.kib_connd_waitq);
                 
-        spin_unlock (&koibnal_data.koib_connd_lock);
+        spin_unlock (&kibnal_data.kib_connd_lock);
 }
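
kibnal_close_conn_locked() runs with kib_global_lock held in irq context, so
it cannot do the blocking teardown itself; it only marks the conn DEATHROW
and hands it to the connd thread through kib_connd_conns. The consumer side
(kibnal_connd, which begins below but is cut off in this hunk) presumably
follows the usual dequeue-and-process shape; a generic sketch under that
assumption, not the real loop:

        for (;;) {
                spin_lock (&kibnal_data.kib_connd_lock);

                if (!list_empty (&kibnal_data.kib_connd_conns)) {
                        conn = list_entry (kibnal_data.kib_connd_conns.next,
                                           kib_conn_t, ibc_list);
                        list_del (&conn->ibc_list);
                        spin_unlock (&kibnal_data.kib_connd_lock);

                        kibnal_terminate_conn (conn);   /* may block now */
                        kibnal_put_conn (conn);
                        continue;
                }

                spin_unlock (&kibnal_data.kib_connd_lock);
                wait_event (kibnal_data.kib_connd_waitq,
                            !list_empty (&kibnal_data.kib_connd_conns));
        }
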
 
 int
-koibnal_close_conn (koib_conn_t *conn, int why)
+kibnal_close_conn (kib_conn_t *conn, int why)
 {
         unsigned long     flags;
         int               count = 0;
 
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
-        LASSERT (conn->ibc_state >= OPENIBNAL_CONN_CONNECTING);
+        LASSERT (conn->ibc_state >= IBNAL_CONN_CONNECTING);
         
-        if (conn->ibc_state <= OPENIBNAL_CONN_ESTABLISHED) {
+        if (conn->ibc_state <= IBNAL_CONN_ESTABLISHED) {
                 count = 1;
-                koibnal_close_conn_locked (conn, why);
+                kibnal_close_conn_locked (conn, why);
         }
         
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
         return (count);
 }
 
 void
-koibnal_peer_connect_failed (koib_peer_t *peer, int active, int rc)
+kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
 {
         LIST_HEAD        (zombies);
-        koib_tx_t        *tx;
+        kib_tx_t         *tx;
         unsigned long     flags;
 
         LASSERT (rc != 0);
-        LASSERT (peer->ibp_reconnect_interval >= OPENIBNAL_MIN_RECONNECT_INTERVAL);
+        LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
 
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
         LASSERT (peer->ibp_connecting != 0);
         peer->ibp_connecting--;
 
         if (peer->ibp_connecting != 0) {
                 /* another connection attempt under way (loopback?)... */
-                write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
                 return;
         }
 
@@ -1614,50 +1621,50 @@ koibnal_peer_connect_failed (koib_peer_t *peer, int active, int rc)
                 peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
                 /* Increase reconnection interval */
                 peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
-                                                    OPENIBNAL_MAX_RECONNECT_INTERVAL);
+                                                    IBNAL_MAX_RECONNECT_INTERVAL);
         
                 /* Take peer's blocked transmits; I'll complete
                  * them with error */
                 while (!list_empty (&peer->ibp_tx_queue)) {
                         tx = list_entry (peer->ibp_tx_queue.next,
-                                         koib_tx_t, tx_list);
+                                         kib_tx_t, tx_list);
                         
                         list_del (&tx->tx_list);
                         list_add_tail (&tx->tx_list, &zombies);
                 }
                 
-                if (koibnal_peer_active(peer) &&
+                if (kibnal_peer_active(peer) &&
                     (peer->ibp_persistence == 0)) {
                         /* failed connection attempt on non-persistent peer */
-                        koibnal_unlink_peer_locked (peer);
+                        kibnal_unlink_peer_locked (peer);
                 }
         } else {
                 /* Can't have blocked transmits if there are connections */
                 LASSERT (list_empty(&peer->ibp_tx_queue));
         }
         
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
         if (!list_empty (&zombies))
                 CERROR ("Deleting messages for "LPX64": connection failed\n",
                         peer->ibp_nid);
 
         while (!list_empty (&zombies)) {
-                tx = list_entry (zombies.next, koib_tx_t, tx_list);
+                tx = list_entry (zombies.next, kib_tx_t, tx_list);
 
                 list_del (&tx->tx_list);
                 /* complete now */
                 tx->tx_status = -EHOSTUNREACH;
-                koibnal_tx_done (tx);
+                kibnal_tx_done (tx);
         }
 }
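
The reconnect throttling above implements exponential backoff: every failed
attempt doubles ibp_reconnect_interval up to IBNAL_MAX_RECONNECT_INTERVAL,
and a successful connect (see kibnal_connreq_done() below) resets it to
IBNAL_MIN_RECONNECT_INTERVAL. The constants are defined in the NAL header;
assuming, purely for illustration, a 1-second minimum and 60-second cap,
retries against a dead peer run at 1s, 2s, 4s, ..., 32s, then every 60s:

        /* the step applied on each failure, distilled */
        static inline unsigned long
        demo_next_reconnect_interval (unsigned long interval)
        {
                return MIN (interval * 2, IBNAL_MAX_RECONNECT_INTERVAL);
        }
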
 
 void
-koibnal_connreq_done (koib_conn_t *conn, int active, int status)
+kibnal_connreq_done (kib_conn_t *conn, int active, int status)
 {
         int               state = conn->ibc_state;
-        koib_peer_t      *peer = conn->ibc_peer;
-        koib_tx_t        *tx;
+        kib_peer_t       *peer = conn->ibc_peer;
+        kib_tx_t         *tx;
         unsigned long     flags;
         int               rc;
         int               i;
@@ -1669,31 +1676,31 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status)
                 conn->ibc_connreq = NULL;
         }
 
-        if (state == OPENIBNAL_CONN_CONNECTING) {
+        if (state == IBNAL_CONN_CONNECTING) {
                 /* Install common (active/passive) callback for
                  * disconnect/idle notification if I got as far as getting
                  * a CM comm_id */
                 rc = tsIbCmCallbackModify(conn->ibc_comm_id, 
-                                          koibnal_conn_callback, conn);
+                                          kibnal_conn_callback, conn);
                 LASSERT (rc == 0);
         }
         
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
         LASSERT (peer->ibp_connecting != 0);
         
         if (status == 0) {                         
                 /* connection established... */
-                LASSERT (state == OPENIBNAL_CONN_CONNECTING);
-                conn->ibc_state = OPENIBNAL_CONN_ESTABLISHED;
+                LASSERT (state == IBNAL_CONN_CONNECTING);
+                conn->ibc_state = IBNAL_CONN_ESTABLISHED;
 
-                if (!koibnal_peer_active(peer)) {
+                if (!kibnal_peer_active(peer)) {
                         /* ...but peer deleted meantime */
                         status = -ECONNABORTED;
                 }
         } else {
-                LASSERT (state == OPENIBNAL_CONN_INIT_QP ||
-                         state == OPENIBNAL_CONN_CONNECTING);
+                LASSERT (state == IBNAL_CONN_INIT_QP ||
+                         state == IBNAL_CONN_CONNECTING);
         }
 
         if (status == 0) {
@@ -1710,14 +1717,14 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status)
                 list_add (&conn->ibc_list, &peer->ibp_conns);
                 
                 /* reset reconnect interval for next attempt */
-                peer->ibp_reconnect_interval = OPENIBNAL_MIN_RECONNECT_INTERVAL;
+                peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
 
                 /* post blocked sends to the new connection */
                 spin_lock (&conn->ibc_lock);
                 
                 while (!list_empty (&peer->ibp_tx_queue)) {
                         tx = list_entry (peer->ibp_tx_queue.next, 
-                                         koib_tx_t, tx_list);
+                                         kib_tx_t, tx_list);
                         
                         list_del (&tx->tx_list);
 
@@ -1726,19 +1733,19 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status)
                                conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
                                atomic_read (&conn->ibc_refcount));
                         atomic_inc (&conn->ibc_refcount);
-                        koibnal_queue_tx_locked (tx, conn);
+                        kibnal_queue_tx_locked (tx, conn);
                 }
                 
                 spin_unlock (&conn->ibc_lock);
 
                 /* Nuke any dangling conns from a different peer instance... */
-                koibnal_close_stale_conns_locked (conn->ibc_peer,
-                                                  conn->ibc_incarnation);
+                kibnal_close_stale_conns_locked (conn->ibc_peer,
+                                                 conn->ibc_incarnation);
 
-                write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
                 /* queue up all the receives */
-                for (i = 0; i < OPENIBNAL_RX_MSGS; i++) {
+                for (i = 0; i < IBNAL_RX_MSGS; i++) {
                         /* +1 ref for rx desc */
                         CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                                conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
@@ -1749,71 +1756,71 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status)
                                i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg,
                                conn->ibc_rxs[i].rx_vaddr);
 
-                        koibnal_post_rx (&conn->ibc_rxs[i], 0);
+                        kibnal_post_rx (&conn->ibc_rxs[i], 0);
                 }
 
-                koibnal_check_sends (conn);
+                kibnal_check_sends (conn);
                 return;
         }
 
         /* connection failed */
-        if (state == OPENIBNAL_CONN_CONNECTING) {
+        if (state == IBNAL_CONN_CONNECTING) {
                 /* schedule for connd to close */
-                koibnal_close_conn_locked (conn, status);
+                kibnal_close_conn_locked (conn, status);
         } else {
                 /* Don't have a CM comm_id; just wait for refs to drain */
-                conn->ibc_state = OPENIBNAL_CONN_ZOMBIE;
+                conn->ibc_state = IBNAL_CONN_ZOMBIE;
         } 
 
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
-        koibnal_peer_connect_failed (conn->ibc_peer, active, status);
+        kibnal_peer_connect_failed (conn->ibc_peer, active, status);
 
-        if (state != OPENIBNAL_CONN_CONNECTING) {
+        if (state != IBNAL_CONN_CONNECTING) {
                 /* drop caller's ref if we're not waiting for the
                  * IB_CM_IDLE callback */
-                koibnal_put_conn (conn);
+                kibnal_put_conn (conn);
         }
 }
 
 int
-koibnal_accept (koib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
+kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
                 ptl_nid_t nid, __u64 incarnation, int queue_depth)
 {
-        koib_conn_t   *conn = koibnal_create_conn();
-        koib_peer_t   *peer;
-        koib_peer_t   *peer2;
+        kib_conn_t    *conn = kibnal_create_conn();
+        kib_peer_t    *peer;
+        kib_peer_t    *peer2;
         unsigned long  flags;
 
         if (conn == NULL)
                 return (-ENOMEM);
 
-        if (queue_depth != OPENIBNAL_MSG_QUEUE_SIZE) {
+        if (queue_depth != IBNAL_MSG_QUEUE_SIZE) {
                 CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n",
-                       nid, queue_depth, OPENIBNAL_MSG_QUEUE_SIZE);
+                       nid, queue_depth, IBNAL_MSG_QUEUE_SIZE);
                 return (-EPROTO);
         }
         
         /* assume 'nid' is a new peer */
-        peer = koibnal_create_peer (nid);
+        peer = kibnal_create_peer (nid);
         if (peer == NULL) {
                 CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n",
                        conn, conn->ibc_state, nid, /* ibc_peer not set yet */
                        atomic_read (&conn->ibc_refcount));
                 atomic_dec (&conn->ibc_refcount);
-                koibnal_destroy_conn(conn);
+                kibnal_destroy_conn(conn);
                 return (-ENOMEM);
         }
         
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
-        peer2 = koibnal_find_peer_locked(nid);
+        peer2 = kibnal_find_peer_locked(nid);
         if (peer2 == NULL) {
                 /* peer table takes my ref on peer */
                 list_add_tail (&peer->ibp_list,
-                               koibnal_nid2peerlist(nid));
+                               kibnal_nid2peerlist(nid));
         } else {
-                koibnal_put_peer (peer);
+                kibnal_put_peer (peer);
                 peer = peer2;
         }
 
@@ -1821,20 +1828,20 @@ koibnal_accept (koib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
         atomic_inc (&peer->ibp_refcount);
         peer->ibp_connecting++;
 
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
         conn->ibc_peer = peer;
-        conn->ibc_state = OPENIBNAL_CONN_CONNECTING;
+        conn->ibc_state = IBNAL_CONN_CONNECTING;
         conn->ibc_comm_id = cid;
         conn->ibc_incarnation = incarnation;
-        conn->ibc_credits = OPENIBNAL_MSG_QUEUE_SIZE;
+        conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
 
         *connp = conn;
         return (0);
 }
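
kibnal_accept() uses the NAL's find-or-create idiom for peers: allocate a
candidate peer before taking the lock (allocation may sleep; kib_global_lock
is irq-safe), then look up under the lock and discard the candidate if
another thread got there first. Distilled, with error handling elided:

        peer = kibnal_create_peer (nid);       /* may sleep: no lock held */

        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);

        peer2 = kibnal_find_peer_locked (nid);
        if (peer2 == NULL) {
                /* won the race: the peer table takes our ref */
                list_add_tail (&peer->ibp_list, kibnal_nid2peerlist (nid));
        } else {
                /* lost the race: drop ours and use the existing peer */
                kibnal_put_peer (peer);
                peer = peer2;
        }

        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
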
 
 tTS_IB_CM_CALLBACK_RETURN
-koibnal_idle_conn_callback (tTS_IB_CM_EVENT event,
+kibnal_idle_conn_callback (tTS_IB_CM_EVENT event,
                             tTS_IB_CM_COMM_ID cid,
                             void *param,
                             void *arg)
@@ -1846,13 +1853,19 @@ koibnal_idle_conn_callback (tTS_IB_CM_EVENT event,
 }
 
 tTS_IB_CM_CALLBACK_RETURN
-koibnal_conn_callback (tTS_IB_CM_EVENT event,
+kibnal_conn_callback (tTS_IB_CM_EVENT event,
                        tTS_IB_CM_COMM_ID cid,
                        void *param,
                        void *arg)
 {
-        koib_conn_t *conn = arg;
-        int          rc;
+        kib_conn_t       *conn = arg;
+        LIST_HEAD        (zombies); 
+        struct list_head *tmp;
+        struct list_head *nxt;
+        kib_tx_t         *tx;
+        unsigned long     flags;
+        int               done;
+        int               rc;
 
         /* Established Connection Notifier */
 
@@ -1860,24 +1873,72 @@ koibnal_conn_callback (tTS_IB_CM_EVENT event,
         default:
                 CERROR("Connection %p -> "LPX64" ERROR %d\n",
                        conn, conn->ibc_peer->ibp_nid, event);
-                koibnal_close_conn (conn, -ECONNABORTED);
+                kibnal_close_conn (conn, -ECONNABORTED);
                 break;
                 
         case TS_IB_CM_DISCONNECTED:
                 CDEBUG(D_WARNING, "Connection %p -> "LPX64" DISCONNECTED.\n",
                        conn, conn->ibc_peer->ibp_nid);
-                koibnal_close_conn (conn, 0);
+                kibnal_close_conn (conn, 0);
                 break;
 
         case TS_IB_CM_IDLE:
                 CDEBUG(D_NET, "Connection %p -> "LPX64" IDLE.\n",
                        conn, conn->ibc_peer->ibp_nid);
-                koibnal_put_conn (conn);        /* Lose CM's ref */
+                kibnal_put_conn (conn);        /* Lose CM's ref */
 
                 /* LASSERT (no further callbacks) */
                 rc = tsIbCmCallbackModify(cid, 
-                                          koibnal_idle_conn_callback, conn);
+                                          kibnal_idle_conn_callback, conn);
                 LASSERT (rc == 0);
+
+                /* NB we wait until the connection has closed before
+                 * completing outstanding passive RDMAs so we can be sure
+                 * the network can't touch the mapped memory any more. */
+
+                spin_lock_irqsave (&conn->ibc_lock, flags);
+
+                /* grab passive RDMAs not waiting for the tx callback */
+                list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
+                        tx = list_entry (tmp, kib_tx_t, tx_list);
+
+                        LASSERT (tx->tx_passive_rdma ||
+                                 !tx->tx_passive_rdma_wait);
+
+                        LASSERT (tx->tx_passive_rdma_wait ||
+                                 tx->tx_sending != 0);
+
+                        /* still waiting for tx callback? */
+                        if (!tx->tx_passive_rdma_wait)
+                                continue;
+
+                        tx->tx_status = -ECONNABORTED;
+                        tx->tx_passive_rdma_wait = 0;
+                        done = (tx->tx_sending == 0);
+
+                        if (!done)
+                                continue;
+
+                        list_del (&tx->tx_list);
+                        list_add (&tx->tx_list, &zombies);
+                }
+
+                /* grab all blocked transmits */
+                list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
+                        tx = list_entry (tmp, kib_tx_t, tx_list);
+                        
+                        list_del (&tx->tx_list);
+                        list_add (&tx->tx_list, &zombies);
+                }
+                
+                spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+                while (!list_empty(&zombies)) {
+                        tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+                        list_del(&tx->tx_list);
+                        kibnal_tx_done (tx);
+                }
                 break;
         }
 
@@ -1885,12 +1946,12 @@ koibnal_conn_callback (tTS_IB_CM_EVENT event,
 }
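
The TS_IB_CM_IDLE arm added above follows the same zombie-list discipline as
kibnal_peer_connect_failed(): kibnal_tx_done() must not be called under
ibc_lock, so txs to be completed are first parked on a private list while the
lock is held and only finalized after it is dropped. The skeleton of the
pattern:

        LIST_HEAD        (zombies);
        kib_tx_t         *tx;
        unsigned long     flags;

        spin_lock_irqsave (&conn->ibc_lock, flags);
        while (!list_empty (&conn->ibc_tx_queue)) {
                tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
                list_del (&tx->tx_list);
                list_add_tail (&tx->tx_list, &zombies);  /* park, don't complete */
        }
        spin_unlock_irqrestore (&conn->ibc_lock, flags);

        while (!list_empty (&zombies)) {        /* lock dropped: safe to finalize */
                tx = list_entry (zombies.next, kib_tx_t, tx_list);
                list_del (&tx->tx_list);
                kibnal_tx_done (tx);
        }
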
 
 tTS_IB_CM_CALLBACK_RETURN
-koibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
+kibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
                                tTS_IB_CM_COMM_ID cid,
                                void *param,
                                void *arg)
 {
-        koib_conn_t *conn = arg;
+        kib_conn_t *conn = arg;
         int          rc;
         
         switch (event) {
@@ -1903,12 +1964,12 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
                 
                 CERROR ("Unexpected event %p -> "LPX64": %d\n", 
                         conn, conn->ibc_peer->ibp_nid, event);
-                koibnal_connreq_done (conn, 0, -ECONNABORTED);
+                kibnal_connreq_done (conn, 0, -ECONNABORTED);
                 break;
                 
         case TS_IB_CM_REQ_RECEIVED: {
                 struct ib_cm_req_received_param *req = param;
-                koib_wire_connreq_t             *wcr = req->remote_private_data;
+                kib_wire_connreq_t             *wcr = req->remote_private_data;
 
                 LASSERT (conn == NULL);
 
@@ -1920,23 +1981,23 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
                         return TS_IB_CM_CALLBACK_ABORT;
                 }
 
-                if (wcr->wcr_magic != cpu_to_le32(OPENIBNAL_MSG_MAGIC)) {
+                if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
                         CERROR ("Can't accept LID %04x: bad magic %08x\n",
                                 req->dlid, le32_to_cpu(wcr->wcr_magic));
                         return TS_IB_CM_CALLBACK_ABORT;
                 }
                 
-                if (wcr->wcr_version != cpu_to_le16(OPENIBNAL_MSG_VERSION)) {
+                if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
                         CERROR ("Can't accept LID %04x: bad version %d\n",
                                 req->dlid, le16_to_cpu(wcr->wcr_version));
                         return TS_IB_CM_CALLBACK_ABORT;
                 }
                                 
-                rc = koibnal_accept(&conn,
-                                    cid,
-                                    le64_to_cpu(wcr->wcr_nid),
-                                    le64_to_cpu(wcr->wcr_incarnation),
-                                    le16_to_cpu(wcr->wcr_queue_depth));
+                rc = kibnal_accept(&conn,
+                                   cid,
+                                   le64_to_cpu(wcr->wcr_nid),
+                                   le64_to_cpu(wcr->wcr_incarnation),
+                                   le16_to_cpu(wcr->wcr_queue_depth));
                 if (rc != 0) {
                         CERROR ("Can't accept "LPX64": %d\n",
                                 le64_to_cpu(wcr->wcr_nid), rc);
@@ -1945,23 +2006,23 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
 
                 /* update 'arg' for next callback */
                 rc = tsIbCmCallbackModify(cid, 
-                                          koibnal_passive_conn_callback, conn);
+                                          kibnal_passive_conn_callback, conn);
                 LASSERT (rc == 0);
 
                 req->accept_param.qp                     = conn->ibc_qp;
-                *((koib_wire_connreq_t *)req->accept_param.reply_private_data)
-                        = (koib_wire_connreq_t) {
-                                .wcr_magic       = cpu_to_le32(OPENIBNAL_MSG_MAGIC),
-                                .wcr_version     = cpu_to_le16(OPENIBNAL_MSG_VERSION),
-                                .wcr_queue_depth = cpu_to_le32(OPENIBNAL_MSG_QUEUE_SIZE),
-                                .wcr_nid         = cpu_to_le64(koibnal_data.koib_nid),
-                                .wcr_incarnation = cpu_to_le64(koibnal_data.koib_incarnation),
+                *((kib_wire_connreq_t *)req->accept_param.reply_private_data)
+                        = (kib_wire_connreq_t) {
+                                .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
+                                .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
+                                .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
+                                .wcr_nid         = cpu_to_le64(kibnal_data.kib_nid),
+                                .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
                         };
-                req->accept_param.reply_private_data_len = sizeof(koib_wire_connreq_t);
-                req->accept_param.responder_resources    = OPENIBNAL_RESPONDER_RESOURCES;
-                req->accept_param.initiator_depth        = OPENIBNAL_RESPONDER_RESOURCES;
-                req->accept_param.rnr_retry_count        = OPENIBNAL_RNR_RETRY;
-                req->accept_param.flow_control           = OPENIBNAL_FLOW_CONTROL;
+                req->accept_param.reply_private_data_len = sizeof(kib_wire_connreq_t);
+                req->accept_param.responder_resources    = IBNAL_RESPONDER_RESOURCES;
+                req->accept_param.initiator_depth        = IBNAL_RESPONDER_RESOURCES;
+                req->accept_param.rnr_retry_count        = IBNAL_RNR_RETRY;
+                req->accept_param.flow_control           = IBNAL_FLOW_CONTROL;
 
                 CDEBUG(D_NET, "Proceeding\n");
                 break;
@@ -1972,60 +2033,60 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
                 CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n",
                        conn, conn->ibc_peer->ibp_nid);
 
-                koibnal_connreq_done (conn, 0, 0);
+                kibnal_connreq_done (conn, 0, 0);
                 break;
         }
 
-        /* NB if the connreq is done, we switch to koibnal_conn_callback */
+        /* NB if the connreq is done, we switch to kibnal_conn_callback */
         return TS_IB_CM_CALLBACK_PROCEED;
 }
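
For reference, the connection-request private data validated above implies
the following wire layout; the field widths follow from the cpu_to_le32/
le16/le64 conversions used in these callbacks, though the authoritative
kib_wire_connreq_t definition is in the NAL header, not this hunk:

        typedef struct {                  /* inferred layout; little-endian */
                __u32   wcr_magic;        /* protocol magic */
                __u16   wcr_version;      /* protocol version */
                __u16   wcr_queue_depth;  /* sender's message queue size */
                __u64   wcr_nid;          /* sender's NID */
                __u64   wcr_incarnation;  /* sender's incarnation stamp */
        } kib_wire_connreq_t;
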
 
 tTS_IB_CM_CALLBACK_RETURN
-koibnal_active_conn_callback (tTS_IB_CM_EVENT event,
+kibnal_active_conn_callback (tTS_IB_CM_EVENT event,
                               tTS_IB_CM_COMM_ID cid,
                               void *param,
                               void *arg)
 {
-        koib_conn_t *conn = arg;
+        kib_conn_t *conn = arg;
 
         switch (event) {
         case TS_IB_CM_REP_RECEIVED: {
                 struct ib_cm_rep_received_param *rep = param;
-                koib_wire_connreq_t             *wcr = rep->remote_private_data;
+                kib_wire_connreq_t             *wcr = rep->remote_private_data;
 
                 if (rep->remote_private_data_len < sizeof (*wcr)) {
                         CERROR ("Short reply from "LPX64": %d\n",
                                 conn->ibc_peer->ibp_nid,
                                 rep->remote_private_data_len);
-                        koibnal_connreq_done (conn, 1, -EPROTO);
+                        kibnal_connreq_done (conn, 1, -EPROTO);
                         break;
                 }
 
-                if (wcr->wcr_magic != cpu_to_le32(OPENIBNAL_MSG_MAGIC)) {
+                if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
                         CERROR ("Can't connect "LPX64": bad magic %08x\n",
                                 conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic));
-                        koibnal_connreq_done (conn, 1, -EPROTO);
+                        kibnal_connreq_done (conn, 1, -EPROTO);
                         break;
                 }
                 
-                if (wcr->wcr_version != cpu_to_le16(OPENIBNAL_MSG_VERSION)) {
+                if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
                         CERROR ("Can't connect "LPX64": bad version %d\n",
                                 conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_version));
-                        koibnal_connreq_done (conn, 1, -EPROTO);
+                        kibnal_connreq_done (conn, 1, -EPROTO);
                         break;
                 }
                                 
-                if (wcr->wcr_queue_depth != cpu_to_le16(OPENIBNAL_MSG_QUEUE_SIZE)) {
+                if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
                         CERROR ("Can't connect "LPX64": bad queue depth %d\n",
                                 conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_queue_depth));
-                        koibnal_connreq_done (conn, 1, -EPROTO);
+                        kibnal_connreq_done (conn, 1, -EPROTO);
                         break;
                 }
                                 
                 if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) {
                         CERROR ("Unexpected NID "LPX64" from "LPX64"\n",
                                 le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid);
-                        koibnal_connreq_done (conn, 1, -EPROTO);
+                        kibnal_connreq_done (conn, 1, -EPROTO);
                         break;
                 }
 
@@ -2033,7 +2094,7 @@ koibnal_active_conn_callback (tTS_IB_CM_EVENT event,
                        conn, conn->ibc_peer->ibp_nid);
 
                 conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation);
-                conn->ibc_credits = OPENIBNAL_MSG_QUEUE_SIZE;
+                conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
                 break;
         }
 
@@ -2041,86 +2102,86 @@ koibnal_active_conn_callback (tTS_IB_CM_EVENT event,
                 CDEBUG(D_WARNING, "Connection %p -> "LPX64" Established\n",
                        conn, conn->ibc_peer->ibp_nid);
 
-                koibnal_connreq_done (conn, 1, 0);
+                kibnal_connreq_done (conn, 1, 0);
                 break;
 
         case TS_IB_CM_IDLE:
                 CERROR("Connection %p -> "LPX64" IDLE\n",
                        conn, conn->ibc_peer->ibp_nid);
                 /* Back out state change: I'm disengaged from CM */
-                conn->ibc_state = OPENIBNAL_CONN_INIT_QP;
+                conn->ibc_state = IBNAL_CONN_INIT_QP;
                 
-                koibnal_connreq_done (conn, 1, -ECONNABORTED);
+                kibnal_connreq_done (conn, 1, -ECONNABORTED);
                 break;
 
         default:
                 CERROR("Connection %p -> "LPX64" ERROR %d\n",
                        conn, conn->ibc_peer->ibp_nid, event);
-                koibnal_connreq_done (conn, 1, -ECONNABORTED);
+                kibnal_connreq_done (conn, 1, -ECONNABORTED);
                 break;
         }
 
-        /* NB if the connreq is done, we switch to koibnal_conn_callback */
+        /* NB if the connreq is done, we switch to kibnal_conn_callback */
         return TS_IB_CM_CALLBACK_PROCEED;
 }
 
 int
-koibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
+kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
                           struct ib_path_record *resp, int remaining,
                           void *arg)
 {
-        koib_conn_t *conn = arg;
+        kib_conn_t *conn = arg;
         
         if (status != 0) {
                 CERROR ("status %d\n", status);
-                koibnal_connreq_done (conn, 1, status);
+                kibnal_connreq_done (conn, 1, status);
                 goto out;
         }
 
         conn->ibc_connreq->cr_path = *resp;
 
-        conn->ibc_connreq->cr_wcr = (koib_wire_connreq_t) {
-                .wcr_magic       = cpu_to_le32(OPENIBNAL_MSG_MAGIC),
-                .wcr_version     = cpu_to_le16(OPENIBNAL_MSG_VERSION),
-                .wcr_queue_depth = cpu_to_le16(OPENIBNAL_MSG_QUEUE_SIZE),
-                .wcr_nid         = cpu_to_le64(koibnal_data.koib_nid),
-                .wcr_incarnation = cpu_to_le64(koibnal_data.koib_incarnation),
+        conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) {
+                .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
+                .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
+                .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
+                .wcr_nid         = cpu_to_le64(kibnal_data.kib_nid),
+                .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
         };
 
         conn->ibc_connreq->cr_connparam = (struct ib_cm_active_param) {
                 .qp                   = conn->ibc_qp,
                 .req_private_data     = &conn->ibc_connreq->cr_wcr,
                 .req_private_data_len = sizeof(conn->ibc_connreq->cr_wcr),
-                .responder_resources  = OPENIBNAL_RESPONDER_RESOURCES,
-                .initiator_depth      = OPENIBNAL_RESPONDER_RESOURCES,
-                .retry_count          = OPENIBNAL_RETRY,
-                .rnr_retry_count      = OPENIBNAL_RNR_RETRY,
-                .cm_response_timeout  = koibnal_tunables.koib_io_timeout,
-                .max_cm_retries       = OPENIBNAL_CM_RETRY,
-                .flow_control         = OPENIBNAL_FLOW_CONTROL,
+                .responder_resources  = IBNAL_RESPONDER_RESOURCES,
+                .initiator_depth      = IBNAL_RESPONDER_RESOURCES,
+                .retry_count          = IBNAL_RETRY,
+                .rnr_retry_count      = IBNAL_RNR_RETRY,
+                .cm_response_timeout  = kibnal_tunables.kib_io_timeout,
+                .max_cm_retries       = IBNAL_CM_RETRY,
+                .flow_control         = IBNAL_FLOW_CONTROL,
         };
 
         /* XXX set timeout just like SDP!!! */
         conn->ibc_connreq->cr_path.packet_life = 13;
         
         /* Flag I'm getting involved with the CM... */
-        conn->ibc_state = OPENIBNAL_CONN_CONNECTING;
+        conn->ibc_state = IBNAL_CONN_CONNECTING;
 
         CDEBUG(D_NET, "Connecting to service id "LPX64", on "LPX64"\n",
                conn->ibc_connreq->cr_service.service_id, 
-               *koibnal_service_nid_field(&conn->ibc_connreq->cr_service));
+               *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
 
-        /* koibnal_connect_callback gets my conn ref */
+        /* kibnal_connect_callback gets my conn ref */
         status = ib_cm_connect (&conn->ibc_connreq->cr_connparam, 
                                 &conn->ibc_connreq->cr_path, NULL,
                                 conn->ibc_connreq->cr_service.service_id, 0,
-                                koibnal_active_conn_callback, conn,
+                                kibnal_active_conn_callback, conn,
                                 &conn->ibc_comm_id);
         if (status != 0) {
                 CERROR ("Connect: %d\n", status);
                 /* Back out state change: I've not got a CM comm_id yet... */
-                conn->ibc_state = OPENIBNAL_CONN_INIT_QP;
-                koibnal_connreq_done (conn, 1, status);
+                conn->ibc_state = IBNAL_CONN_INIT_QP;
+                kibnal_connreq_done (conn, 1, status);
         }
         
  out:
@@ -2129,58 +2190,58 @@ koibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
 }
 
 void
-koibnal_service_get_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
-                              struct ib_common_attrib_service *resp, void *arg)
+kibnal_service_get_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
+                             struct ib_common_attrib_service *resp, void *arg)
 {
-        koib_conn_t *conn = arg;
+        kib_conn_t *conn = arg;
         
         if (status != 0) {
                 CERROR ("status %d\n", status);
-                koibnal_connreq_done (conn, 1, status);
+                kibnal_connreq_done (conn, 1, status);
                 return;
         }
 
         CDEBUG(D_NET, "Got status %d, service id "LPX64", on "LPX64"\n",
                status, resp->service_id, 
-               *koibnal_service_nid_field(resp));
+               *kibnal_service_nid_field(resp));
 
         conn->ibc_connreq->cr_service = *resp;
 
-        status = ib_cached_gid_get(koibnal_data.koib_device,
-                                   koibnal_data.koib_port, 0,
+        status = ib_cached_gid_get(kibnal_data.kib_device,
+                                   kibnal_data.kib_port, 0,
                                    conn->ibc_connreq->cr_gid);
         LASSERT (status == 0);
 
-        /* koibnal_pathreq_callback gets my conn ref */
-        status = tsIbPathRecordRequest (koibnal_data.koib_device,
-                                        koibnal_data.koib_port,
+        /* kibnal_pathreq_callback gets my conn ref */
+        status = tsIbPathRecordRequest (kibnal_data.kib_device,
+                                        kibnal_data.kib_port,
                                         conn->ibc_connreq->cr_gid,
                                         conn->ibc_connreq->cr_service.service_gid,
                                         conn->ibc_connreq->cr_service.service_pkey,
                                         0,
-                                        koibnal_tunables.koib_io_timeout * HZ,
+                                        kibnal_tunables.kib_io_timeout * HZ,
                                         0,
-                                        koibnal_pathreq_callback, conn, 
+                                        kibnal_pathreq_callback, conn, 
                                         &conn->ibc_connreq->cr_tid);
 
         if (status == 0)
                 return;
 
         CERROR ("Path record request: %d\n", status);
-        koibnal_connreq_done (conn, 1, status);
+        kibnal_connreq_done (conn, 1, status);
 }
 
 void
-koibnal_connect_peer (koib_peer_t *peer)
+kibnal_connect_peer (kib_peer_t *peer)
 {
-        koib_conn_t *conn = koibnal_create_conn();
+        kib_conn_t  *conn = kibnal_create_conn();
         int          rc;
 
         LASSERT (peer->ibp_connecting != 0);
 
         if (conn == NULL) {
                 CERROR ("Can't allocate conn\n");
-                koibnal_peer_connect_failed (peer, 1, -ENOMEM);
+                kibnal_peer_connect_failed (peer, 1, -ENOMEM);
                 return;
         }
 
@@ -2190,85 +2251,101 @@ koibnal_connect_peer (koib_peer_t *peer)
         PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
         if (conn->ibc_connreq == NULL) {
                 CERROR ("Can't allocate connreq\n");
-                koibnal_connreq_done (conn, 1, -ENOMEM);
+                kibnal_connreq_done (conn, 1, -ENOMEM);
                 return;
         }
 
         memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq));
 
-        koibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);
+        kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);
 
-        /* koibnal_service_get_callback gets my conn ref */
-        rc = ib_service_get (koibnal_data.koib_device, 
-                             koibnal_data.koib_port,
+        /* kibnal_service_get_callback gets my conn ref */
+        rc = ib_service_get (kibnal_data.kib_device, 
+                             kibnal_data.kib_port,
                              &conn->ibc_connreq->cr_service,
-                             KOIBNAL_SERVICE_KEY_MASK,
-                             koibnal_tunables.koib_io_timeout * HZ,
-                             koibnal_service_get_callback, conn, 
+                             KIBNAL_SERVICE_KEY_MASK,
+                             kibnal_tunables.kib_io_timeout * HZ,
+                             kibnal_service_get_callback, conn, 
                              &conn->ibc_connreq->cr_tid);
         
         if (rc == 0)
                 return;
 
         CERROR ("ib_service_get: %d\n", rc);
-        koibnal_connreq_done (conn, 1, rc);
+        kibnal_connreq_done (conn, 1, rc);
 }
 
 int
-koibnal_conn_timed_out (koib_conn_t *conn)
+kibnal_conn_timed_out (kib_conn_t *conn)
 {
-        koib_tx_t         *tx;
+        kib_tx_t          *tx;
         struct list_head  *ttmp;
         unsigned long      flags;
-        int                rc = 0;
 
         spin_lock_irqsave (&conn->ibc_lock, flags);
 
-        list_for_each (ttmp, &conn->ibc_rdma_queue) {
-                tx = list_entry (ttmp, koib_tx_t, tx_list);
+        list_for_each (ttmp, &conn->ibc_tx_queue) {
+                tx = list_entry (ttmp, kib_tx_t, tx_list);
 
-                LASSERT (tx->tx_passive_rdma);
-                LASSERT (tx->tx_passive_rdma_wait);
+                LASSERT (!tx->tx_passive_rdma_wait);
+                LASSERT (tx->tx_sending == 0);
 
-                if (time_after_eq (jiffies, tx->tx_passive_rdma_deadline)) {
-                        rc = 1;
-                        break;
+                if (time_after_eq (jiffies, tx->tx_deadline)) {
+                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        return 1;
                 }
         }
+
+        list_for_each (ttmp, &conn->ibc_active_txs) {
+                tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+                LASSERT (tx->tx_passive_rdma ||
+                         !tx->tx_passive_rdma_wait);
+
+                LASSERT (tx->tx_passive_rdma_wait ||
+                         tx->tx_sending != 0);
+
+                if (time_after_eq (jiffies, tx->tx_deadline)) {
+                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        return 1;
+                }
+        }
+
         spin_unlock_irqrestore (&conn->ibc_lock, flags);
 
-        return rc;
+        return 0;
 }
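
kibnal_conn_timed_out() now checks a single tx_deadline on both queued and
active txs, replacing the old passive-RDMA-only ibc_rdma_queue scan. The
deadline is presumably stamped when the tx is queued, along the lines of:

        /* assumed stamping site, e.g. in kibnal_queue_tx_locked() */
        tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
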
 
 void
-koibnal_check_conns (int idx)
+kibnal_check_conns (int idx)
 {
-        struct list_head  *peers = &koibnal_data.koib_peers[idx];
+        struct list_head  *peers = &kibnal_data.kib_peers[idx];
         struct list_head  *ptmp;
-        koib_peer_t       *peer;
-        koib_conn_t       *conn;
+        kib_peer_t        *peer;
+        kib_conn_t        *conn;
         struct list_head  *ctmp;
 
  again:
         /* NB. We expect to have a look at all the peers and not find any
          * rdmas to time out, so we just use a shared lock while we
          * take a look... */
-        read_lock (&koibnal_data.koib_global_lock);
+        read_lock (&kibnal_data.kib_global_lock);
 
         list_for_each (ptmp, peers) {
-                peer = list_entry (ptmp, koib_peer_t, ibp_list);
+                peer = list_entry (ptmp, kib_peer_t, ibp_list);
 
                 list_for_each (ctmp, &peer->ibp_conns) {
-                        conn = list_entry (ctmp, koib_conn_t, ibc_list);
+                        conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+                        LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
 
-                        LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED);
 
                         /* In case we have enough credits to return via a
                          * NOOP, but there were no non-blocking tx descs
                          * free to do it last time... */
-                        koibnal_check_sends(conn);
+                        kibnal_check_sends(conn);
 
-                        if (!koibnal_conn_timed_out(conn))
+                        if (!kibnal_conn_timed_out(conn))
                                 continue;
                         
                         CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
@@ -2276,108 +2353,76 @@ koibnal_check_conns (int idx)
                                atomic_read (&conn->ibc_refcount));
 
                         atomic_inc (&conn->ibc_refcount);
-                        read_unlock (&koibnal_data.koib_global_lock);
+                        read_unlock (&kibnal_data.kib_global_lock);
 
                         CERROR("Timed out RDMA with "LPX64"\n",
                                peer->ibp_nid);
 
-                        koibnal_close_conn (conn, -ETIMEDOUT);
-                        koibnal_put_conn (conn);
+                        kibnal_close_conn (conn, -ETIMEDOUT);
+                        kibnal_put_conn (conn);
 
                         /* start again now I've dropped the lock */
                         goto again;
                 }
         }
 
-        read_unlock (&koibnal_data.koib_global_lock);
+        read_unlock (&kibnal_data.kib_global_lock);
 }
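
Note the locking pattern in kibnal_check_conns(): the shared lock only
keeps the conn on its hash chain, so before dropping it for the blocking
close the code pins the conn with a reference, then restarts the scan from
the top.  A sketch of the idiom, all names hypothetical:

    struct obj {
            struct list_head obj_list;
            atomic_t         obj_refcount;
    };

    static struct list_head table;       /* assume initialised */
    static rwlock_t         table_lock;  /* assume initialised */

    static void check_table(void)
    {
            struct obj *o;
     again:
            read_lock(&table_lock);
            list_for_each_entry(o, &table, obj_list) {
                    if (!obj_needs_work(o))         /* hypothetical test */
                            continue;

                    atomic_inc(&o->obj_refcount);   /* pin before unlock */
                    read_unlock(&table_lock);

                    obj_do_blocking_work(o);        /* may sleep */
                    obj_put(o);                     /* drop pin; may free */
                    goto again;                     /* list may have changed */
            }
            read_unlock(&table_lock);
    }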
 
 void
-koibnal_terminate_conn (koib_conn_t *conn)
+kibnal_terminate_conn (kib_conn_t *conn)
 {
-        unsigned long flags;
         int           rc;
-        int           done;
 
         CDEBUG(D_NET, "conn %p\n", conn);
-        LASSERT (conn->ibc_state == OPENIBNAL_CONN_DEATHROW);
-        conn->ibc_state = OPENIBNAL_CONN_ZOMBIE;
+        LASSERT (conn->ibc_state == IBNAL_CONN_DEATHROW);
+        conn->ibc_state = IBNAL_CONN_ZOMBIE;
 
         rc = ib_cm_disconnect (conn->ibc_comm_id);
         if (rc != 0)
                 CERROR ("Error %d disconnecting conn %p -> "LPX64"\n",
                         rc, conn, conn->ibc_peer->ibp_nid);
-
-        /* complete blocked passive RDMAs */
-        spin_lock_irqsave (&conn->ibc_lock, flags);
-        
-        while (!list_empty (&conn->ibc_rdma_queue)) {
-                koib_tx_t *tx = list_entry (conn->ibc_rdma_queue.next,
-                                            koib_tx_t, tx_list);
-
-                LASSERT (tx->tx_passive_rdma);
-                LASSERT (tx->tx_passive_rdma_wait);
-                
-                list_del (&tx->tx_list);
-
-                tx->tx_passive_rdma_wait = 0;
-                done = (tx->tx_sending == 0);
-                
-                tx->tx_status = -ECONNABORTED;
-
-                spin_unlock_irqrestore (&conn->ibc_lock, flags);
-
-                if (done)
-                        koibnal_tx_done (tx);
-
-                spin_lock_irqsave (&conn->ibc_lock, flags);
-        }
-        
-        spin_unlock_irqrestore (&conn->ibc_lock, flags);
-
-        /* Complete all blocked transmits */
-        koibnal_check_sends(conn);
 }
 
 int
-koibnal_connd (void *arg)
+kibnal_connd (void *arg)
 {
         wait_queue_t       wait;
         unsigned long      flags;
-        koib_conn_t       *conn;
-        koib_peer_t       *peer;
+        kib_conn_t        *conn;
+        kib_peer_t        *peer;
         int                timeout;
         int                i;
         int                peer_index = 0;
         unsigned long      deadline = jiffies;
         
-        kportal_daemonize ("koibnal_connd");
+        kportal_daemonize ("kibnal_connd");
         kportal_blockallsigs ();
 
         init_waitqueue_entry (&wait, current);
 
-        spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
 
         for (;;) {
-                if (!list_empty (&koibnal_data.koib_connd_conns)) {
-                        conn = list_entry (koibnal_data.koib_connd_conns.next,
-                                           koib_conn_t, ibc_list);
+                if (!list_empty (&kibnal_data.kib_connd_conns)) {
+                        conn = list_entry (kibnal_data.kib_connd_conns.next,
+                                           kib_conn_t, ibc_list);
                         list_del (&conn->ibc_list);
                         
-                        spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 
                         switch (conn->ibc_state) {
-                        case OPENIBNAL_CONN_DEATHROW:
+                        case IBNAL_CONN_DEATHROW:
                                 LASSERT (conn->ibc_comm_id != TS_IB_CM_COMM_ID_INVALID);
                                 /* Disconnect: conn becomes a zombie in the
                                  * callback and last ref reschedules it
                                  * here... */
-                                koibnal_terminate_conn(conn);
-                                koibnal_put_conn (conn);
+                                kibnal_terminate_conn(conn);
+                                kibnal_put_conn (conn);
                                 break;
                                 
-                        case OPENIBNAL_CONN_ZOMBIE:
-                                koibnal_destroy_conn (conn);
+                        case IBNAL_CONN_ZOMBIE:
+                                kibnal_destroy_conn (conn);
                                 break;
                                 
                         default:
@@ -2386,35 +2431,35 @@ koibnal_connd (void *arg)
                                 LBUG();
                         }
 
-                        spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
                         continue;
                 }
 
-                if (!list_empty (&koibnal_data.koib_connd_peers)) {
-                        peer = list_entry (koibnal_data.koib_connd_peers.next,
-                                           koib_peer_t, ibp_connd_list);
+                if (!list_empty (&kibnal_data.kib_connd_peers)) {
+                        peer = list_entry (kibnal_data.kib_connd_peers.next,
+                                           kib_peer_t, ibp_connd_list);
                         
                         list_del_init (&peer->ibp_connd_list);
-                        spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 
-                        koibnal_connect_peer (peer);
-                        koibnal_put_peer (peer);
+                        kibnal_connect_peer (peer);
+                        kibnal_put_peer (peer);
 
-                        spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
                 }
 
                 /* shut down and nobody left to reap... */
-                if (koibnal_data.koib_shutdown &&
-                    atomic_read(&koibnal_data.koib_nconns) == 0)
+                if (kibnal_data.kib_shutdown &&
+                    atomic_read(&kibnal_data.kib_nconns) == 0)
                         break;
 
-                spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+                spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 
                 /* careful with the jiffy wrap... */
                 while ((timeout = (int)(deadline - jiffies)) <= 0) {
                         const int n = 4;
                         const int p = 1;
-                        int       chunk = koibnal_data.koib_peer_hash_size;
+                        int       chunk = kibnal_data.kib_peer_hash_size;
                         
                         /* Time to check for RDMA timeouts on a few more
                          * peers: I do checks every 'p' seconds on a
@@ -2424,129 +2469,129 @@ koibnal_connd (void *arg)
                          * connection within (n+1)/n times the timeout
                          * interval. */
 
-                        if (koibnal_tunables.koib_io_timeout > n * p)
+                        if (kibnal_tunables.kib_io_timeout > n * p)
                                 chunk = (chunk * n * p) / 
-                                        koibnal_tunables.koib_io_timeout;
+                                        kibnal_tunables.kib_io_timeout;
                         if (chunk == 0)
                                 chunk = 1;
 
                         for (i = 0; i < chunk; i++) {
-                                koibnal_check_conns (peer_index);
+                                kibnal_check_conns (peer_index);
                                 peer_index = (peer_index + 1) % 
-                                             koibnal_data.koib_peer_hash_size;
+                                             kibnal_data.kib_peer_hash_size;
                         }
 
                         deadline += p * HZ;
                 }
 
-                koibnal_data.koib_connd_waketime = jiffies + timeout;
+                kibnal_data.kib_connd_waketime = jiffies + timeout;
 
                 set_current_state (TASK_INTERRUPTIBLE);
-                add_wait_queue (&koibnal_data.koib_connd_waitq, &wait);
+                add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
 
-                if (!koibnal_data.koib_shutdown &&
-                    list_empty (&koibnal_data.koib_connd_conns) &&
-                    list_empty (&koibnal_data.koib_connd_peers))
+                if (!kibnal_data.kib_shutdown &&
+                    list_empty (&kibnal_data.kib_connd_conns) &&
+                    list_empty (&kibnal_data.kib_connd_peers))
                         schedule_timeout (timeout);
 
                 set_current_state (TASK_RUNNING);
-                remove_wait_queue (&koibnal_data.koib_connd_waitq, &wait);
+                remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
 
-                spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+                spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
         }
 
-        spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 
-        koibnal_thread_fini ();
+        kibnal_thread_fini ();
         return (0);
 }
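
To make the chunking arithmetic above concrete (all numbers hypothetical):
with n = 4, p = 1, a 101-bucket peer hash and kib_io_timeout = 50 seconds,
chunk = (101 * 4 * 1) / 50 = 8, so kibnal_connd scans 8 buckets per second,
sweeps the whole table about every timeout/n = 12.5 seconds, and so notices
a timed-out RDMA no later than (n + 1)/n = 1.25 timeout intervals after it
was posted.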
 
 int
-koibnal_scheduler(void *arg)
+kibnal_scheduler(void *arg)
 {
         long            id = (long)arg;
         char            name[16];
-        koib_rx_t      *rx;
-        koib_tx_t      *tx;
+        kib_rx_t       *rx;
+        kib_tx_t       *tx;
         unsigned long   flags;
         int             rc;
         int             counter = 0;
         int             did_something;
 
-        snprintf(name, sizeof(name), "koibnal_sd_%02ld", id);
+        snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
         kportal_daemonize(name);
         kportal_blockallsigs();
 
-        spin_lock_irqsave(&koibnal_data.koib_sched_lock, flags);
+        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
 
         for (;;) {
                 did_something = 0;
 
-                while (!list_empty(&koibnal_data.koib_sched_txq)) {
-                        tx = list_entry(koibnal_data.koib_sched_txq.next,
-                                        koib_tx_t, tx_list);
+                while (!list_empty(&kibnal_data.kib_sched_txq)) {
+                        tx = list_entry(kibnal_data.kib_sched_txq.next,
+                                        kib_tx_t, tx_list);
                         list_del(&tx->tx_list);
-                        spin_unlock_irqrestore(&koibnal_data.koib_sched_lock,
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                                flags);
-                        koibnal_tx_done(tx);
+                        kibnal_tx_done(tx);
 
-                        spin_lock_irqsave(&koibnal_data.koib_sched_lock,
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
                                           flags);
                 }
 
-                if (!list_empty(&koibnal_data.koib_sched_rxq)) {
-                        rx = list_entry(koibnal_data.koib_sched_rxq.next,
-                                        koib_rx_t, rx_list);
+                if (!list_empty(&kibnal_data.kib_sched_rxq)) {
+                        rx = list_entry(kibnal_data.kib_sched_rxq.next,
+                                        kib_rx_t, rx_list);
                         list_del(&rx->rx_list);
-                        spin_unlock_irqrestore(&koibnal_data.koib_sched_lock,
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                                flags);
 
-                        koibnal_rx(rx);
+                        kibnal_rx(rx);
 
                         did_something = 1;
-                        spin_lock_irqsave(&koibnal_data.koib_sched_lock,
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
                                           flags);
                 }
 
                 /* shut down and no receives to complete... */
-                if (koibnal_data.koib_shutdown &&
-                    atomic_read(&koibnal_data.koib_nconns) == 0)
+                if (kibnal_data.kib_shutdown &&
+                    atomic_read(&kibnal_data.kib_nconns) == 0)
                         break;
 
                 /* nothing to do or hogging CPU */
-                if (!did_something || counter++ == OPENIBNAL_RESCHED) {
-                        spin_unlock_irqrestore(&koibnal_data.koib_sched_lock,
+                if (!did_something || counter++ == IBNAL_RESCHED) {
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                                flags);
                         counter = 0;
 
                         if (!did_something) {
                                 rc = wait_event_interruptible(
-                                        koibnal_data.koib_sched_waitq,
-                                        !list_empty(&koibnal_data.koib_sched_txq) || 
-                                        !list_empty(&koibnal_data.koib_sched_rxq) || 
-                                        (koibnal_data.koib_shutdown &&
-                                         atomic_read (&koibnal_data.koib_nconns) == 0));
+                                        kibnal_data.kib_sched_waitq,
+                                        !list_empty(&kibnal_data.kib_sched_txq) || 
+                                        !list_empty(&kibnal_data.kib_sched_rxq) || 
+                                        (kibnal_data.kib_shutdown &&
+                                         atomic_read (&kibnal_data.kib_nconns) == 0));
                         } else {
                                 our_cond_resched();
                         }
 
-                        spin_lock_irqsave(&koibnal_data.koib_sched_lock,
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
                                           flags);
                 }
         }
 
-        spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags);
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
 
-        koibnal_thread_fini();
+        kibnal_thread_fini();
         return (0);
 }
 
 
-lib_nal_t koibnal_lib = {
-        libnal_data:        &koibnal_data,      /* NAL private data */
-        libnal_send:         koibnal_send,
-        libnal_send_pages:   koibnal_send_pages,
-        libnal_recv:         koibnal_recv,
-        libnal_recv_pages:   koibnal_recv_pages,
-        libnal_dist:         koibnal_dist
+lib_nal_t kibnal_lib = {
+        libnal_data:        &kibnal_data,      /* NAL private data */
+        libnal_send:         kibnal_send,
+        libnal_send_pages:   kibnal_send_pages,
+        libnal_recv:         kibnal_recv,
+        libnal_recv_pages:   kibnal_recv_pages,
+        libnal_dist:         kibnal_dist
 };
index 16123c2..5aff4e9 100644
@@ -40,10 +40,10 @@ kpr_nal_interface_t kqswnal_router_interface = {
 #define QSWNAL_SYSCTL  201
 
 #define QSWNAL_SYSCTL_OPTIMIZED_GETS     1
-#define QSWNAL_SYSCTL_COPY_SMALL_FWD     2
+#define QSWNAL_SYSCTL_OPTIMIZED_PUTS     2
 
 static ctl_table kqswnal_ctl_table[] = {
-       {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_puts",
+       {QSWNAL_SYSCTL_OPTIMIZED_PUTS, "optimized_puts",
         &kqswnal_tunables.kqn_optimized_puts, sizeof (int),
         0644, NULL, &proc_dointvec},
        {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_gets",
@@ -121,6 +121,8 @@ static void
 kqswnal_shutdown(nal_t *nal)
 {
        unsigned long flags;
+       kqswnal_tx_t *ktx;
+       kqswnal_rx_t *krx;
        int           do_lib_fini = 0;
 
        /* NB The first ref was this module! */
@@ -267,37 +269,25 @@ kqswnal_shutdown(nal_t *nal)
         * ep_dvma_release() get fixed (and releases any mappings in the
         * region), we can delete all the code from here -------->  */
 
-       if (kqswnal_data.kqn_txds != NULL) {
-               int  i;
+       for (ktx = kqswnal_data.kqn_txds; ktx != NULL; ktx = ktx->ktx_alloclist) {
+               /* If ktx has a buffer, it got mapped; unmap now.  NB only
+                * the pre-mapped stuff is still mapped since all tx descs
+                * must be idle */
 
-               for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++) {
-                       kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
-
-                       /* If ktx has a buffer, it got mapped; unmap now.
-                        * NB only the pre-mapped stuff is still mapped
-                        * since all tx descs must be idle */
-
-                       if (ktx->ktx_buffer != NULL)
-                               ep_dvma_unload(kqswnal_data.kqn_ep,
-                                              kqswnal_data.kqn_ep_tx_nmh,
-                                              &ktx->ktx_ebuffer);
-               }
+               if (ktx->ktx_buffer != NULL)
+                       ep_dvma_unload(kqswnal_data.kqn_ep,
+                                      kqswnal_data.kqn_ep_tx_nmh,
+                                      &ktx->ktx_ebuffer);
        }
 
-       if (kqswnal_data.kqn_rxds != NULL) {
-               int   i;
-
-               for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) {
-                       kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
-
-                       /* If krx_kiov[0].kiov_page got allocated, it got mapped.  
-                        * NB subsequent pages get merged */
+       for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
+               /* If krx_kiov[0].kiov_page got allocated, it got mapped.  
+                * NB subsequent pages get merged */
 
-                       if (krx->krx_kiov[0].kiov_page != NULL)
-                               ep_dvma_unload(kqswnal_data.kqn_ep,
-                                              kqswnal_data.kqn_ep_rx_nmh,
-                                              &krx->krx_elanbuffer);
-               }
+               if (krx->krx_kiov[0].kiov_page != NULL)
+                       ep_dvma_unload(kqswnal_data.kqn_ep,
+                                      kqswnal_data.kqn_ep_rx_nmh,
+                                      &krx->krx_elanbuffer);
        }
        /* <----------- to here */
 
@@ -330,41 +320,26 @@ kqswnal_shutdown(nal_t *nal)
        }
 #endif
 
-       if (kqswnal_data.kqn_txds != NULL)
-       {
-               int   i;
+       while (kqswnal_data.kqn_txds != NULL) {
+               ktx = kqswnal_data.kqn_txds;
 
-               for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++)
-               {
-                       kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
-
-                       if (ktx->ktx_buffer != NULL)
-                               PORTAL_FREE(ktx->ktx_buffer,
-                                           KQSW_TX_BUFFER_SIZE);
-               }
+               if (ktx->ktx_buffer != NULL)
+                       PORTAL_FREE(ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
 
-               PORTAL_FREE(kqswnal_data.kqn_txds,
-                           sizeof (kqswnal_tx_t) * (KQSW_NTXMSGS +
-                                                    KQSW_NNBLK_TXMSGS));
+               kqswnal_data.kqn_txds = ktx->ktx_alloclist;
+               PORTAL_FREE(ktx, sizeof(*ktx));
        }
 
-       if (kqswnal_data.kqn_rxds != NULL)
-       {
-               int   i;
-               int   j;
+       while (kqswnal_data.kqn_rxds != NULL) {
+               int           i;
 
-               for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
-               {
-                       kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+               krx = kqswnal_data.kqn_rxds;
+               for (i = 0; i < krx->krx_npages; i++)
+                       if (krx->krx_kiov[i].kiov_page != NULL)
+                               __free_page (krx->krx_kiov[i].kiov_page);
 
-                       for (j = 0; j < krx->krx_npages; j++)
-                               if (krx->krx_kiov[j].kiov_page != NULL)
-                                       __free_page (krx->krx_kiov[j].kiov_page);
-               }
-
-               PORTAL_FREE(kqswnal_data.kqn_rxds,
-                           sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL +
-                                                   KQSW_NRXMSGS_LARGE));
+               kqswnal_data.kqn_rxds = krx->krx_alloclist;
+               PORTAL_FREE(krx, sizeof (*krx));
        }
 
        /* resets flags, pointers to NULL etc */
@@ -388,6 +363,8 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid,
 #endif
        int               rc;
        int               i;
+       kqswnal_rx_t     *krx;
+       kqswnal_tx_t     *ktx;
        int               elan_page_idx;
        ptl_process_id_t  my_process_id;
        int               pkmem = atomic_read(&portal_kmemory);
@@ -560,23 +537,22 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid,
        /**********************************************************************/
        /* Allocate/Initialise transmit descriptors */
 
-       PORTAL_ALLOC(kqswnal_data.kqn_txds,
-                    sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
-       if (kqswnal_data.kqn_txds == NULL)
-       {
-               kqswnal_shutdown (nal);
-               return (PTL_NO_SPACE);
-       }
-
-       /* clear flags, null pointers etc */
-       memset(kqswnal_data.kqn_txds, 0,
-              sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
+       kqswnal_data.kqn_txds = NULL;
        for (i = 0; i < (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS); i++)
        {
                int           premapped_pages;
-               kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
                int           basepage = i * KQSW_NTXMSGPAGES;
 
+               PORTAL_ALLOC (ktx, sizeof(*ktx));
+               if (ktx == NULL) {
+                       kqswnal_shutdown (nal);
+                       return (PTL_NO_SPACE);
+               }
+
+               memset(ktx, 0, sizeof(*ktx));   /* NULL pointers; zero flags */
+               ktx->ktx_alloclist = kqswnal_data.kqn_txds;
+               kqswnal_data.kqn_txds = ktx;
+
                PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
                if (ktx->ktx_buffer == NULL)
                {
@@ -615,18 +591,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid,
 
        /**********************************************************************/
        /* Allocate/Initialise receive descriptors */
-
-       PORTAL_ALLOC (kqswnal_data.kqn_rxds,
-                     sizeof (kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE));
-       if (kqswnal_data.kqn_rxds == NULL)
-       {
-               kqswnal_shutdown (nal);
-               return (PTL_NO_SPACE);
-       }
-
-       memset(kqswnal_data.kqn_rxds, 0, /* clear flags, null pointers etc */
-              sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL+KQSW_NRXMSGS_LARGE));
-
+       kqswnal_data.kqn_rxds = NULL;
        elan_page_idx = 0;
        for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
        {
@@ -636,7 +601,16 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid,
                E3_Addr       elanbuffer;
 #endif
                int           j;
-               kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+               PORTAL_ALLOC(krx, sizeof(*krx));
+               if (krx == NULL) {
+                       kqswnal_shutdown(nal);
+                       return (PTL_NO_SPACE);
+               }
+
+               memset(krx, 0, sizeof(*krx)); /* clear flags, null pointers etc */
+               krx->krx_alloclist = kqswnal_data.kqn_rxds;
+               kqswnal_data.kqn_rxds = krx;
 
                if (i < KQSW_NRXMSGS_SMALL)
                {
@@ -717,10 +691,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid,
        /**********************************************************************/
        /* Queue receives, now that it's OK to run their completion callbacks */
 
-       for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
-       {
-               kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
-
+       for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
                /* NB this enqueue can allocate/sleep (attr == 0) */
                krx->krx_state = KRX_POSTED;
 #if MULTIRAIL_EKC
index 438edc6..b08d710 100644
@@ -99,10 +99,10 @@ typedef unsigned long kqsw_csum_t;
 #define KQSW_TX_MAXCONTIG               (1<<10) /* largest payload that gets made contiguous on transmit */
 
 #define KQSW_NTXMSGS                    8       /* # normal transmit messages */
-#define KQSW_NNBLK_TXMSGS               256     /* # reserved transmit messages if can't block */
+#define KQSW_NNBLK_TXMSGS               512     /* # reserved transmit messages if can't block */
 
 #define KQSW_NRXMSGS_LARGE              64      /* # large receive buffers */
-#define KQSW_EP_ENVELOPES_LARGE         128     /* # large ep envelopes */
+#define KQSW_EP_ENVELOPES_LARGE         256     /* # large ep envelopes */
 
 #define KQSW_NRXMSGS_SMALL              256     /* # small receive buffers */
 #define KQSW_EP_ENVELOPES_SMALL         2048    /* # small ep envelopes */
@@ -144,9 +144,10 @@ typedef struct
 #endif
 } kqswnal_remotemd_t;
 
-typedef struct 
+typedef struct kqswnal_rx
 {
         struct list_head krx_list;              /* enqueue -> thread */
+        struct kqswnal_rx *krx_alloclist;       /* stack in kqn_rxds */
         EP_RCVR         *krx_eprx;              /* port to post receives to */
         EP_RXD          *krx_rxd;               /* receive descriptor (for repost) */
 #if MULTIRAIL_EKC
@@ -169,10 +170,11 @@ typedef struct
 #define KRX_COMPLETING   3                      /* waiting to be completed */
 
 
-typedef struct
+typedef struct kqswnal_tx
 {
         struct list_head  ktx_list;             /* enqueue idle/active */
         struct list_head  ktx_delayed_list;     /* enqueue delayedtxds */
+        struct kqswnal_tx *ktx_alloclist;       /* stack in kqn_txds */
         unsigned int      ktx_isnblk:1;         /* reserved descriptor? */
         unsigned int      ktx_state:7;          /* What I'm doing */
         unsigned int      ktx_firsttmpfrag:1;   /* ktx_frags[0] is in my ebuffer ? 0 : 1 */
@@ -222,8 +224,8 @@ typedef struct
         char               kqn_shuttingdown;    /* I'm trying to shut down */
         atomic_t           kqn_nthreads;        /* # threads running */
 
-        kqswnal_rx_t      *kqn_rxds;            /* all the receive descriptors */
-        kqswnal_tx_t      *kqn_txds;            /* all the transmit descriptors */
+        kqswnal_rx_t      *kqn_rxds;            /* stack of all the receive descriptors */
+        kqswnal_tx_t      *kqn_txds;            /* stack of all the transmit descriptors */
 
         struct list_head   kqn_idletxds;        /* transmit descriptors free to use */
         struct list_head   kqn_nblk_idletxds;   /* reserved free transmit descriptors */
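
The new *_alloclist fields replace the old counted descriptor arrays: each
descriptor is allocated individually and pushed onto a singly-linked stack
rooted at kqn_txds/kqn_rxds, so shutdown can free exactly what startup
managed to allocate, even after a partial failure.  A minimal sketch of
the pattern, names hypothetical (the driver itself uses PORTAL_ALLOC and
PORTAL_FREE):

    struct desc {
            struct desc *d_alloclist;       /* next descriptor on stack */
            /* ... payload ... */
    };

    static struct desc *all_descs;          /* stack head, like kqn_txds */

    static int desc_alloc_one(void)
    {
            struct desc *d = kmalloc(sizeof(*d), GFP_KERNEL);

            if (d == NULL)
                    return -ENOMEM;

            memset(d, 0, sizeof(*d));
            d->d_alloclist = all_descs;     /* push */
            all_descs = d;
            return 0;
    }

    static void desc_free_all(void)
    {
            while (all_descs != NULL) {
                    struct desc *d = all_descs;

                    all_descs = d->d_alloclist;     /* pop */
                    kfree(d);
            }
    }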
index 75188e9..e77bd8e 100644
@@ -205,7 +205,7 @@ static int kscimacnal_startup(nal_t *nal, ptl_pid_t requested_pid,
         }
         kscimacnal_data.ksci_nid = (ptl_nid_t)(ntohl(mac_physaddr));
 
-        process_id.pid = requested_pid;
+        process_id.pid = 0;
         process_id.nid = kscimacnal_data.ksci_nid;
 
         CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n",
index 2a0ef11..7642770 100644
@@ -1226,9 +1226,9 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
                             conn2->ksnc_type != conn->ksnc_type ||
                             conn2->ksnc_incarnation != incarnation)
                                 continue;
-                        
+
                         CWARN("Not creating duplicate connection to "
-                              "%u.%u.%u.%u type %d\n", 
+                              "%u.%u.%u.%u type %d\n",
                               HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_type);
                         rc = -EALREADY;
                         goto failed_2;
@@ -1260,6 +1260,9 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
                 break;
         }
 
+        /* Give conn a ref on sock->file since we're going to return success */
+        get_file(sock->file);
+
         conn->ksnc_peer = peer;                 /* conn takes my ref on peer */
         conn->ksnc_incarnation = incarnation;
         peer->ksnp_last_alive = jiffies;
@@ -1311,9 +1314,9 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
                 ksocknal_putconnsock(conn);
         }
 
-        CWARN("New conn nid:"LPX64" [type:%d] %u.%u.%u.%u -> %u.%u.%u.%u/%d"
+        CWARN("New conn nid:"LPX64" %u.%u.%u.%u -> %u.%u.%u.%u/%d"
               " incarnation:"LPX64" sched[%d]/%d\n",
-              nid, conn->ksnc_type, HIPQUAD(conn->ksnc_myipaddr), 
+              nid, HIPQUAD(conn->ksnc_myipaddr), 
               HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, incarnation,
               (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq);
 
@@ -2054,8 +2057,7 @@ ksocknal_cmd(struct portals_cfg *pcfg, void * private)
                         rc = -EINVAL;
                         break;
                 }
-                if (rc != 0)
-                        fput (sock->file);
+                fput (sock->file);
                 break;
         }
         case NAL_CMD_CLOSE_CONNECTION: {
index 0a5266a..b8bbefd 100644
@@ -66,9 +66,7 @@
 #include <portals/lib-p30.h>
 #include <portals/nal.h>
 #include <portals/socknal.h>
-#include <linux/lustre_idl.h>
 
-#include <linux/lustre_idl.h>
 #define SOCKNAL_N_AUTOCONNECTD  4               /* # socknal autoconnect daemons */
 
 #define SOCKNAL_MIN_RECONNECT_INTERVAL HZ      /* first failed connection retry... */
index b22d501..762133e 100644
@@ -2324,17 +2324,34 @@ ksocknal_setup_sock (struct socket *sock)
         return (0);
 }
 
-int
-ksocknal_connect_peer (ksock_route_t *route, int type)
+static int
+ksocknal_connect_sock(struct socket **sockp, int *may_retry, 
+                      ksock_route_t *route, int local_port)
 {
-        struct sockaddr_in  ipaddr;
-        mm_segment_t        oldmm = get_fs();
-        struct timeval      tv;
-        int                 fd;
+        struct sockaddr_in  locaddr;
+        struct sockaddr_in  srvaddr;
         struct socket      *sock;
         int                 rc;
-        
+        int                 option;
+        mm_segment_t        oldmm = get_fs();
+        struct timeval      tv;
+
+        memset(&locaddr, 0, sizeof(locaddr)); 
+        locaddr.sin_family = AF_INET; 
+        locaddr.sin_port = htons(local_port);
+        locaddr.sin_addr.s_addr = 
+                (route->ksnr_myipaddr != 0) ? htonl(route->ksnr_myipaddr) 
+                                            : INADDR_ANY;
+        memset (&srvaddr, 0, sizeof (srvaddr));
+        srvaddr.sin_family = AF_INET;
+        srvaddr.sin_port = htons (route->ksnr_port);
+        srvaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr);
+
+        *may_retry = 0;
+
         rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
+        *sockp = sock;
         if (rc != 0) {
                 CERROR ("Can't create autoconnect socket: %d\n", rc);
                 return (rc);
@@ -2344,17 +2361,23 @@ ksocknal_connect_peer (ksock_route_t *route, int type)
          * from userspace.  And we actually need the sock->file refcounting
          * that this gives you :) */
 
-        fd = sock_map_fd (sock);
-        if (fd < 0) {
+        rc = sock_map_fd (sock);
+        if (rc < 0) {
                 sock_release (sock);
-                CERROR ("sock_map_fd error %d\n", fd);
-                return (fd);
+                CERROR ("sock_map_fd error %d\n", rc);
+                return (rc);
         }
 
-        /* NB the fd now owns the ref on sock->file */
+        /* NB the file descriptor (rc) now owns the ref on sock->file */
         LASSERT (sock->file != NULL);
         LASSERT (file_count(sock->file) == 1);
 
+        get_file(sock->file);                /* extra ref makes sock->file */
+        sys_close(rc);                       /* survive this close */
+
+        /* Still got a single ref on sock->file */
+        LASSERT (file_count(sock->file) == 1);
+
         /* Set the socket timeouts, so our connection attempt completes in
          * finite time */
         tv.tv_sec = ksocknal_tunables.ksnd_io_timeout;
@@ -2367,7 +2390,7 @@ ksocknal_connect_peer (ksock_route_t *route, int type)
         if (rc != 0) {
                 CERROR ("Can't set send timeout %d: %d\n", 
                         ksocknal_tunables.ksnd_io_timeout, rc);
-                goto out;
+                goto failed;
         }
         
         set_fs (KERNEL_DS);
@@ -2377,53 +2400,83 @@ ksocknal_connect_peer (ksock_route_t *route, int type)
         if (rc != 0) {
                 CERROR ("Can't set receive timeout %d: %d\n",
                         ksocknal_tunables.ksnd_io_timeout, rc);
-                goto out;
+                goto failed;
         }
 
-        if (route->ksnr_myipaddr != 0) {
-                /* Bind to the local IP address */
-                memset (&ipaddr, 0, sizeof (ipaddr));
-                ipaddr.sin_family = AF_INET;
-                ipaddr.sin_port = htons (0); /* ANY */
-                ipaddr.sin_addr.s_addr = htonl(route->ksnr_myipaddr);
+        set_fs (KERNEL_DS);
+        option = 1;
+        rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, 
+                             (char *)&option, sizeof (option)); 
+        set_fs (oldmm);
+        if (rc != 0) {
+                CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc);
+                goto failed;
+        }
 
-                rc = sock->ops->bind (sock, (struct sockaddr *)&ipaddr,
-                                      sizeof (ipaddr));
-                if (rc != 0) {
-                        CERROR ("Can't bind to local IP %u.%u.%u.%u: %d\n",
-                                HIPQUAD(route->ksnr_myipaddr), rc);
-                        goto out;
-                }
+        rc = sock->ops->bind(sock, 
+                             (struct sockaddr *)&locaddr, sizeof(locaddr));
+        if (rc == -EADDRINUSE) {
+                CDEBUG(D_NET, "Port %d already in use\n", local_port);
+                *may_retry = 1;
+                goto failed;
         }
-        
-        memset (&ipaddr, 0, sizeof (ipaddr));
-        ipaddr.sin_family = AF_INET;
-        ipaddr.sin_port = htons (route->ksnr_port);
-        ipaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr);
-        
-        rc = sock->ops->connect (sock, (struct sockaddr *)&ipaddr, 
-                                 sizeof (ipaddr), sock->file->f_flags);
         if (rc != 0) {
-                CERROR ("Can't connect to nid "LPX64
-                        " local IP: %u.%u.%u.%u,"
-                        " remote IP: %u.%u.%u.%u/%d: %d\n", 
-                        route->ksnr_peer->ksnp_nid,
-                        HIPQUAD(route->ksnr_myipaddr),
-                        HIPQUAD(route->ksnr_ipaddr),
-                        route->ksnr_port, rc);
-                goto out;
+                CERROR("Error trying to bind to reserved port %d: %d\n",
+                       local_port, rc);
+                goto failed;
         }
 
-        rc = ksocknal_create_conn (route, sock, type);
-        if (rc == 0) {
-                /* Take an extra ref on sock->file to compensate for the
-                 * upcoming close which will lose fd's ref on it. */
-                get_file (sock->file);
+        rc = sock->ops->connect(sock,
+                                (struct sockaddr *)&srvaddr, sizeof(srvaddr),
+                                sock->file->f_flags);
+        if (rc == 0)
+                return 0;
+
+        /* EADDRNOTAVAIL probably means we're already connected to the same
+         * peer/port on the same local port on a differently typed
+         * connection.  Let our caller retry with a different local
+         * port... */
+        *may_retry = (rc == -EADDRNOTAVAIL);
+
+        CDEBUG(*may_retry ? D_NET : D_ERROR,
+               "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc,
+               HIPQUAD(route->ksnr_myipaddr), local_port,
+               HIPQUAD(route->ksnr_ipaddr), route->ksnr_port);
+
+ failed:
+        fput(sock->file);
+        return rc;
+}
+
+int
+ksocknal_connect_peer (ksock_route_t *route, int type)
+{
+        struct socket      *sock;
+        int                 rc;
+        int                 port;
+        int                 may_retry;
+        
+        /* Iterate through reserved ports.  When typed connections are
+         * used we will need to bind to multiple ports, but we only know
+         * which at connect time.  By then we've already called bind(),
+         * so each new attempt needs a new socket. */
+
+        for (port = 1023; port > 512; --port) {
+
+                rc = ksocknal_connect_sock(&sock, &may_retry, route, port);
+
+                if (rc == 0) {
+                        rc = ksocknal_create_conn(route, sock, type);
+                        fput(sock->file);
+                        return rc;
+                }
+
+                if (!may_retry)
+                        return rc;
         }
 
- out:
-        sys_close (fd);
-        return (rc);
+        CERROR("Out of ports trying to bind to a reserved port\n");
+        return (-EADDRINUSE);
 }
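
Two points in the rewritten connect path deserve a note.  First, the file
reference shuffle: the descriptor returned by sock_map_fd() owns the only
reference on sock->file, so the code takes a second reference and then
closes the descriptor, leaving a single reference held directly by the
NAL with no user-visible fd for anything else to close.  Annotated:

    fd = sock_map_fd(sock);   /* fd owns the single ref on sock->file */
    get_file(sock->file);     /* refcount now 2 */
    sys_close(fd);            /* fd's ref dropped: back to 1, but held
                               * by us, with no fd exposed */

Since ksocknal_create_conn() now takes its own reference on success, every
caller (including the ioctl path in ksocknal_cmd() above) drops its own
with an unconditional fput().  Second, outgoing sockets are now bound to a
privileged local port, counting down from 1023 and retrying on
EADDRINUSE/EADDRNOTAVAIL, instead of letting the kernel pick an ephemeral
one.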
 
 void
@@ -2443,7 +2496,6 @@ ksocknal_autoconnect (ksock_route_t *route)
                 LASSERT (type < SOCKNAL_CONN_NTYPES);
 
                 rc = ksocknal_connect_peer (route, type);
-
                 if (rc != 0)
                         break;
                 
index c56f76f..f571958 100644
@@ -60,7 +60,7 @@
 #endif
 
 unsigned int portal_subsystem_debug = ~0 - (S_PORTALS | S_QSWNAL | S_SOCKNAL |
-                                            S_GMNAL | S_OPENIBNAL);
+                                            S_GMNAL | S_IBNAL);
 EXPORT_SYMBOL(portal_subsystem_debug);
 
 unsigned int portal_debug = (D_WARNING | D_DLMTRACE | D_ERROR | D_EMERG | D_HA |
@@ -97,6 +97,7 @@ int portals_do_debug_dumplog(void *arg)
 
         snprintf(debug_file_name, sizeof(debug_file_path) - 1,
                  "%s.%ld.%ld", debug_file_path, CURRENT_SECONDS, (long)arg);
+        printk(KERN_ALERT "LustreError: dumping log to %s\n", debug_file_name);
         tracefile_dump_all_pages(debug_file_name);
 
         current->journal_info = journal_info;
@@ -180,7 +181,7 @@ int portals_debug_clear_buffer(void)
 int portals_debug_mark_buffer(char *text)
 {
         CDEBUG(D_TRACE,"***************************************************\n");
-        CWARN("DEBUG MARKER: %s\n", text);
+        CDEBUG(D_WARNING, "DEBUG MARKER: %s\n", text);
         CDEBUG(D_TRACE,"***************************************************\n");
 
         return 0;
@@ -251,62 +252,46 @@ void portals_run_lbug_upcall(char *file, const char *fn, const int line)
 char *portals_nid2str(int nal, ptl_nid_t nid, char *str)
 {
         if (nid == PTL_NID_ANY) {
-                snprintf(str, PTL_NALFMT_SIZE - 1, "%s",
-                         "PTL_NID_ANY");
+                snprintf(str, PTL_NALFMT_SIZE, "%s", "PTL_NID_ANY");
                 return str;
         }
 
         switch(nal){
 /* XXX this could be a nal method of some sort, 'cept it's config
  * dependent whether (say) socknal NIDs are actually IP addresses... */
-#ifndef CRAY_PORTALS 
+#if !CRAY_PORTALS 
         case TCPNAL:
                 /* userspace NAL */
+        case IIBNAL:
         case OPENIBNAL:
         case SOCKNAL:
-                snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u.%u.%u.%u",
+                snprintf(str, PTL_NALFMT_SIZE, "%u:%u.%u.%u.%u",
                          (__u32)(nid >> 32), HIPQUAD(nid));
                 break;
         case QSWNAL:
         case GMNAL:
-                snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u",
+                snprintf(str, PTL_NALFMT_SIZE, "%u:%u",
                          (__u32)(nid >> 32), (__u32)nid);
                 break;
 #endif
         default:
-                snprintf(str, PTL_NALFMT_SIZE - 1, "?%d? %llx",
+                snprintf(str, PTL_NALFMT_SIZE, "?%x? %llx",
                          nal, (long long)nid);
                 break;
         }
         return str;
 }
-/*      bug #4615       */
+
 char *portals_id2str(int nal, ptl_process_id_t id, char *str)
 {
-        switch(nal){
-#ifndef CRAY_PORTALS
-        case TCPNAL:
-                /* userspace NAL */
-        case OPENIBNAL:
-        case SOCKNAL:
-                snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u.%u.%u.%u,%u",
-                         (__u32)(id.nid >> 32), HIPQUAD((id.nid)) , id.pid);
-                break;
-        case QSWNAL:
-        case GMNAL:
-                snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u,%u",
-                         (__u32)(id.nid >> 32), (__u32)id.nid, id.pid);
-                break;
-#endif
-        default:
-                snprintf(str, PTL_NALFMT_SIZE - 1, "?%d? %llx,%lx",
-                         nal, (long long)id.nid, (long)id.pid );
-                break;
-        }
+        int   len;
+        
+        portals_nid2str(nal, id.nid, str);
+        len = strlen(str);
+        snprintf(str + len, PTL_NALFMT_SIZE, "-%u", id.pid);
         return str;
 }
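
Two small cleanups here: snprintf() already accounts for the terminating
NUL within the size it is given, so the old PTL_NALFMT_SIZE - 1 gave up a
character for nothing; and portals_id2str() no longer duplicates the
per-NAL switch, it simply appends "-<pid>" to whatever portals_nid2str()
produced, so a socknal id renders as, for example, 0:10.0.0.1-12345
(values hypothetical).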
 
-
 #ifdef __KERNEL__
 char stack_backtrace[LUSTRE_TRACE_SIZE];
 spinlock_t stack_backtrace_lock = SPIN_LOCK_UNLOCKED;
index 3703013..a2422e3 100644
@@ -327,6 +327,8 @@ libcfs_nal_cmd(struct portals_cfg *pcfg)
                 CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal, 
                        pcfg->pcfg_command);
                 rc = cmd->nch_handler(pcfg, cmd->nch_private);
+        } else {
+                CERROR("invalid nal: %d, cmd: %d\n", nal, pcfg->pcfg_command);
         }
         up(&nal_cmd_sem);
 
@@ -413,15 +415,15 @@ static int libcfs_ioctl(struct inode *inode, struct file *file,
                 portals_debug_mark_buffer(data->ioc_inlbuf1);
                 RETURN(0);
 #if LWT_SUPPORT
-        case IOC_PORTAL_LWT_CONTROL: 
+        case IOC_PORTAL_LWT_CONTROL:
                 err = lwt_control (data->ioc_flags, data->ioc_misc);
                 break;
-                
+
         case IOC_PORTAL_LWT_SNAPSHOT: {
                 cycles_t   now;
                 int        ncpu;
                 int        total_size;
-                
+
                 err = lwt_snapshot (&now, &ncpu, &total_size,
                                     data->ioc_pbuf1, data->ioc_plen1);
                 data->ioc_nid = now;
@@ -429,15 +431,15 @@ static int libcfs_ioctl(struct inode *inode, struct file *file,
                 data->ioc_misc = total_size;
 
                 /* Hedge against broken user/kernel typedefs (e.g. cycles_t) */
-                data->ioc_nid = sizeof(lwt_event_t);
-                data->ioc_nid2 = offsetof(lwt_event_t, lwte_where);
+                data->ioc_nid2 = sizeof(lwt_event_t);
+                data->ioc_nid3 = offsetof(lwt_event_t, lwte_where);
 
                 if (err == 0 &&
                     copy_to_user((char *)arg, data, sizeof (*data)))
                         err = -EFAULT;
                 break;
         }
-                
+
         case IOC_PORTAL_LWT_LOOKUP_STRING:
                 err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1,
                                          data->ioc_pbuf2, data->ioc_plen2);
@@ -456,7 +458,7 @@ static int libcfs_ioctl(struct inode *inode, struct file *file,
                         break;
                 }
 
-                if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1, 
+                if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1,
                                    sizeof(pcfg))) {
                         err = -EFAULT;
                         break;
@@ -467,7 +469,7 @@ static int libcfs_ioctl(struct inode *inode, struct file *file,
                 err = libcfs_nal_cmd(&pcfg);
 
                 if (err == 0 &&
-                    copy_to_user((char *)data->ioc_pbuf1, &pcfg, 
+                    copy_to_user((char *)data->ioc_pbuf1, &pcfg,
                                  sizeof (pcfg)))
                         err = -EFAULT;
                 break;
index 562abcf..5759316 100644
@@ -38,7 +38,6 @@
 
 #include <linux/kp30.h>
 #include <linux/portals_compat25.h>
-#include <linux/lustre_compat25.h>
 #include <linux/libcfs.h>
 
 #define TCD_MAX_PAGES 1280
@@ -190,7 +189,7 @@ static void print_to_console(struct ptldebug_header *hdr, int mask, char *buf,
                 prefix = "Lustre";
                 ptype = KERN_INFO;
         }
-        
+
         printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, hdr->ph_pid,
                hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf);
 }
@@ -455,7 +454,7 @@ int tracefile_dump_all_pages(char *filename)
         if (IS_ERR(filp)) {
                 rc = PTR_ERR(filp);
                 printk(KERN_ERR "LustreError: can't open %s for dump: rc %d\n",
-                      filename, rc);
+                       filename, rc);
                 goto out;
         }
 
@@ -773,6 +772,7 @@ int trace_write_debug_size(struct file *file, const char *buffer,
                        "(%lu).\n", max * smp_num_cpus, num_physpages / 5 * 4);
                 return count;
         }
+
         for (i = 0; i < NR_CPUS; i++) {
                 struct trace_cpu_data *tcd;
                 tcd = &trace_data[i].tcd;
index 13451d9..d584f1c 100644
@@ -83,7 +83,8 @@ lib_match_md(lib_nal_t *nal, int index, int op_mask,
                     me->match_id.nid != src_nid)
                         continue;
                 
-                CDEBUG(D_NET,"match_id.pid [%x], src_pid [%x]\n", me->match_id.pid, src_pid);
+                CDEBUG(D_NET, "match_id.pid [%x], src_pid [%x]\n",
+                       me->match_id.pid, src_pid);
 
                 if (me->match_id.pid != PTL_PID_ANY &&
                     me->match_id.pid != src_pid)
index eb41dfd..61ef372 100644
@@ -83,7 +83,8 @@ static int kportal_ioctl(struct portal_ioctl_data *data,
 
                 CDEBUG (D_IOCTL, "Getting nid for nal [%d]\n", data->ioc_nal);
 
-                err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, NULL, &nih);
+                err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL,
+                                NULL, &nih);
                 if (!(err == PTL_OK || err == PTL_IFACE_DUP))
                         RETURN (-EINVAL);
 
@@ -104,7 +105,8 @@ static int kportal_ioctl(struct portal_ioctl_data *data,
                 CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n",
                         data->ioc_nal, data->ioc_nid, data->ioc_count);
 
-                err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, NULL, &nih);
+                err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL,
+                                NULL, &nih);
                 if (!(err == PTL_OK || err == PTL_IFACE_DUP))
                         return (-EINVAL);
 
index 0fe3b90..a1397d2 100644
@@ -132,7 +132,7 @@ static int kpr_proc_routes_read(char *page, char **start, off_t off,
         *start = page + prd->skip;
         user_len = -prd->skip;
 
-        for (; prd->curr != &kpr_routes; prd->curr = prd->curr->next) {
+        while ((prd->curr != NULL) && (prd->curr != &kpr_routes)) {
                 re = list_entry(prd->curr, kpr_route_entry_t, kpre_list);
                 ge = re->kpre_gateway;
 
@@ -144,11 +144,20 @@ static int kpr_proc_routes_read(char *page, char **start, off_t off,
                 chunk_len += line_len;
                 user_len += line_len;
 
-                /* The route table will exceed one page */
-                if ((chunk_len > (PAGE_SIZE - 80)) || (user_len > count)) {
-                        prd->curr = prd->curr->next;
-                        break;
+                /* Abort if the route list changed under us */
+                if (prd->curr->next == NULL) {
+                        prd->curr = NULL;
+                        read_unlock(&kpr_rwlock);
+                        return sprintf(page, "\nError: Routes Changed\n");
                 }
+
+                prd->curr = prd->curr->next;
+
+                /* The route table may exceed one page; break out of the
+                 * while loop so the function can be re-called with a
+                 * new page. */
+                if ((chunk_len > (PAGE_SIZE - 80)) || (user_len > count))
+                        break;
         }
 
         *eof = 0;
index ed8dc08..b399fcf 100644
@@ -331,10 +331,17 @@ connection force_tcp_connection(manager m,
 {
     connection conn;
     struct sockaddr_in addr;
+    struct sockaddr_in locaddr; 
     unsigned int id[2];
     struct timeval tv;
     __u64 incarnation;
 
+    int fd;
+    int option;
+    int rc;
+    int rport;
+    ptl_nid_t peernid = PTL_NID_ANY;
+
     port = tcpnal_acceptor_port;
 
     id[0] = ip;
@@ -343,49 +350,82 @@ connection force_tcp_connection(manager m,
     pthread_mutex_lock(&m->conn_lock);
 
     conn = hash_table_find(m->connections, id);
-    if (!conn) {
-        int fd;
-        int option;
-        ptl_nid_t peernid = PTL_NID_ANY;
-
-        bzero((char *) &addr, sizeof(addr));
-        addr.sin_family      = AF_INET;
-        addr.sin_addr.s_addr = htonl(ip);
-        addr.sin_port        = htons(port);
-
-        if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { 
-            perror("tcpnal socket failed");
-            exit(-1);
-        }
-        if (connect(fd, (struct sockaddr *)&addr,
-                    sizeof(struct sockaddr_in))) {
-            perror("tcpnal connect");
-            return(0);
-        }
+    if (conn)
+            goto out;
 
+    memset(&addr, 0, sizeof(addr));
+    addr.sin_family      = AF_INET;
+    addr.sin_addr.s_addr = htonl(ip);
+    addr.sin_port        = htons(port);
+
+    memset(&locaddr, 0, sizeof(locaddr)); 
+    locaddr.sin_family = AF_INET; 
+    locaddr.sin_addr.s_addr = INADDR_ANY;
+
+    for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) {
+            fd = socket(AF_INET, SOCK_STREAM, 0);
+            if (fd < 0) {
+                    perror("tcpnal socket failed");
+                    goto out;
+            } 
+            
+            option = 1;
+            rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, 
+                            &option, sizeof(option));
+            if (rc != 0) {
+                    perror ("Can't set SO_REUSEADDR for socket"); 
+                    close(fd);
+                    goto out;
+            } 
+
+            locaddr.sin_port = htons(rport);
+            rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr));
+            if (rc == 0 || errno == EACCES) {
+                    rc = connect(fd, (struct sockaddr *)&addr,
+                                 sizeof(struct sockaddr_in));
+                    if (rc == 0) {
+                            break;
+                    } else if (errno != EADDRINUSE && errno != EADDRNOTAVAIL) {
+                            perror("Error connecting to remote host");
+                            close(fd);
+                            goto out;
+                    }
+            } else if (errno != EADDRINUSE) {
+                    perror("Error binding to privileged port");
+                    close(fd);
+                    goto out;
+            }
+            close(fd);
+    }
+    
+    if (rport == IPPORT_RESERVED / 2) {
+            fprintf(stderr, "Out of ports trying to bind to a reserved port\n");
+            goto out;
+    }
+    
 #if 1
-        option = 1;
-        setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option));
-        option = 1<<20;
-        setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option));
-        option = 1<<20;
-        setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option));
+    option = 1;
+    setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option));
+    option = 1<<20;
+    setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option));
+    option = 1<<20;
+    setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option));
 #endif
    
-        gettimeofday(&tv, NULL);
-        incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+    gettimeofday(&tv, NULL);
+    incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
 
-        /* say hello */
-        if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation))
+    /* say hello */
+    if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation))
             exit(-1);
+    
+    conn = allocate_connection(m, ip, port, fd);
+    
+    /* let nal thread know this event right away */
+    if (conn)
+            procbridge_wakeup_nal(pb);
 
-        conn = allocate_connection(m, ip, port, fd);
-
-        /* let nal thread know this event right away */
-        if (conn)
-                procbridge_wakeup_nal(pb);
-    }
-
+out:
     pthread_mutex_unlock(&m->conn_lock);
     return (conn);
 }
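
This mirrors the kernel socknal change above: userspace tcpnal now walks
the privileged range (IPPORT_RESERVED - 1 down to IPPORT_RESERVED / 2,
i.e. ports 1023..513) for its outgoing connections.  Binding below
IPPORT_RESERVED needs privilege, which is why a bind() that fails with
EACCES is tolerated and the connect attempted anyway; when the bind does
succeed, the accepting side gets a cheap hint that the peer process was
privileged.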
index 34dd070..a8f916d 100644
@@ -37,3 +37,10 @@ void remove_io_handler (io_handler i);
 void init_unix_timer(void);
 void select_timer_block(when until);
 when now(void);
+
+/*
+ * hack to support CFS-internal MPI testing
+ */ 
+#if !CRAY_PORTALS
+#define ENABLE_SELECT_DISPATCH
+#endif
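
Defining ENABLE_SELECT_DISPATCH (everywhere except Cray portals builds)
arms the code added to select.c below: the library supplies its own
select() definition which, being resolved ahead of libc's at link time,
intercepts select() calls made by the application (an MPI library, say)
and funnels them through tcpnal's dispatcher, so NAL traffic keeps being
serviced while the application blocks.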
index f3843d7..6b471c0 100644
@@ -107,6 +107,10 @@ nal_t procapi_nal = {
 
 ptl_nid_t tcpnal_mynid;
 
+#ifdef ENABLE_SELECT_DISPATCH
+procbridge __global_procbridge = NULL;
+#endif
+
 /* Function: procbridge_startup
  *
  * Arguments:  pid: requested process id (port offset)
@@ -163,6 +167,10 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid,
         return PTL_FAIL;
     }
 
+#ifdef ENABLE_SELECT_DISPATCH
+    __global_procbridge = p;
+#endif
+
     /* create nal thread */
     if (pthread_create(&p->t, NULL, nal_thread, &args)) {
         perror("nal_init: pthread_create");
index c4ccae1..09e1542 100644
 #include <sys/time.h>
 #include <sys/types.h>
 #include <stdlib.h>
+#include <syscall.h>
+#include <pthread.h>
+#include <errno.h>
 #include <pqtimer.h>
 #include <dispatch.h>
+#include <procbridge.h>
 
 
 static struct timeval beginning_of_epoch;
@@ -95,40 +99,22 @@ void remove_io_handler (io_handler i)
     i->disabled=1;
 }
 
-static void set_flag(io_handler n,fd_set *fds)
+static void set_flag(io_handler n,fd_set *r, fd_set *w, fd_set *e)
 {
-    if (n->type & READ_HANDLER) FD_SET(n->fd, &fds[0]);
-    if (n->type & WRITE_HANDLER) FD_SET(n->fd,&fds[1]);
-    if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, &fds[2]);
+    if (n->type & READ_HANDLER) FD_SET(n->fd, r);
+    if (n->type & WRITE_HANDLER) FD_SET(n->fd, w);
+    if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, e);
 }
 
-
-/* Function: select_timer_block
- * Arguments: until: an absolute time when the select should return
- * 
- *   This function dispatches the various file descriptors' handler
- *   functions, if the kernel indicates there is io available.
- */
-void select_timer_block(when until)
+static int prepare_fd_sets(fd_set *r, fd_set *w, fd_set *e)
 {
-    fd_set fds[3];
-    struct timeval timeout;
-    struct timeval *timeout_pointer;
-    int result;
     io_handler j;
     io_handler *k;
+    int max = 0;
 
-    /* TODO: loop until the entire interval is expired*/
-    if (until){
-       when interval=until-now();
-        timeout.tv_sec=(interval>>32);
-        timeout.tv_usec=((interval<<32)/1000000)>>32;
-        timeout_pointer=&timeout;
-    } else timeout_pointer=0;
-
-    FD_ZERO(&fds[0]);
-    FD_ZERO(&fds[1]);
-    FD_ZERO(&fds[2]);
+    FD_ZERO(r);
+    FD_ZERO(w);
+    FD_ZERO(e);
     for (k=&io_handlers;*k;){
         if ((*k)->disabled){
             j=*k;
@@ -136,24 +122,291 @@ void select_timer_block(when until)
             free(j);
         }
         if (*k) {
-           set_flag(*k,fds);
+           set_flag(*k,r,w,e);
+            if ((*k)->fd > max)
+                max = (*k)->fd;
            k=&(*k)->next;
        }
     }
+    return max + 1;
+}
+
+static int execute_callbacks(fd_set *r, fd_set *w, fd_set *e)
+{
+    io_handler j;
+    int n = 0, t;
+
+    for (j = io_handlers; j; j = j->next) {
+        if (j->disabled)
+            continue;
+
+        t = 0;
+        if (FD_ISSET(j->fd, r) && (j->type & READ_HANDLER)) {
+            FD_CLR(j->fd, r);
+            t++;
+        }
+        if (FD_ISSET(j->fd, w) && (j->type & WRITE_HANDLER)) {
+            FD_CLR(j->fd, w);
+            t++;
+        }
+        if (FD_ISSET(j->fd, e) && (j->type & EXCEPTION_HANDLER)) {
+            FD_CLR(j->fd, e);
+            t++;
+        }
+        if (t == 0)
+            continue;
+
+        if (!(*j->function)(j->argument))
+            j->disabled = 1;
+
+        n += t;
+    }
+
+    return n;
+}
 
-    result=select(FD_SETSIZE, &fds[0], &fds[1], &fds[2], timeout_pointer);
+#ifdef ENABLE_SELECT_DISPATCH
 
-    if (result > 0)
-        for (j=io_handlers;j;j=j->next){
-            if (!(j->disabled) && 
-                ((FD_ISSET(j->fd, &fds[0]) && (j->type & READ_HANDLER)) ||
-                 (FD_ISSET(j->fd, &fds[1]) && (j->type & WRITE_HANDLER)) ||
-                 (FD_ISSET(j->fd, &fds[2]) && (j->type & EXCEPTION_HANDLER)))){
-                if (!(*j->function)(j->argument))
-                    j->disabled=1;
+static struct {
+    pthread_mutex_t mutex;
+    pthread_cond_t  cond;
+    int             submitted;
+    int             nready;
+    int             maxfd;
+    fd_set         *rset;
+    fd_set         *wset;
+    fd_set         *eset;
+    struct timeval *timeout;
+    struct timeval  submit_time;
+} fd_extra = {
+    PTHREAD_MUTEX_INITIALIZER,
+    PTHREAD_COND_INITIALIZER,
+    0, 0, 0,
+    NULL, NULL, NULL, NULL,
+};
+
+extern int liblustre_wait_event(int timeout);
+extern procbridge __global_procbridge;
+
+/*
+ * this intercepts the select() syscall of user apps
+ * such as MPI libraries.
+ */
+int select(int n, fd_set *rset, fd_set *wset, fd_set *eset,
+           struct timeval *timeout)
+{
+    LASSERT(fd_extra.submitted == 0);
+
+    fd_extra.nready = 0;
+    fd_extra.maxfd = n;
+    fd_extra.rset = rset;
+    fd_extra.wset = wset;
+    fd_extra.eset = eset;
+    fd_extra.timeout = timeout;
+
+    liblustre_wait_event(0);
+    pthread_mutex_lock(&fd_extra.mutex);
+    gettimeofday(&fd_extra.submit_time, NULL);
+    fd_extra.submitted = 1;
+    LASSERT(__global_procbridge);
+    procbridge_wakeup_nal(__global_procbridge);
+
+again:
+    if (fd_extra.submitted)
+        pthread_cond_wait(&fd_extra.cond, &fd_extra.mutex);
+    pthread_mutex_unlock(&fd_extra.mutex);
+
+    liblustre_wait_event(0);
+
+    pthread_mutex_lock(&fd_extra.mutex);
+    if (fd_extra.submitted)
+        goto again;
+    pthread_mutex_unlock(&fd_extra.mutex);
+
+    LASSERT(fd_extra.nready >= 0);
+    LASSERT(fd_extra.submitted == 0);
+    return fd_extra.nready;
+}
+
+static int merge_fds(int max, fd_set *rset, fd_set *wset, fd_set *eset)
+{
+    int i;
+
+    LASSERT(rset);
+    LASSERT(wset);
+    LASSERT(eset);
+
+    for (i = 0; i < __FD_SETSIZE/__NFDBITS; i++) {
+        LASSERT(!fd_extra.rset ||
+                !(__FDS_BITS(rset)[i] & __FDS_BITS(fd_extra.rset)[i]));
+        LASSERT(!fd_extra.wset ||
+                !(__FDS_BITS(wset)[i] & __FDS_BITS(fd_extra.wset)[i]));
+        LASSERT(!fd_extra.eset ||
+                !(__FDS_BITS(eset)[i] & __FDS_BITS(fd_extra.eset)[i]));
+
+        if (fd_extra.rset && __FDS_BITS(fd_extra.rset)[i])
+            __FDS_BITS(rset)[i] |= __FDS_BITS(fd_extra.rset)[i];
+        if (fd_extra.wset && __FDS_BITS(fd_extra.wset)[i])
+            __FDS_BITS(wset)[i] |= __FDS_BITS(fd_extra.wset)[i];
+        if (fd_extra.eset && __FDS_BITS(fd_extra.eset)[i])
+            __FDS_BITS(eset)[i] |= __FDS_BITS(fd_extra.eset)[i];
+    }
+
+    return (fd_extra.maxfd > max ? fd_extra.maxfd : max);
+}
+
+static inline
+int timeval_ge(struct timeval *tv1, struct timeval *tv2)
+{
+    LASSERT(tv1 && tv2);
+    return ((tv1->tv_sec - tv2->tv_sec) * 1000000 +
+            (tv1->tv_usec - tv2->tv_usec) >= 0);
+}
+
+/*
+ * choose the timeout that expires sooner
+ */
+static struct timeval *choose_timeout(struct timeval *tv1,
+                                      struct timeval *tv2)
+{
+    if (!tv1)
+        return tv2;
+    else if (!tv2)
+        return tv1;
+
+    if (timeval_ge(tv1, tv2))
+        return tv2;
+    else
+        return tv1;
+}
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time when the select should return
+ * 
+ *   This function dispatches the various file descriptors' handler
+ *   functions, if the kernel indicates there is io available.
+ */
+void select_timer_block(when until)
+{
+    fd_set fds[3];
+    struct timeval timeout;
+    struct timeval *timeout_pointer, *select_timeout;
+    int max, nready, nexec;
+    int fd_handling;
+
+again:
+    if (until) {
+        when interval;
+
+        interval = until - now();
+        timeout.tv_sec = (interval >> 32);
+        timeout.tv_usec = ((interval << 32) / 1000000) >> 32;
+        timeout_pointer = &timeout;
+    } else
+        timeout_pointer = NULL;
+
+    fd_handling = 0;
+    max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]);
+    select_timeout = timeout_pointer;
+
+    pthread_mutex_lock(&fd_extra.mutex);
+    fd_handling = fd_extra.submitted;
+    pthread_mutex_unlock(&fd_extra.mutex);
+    if (fd_handling) {
+        max = merge_fds(max, &fds[0], &fds[1], &fds[2]);
+        select_timeout = choose_timeout(timeout_pointer, fd_extra.timeout);
+    }
+
+    /* XXX Linux-only: raw syscall avoids recursing into our select() wrapper */
+#if __WORDSIZE == 64
+    nready = syscall(SYS_select, max, &fds[0], &fds[1], &fds[2],
+                     select_timeout);
+#else
+    nready = syscall(SYS__newselect, max, &fds[0], &fds[1], &fds[2],
+                     select_timeout);
+#endif
+    if (nready < 0) {
+        CERROR("select returned err %d, errno %d\n", nready, errno);
+        return;
+    }
+
+    if (nready) {
+        nexec = execute_callbacks(&fds[0], &fds[1], &fds[2]);
+        nready -= nexec;
+    } else
+        nexec = 0;
+
+    /* even if both nready and nexec are 0, we still need to try to
+     * wake up the upper thread, since it may have timed out
+     */
+    if (fd_handling) {
+        LASSERT(nready >= 0);
+
+        pthread_mutex_lock(&fd_extra.mutex);
+        if (nready) {
+            if (fd_extra.rset)
+                *fd_extra.rset = fds[0];
+            if (fd_extra.wset)
+                *fd_extra.wset = fds[1];
+            if (fd_extra.eset)
+                *fd_extra.eset = fds[2];
+            fd_extra.nready = nready;
+            fd_extra.submitted = 0;
+        } else {
+            struct timeval t;
+
+            fd_extra.nready = 0;
+            if (fd_extra.timeout) {
+                gettimeofday(&t, NULL);
+                if (timeval_ge(&t, &fd_extra.submit_time))
+                    fd_extra.submitted = 0;
             }
         }
+
+        pthread_cond_signal(&fd_extra.cond);
+        pthread_mutex_unlock(&fd_extra.mutex);
+    }
+
+    /* no portals event was found; loop again if the time
+     * has not yet expired */
+    if (!nexec) {
+        if (timeout_pointer == NULL || now() < until)
+            goto again;
+    }
+}
+
+#else /* !ENABLE_SELECT_DISPATCH */
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time when the select should return
+ * 
+ *   This function dispatches the various file descriptors' handler
+ *   functions, if the kernel indicates there is io available.
+ */
+void select_timer_block(when until)
+{
+    fd_set fds[3];
+    struct timeval timeout;
+    struct timeval *timeout_pointer;
+    int max, nready;
+
+again:
+    if (until) {
+        when interval;
+        interval = until - now();
+        timeout.tv_sec = (interval >> 32);
+        timeout.tv_usec = ((interval << 32) / 1000000) >> 32;
+        timeout_pointer = &timeout;
+    } else
+        timeout_pointer = NULL;
+
+    max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]);
+
+    nready = select(max, &fds[0], &fds[1], &fds[2], timeout_pointer);
+    if (nready > 0)
+        execute_callbacks(&fds[0], &fds[1], &fds[2]);
 }
+#endif /* ENABLE_SELECT_DISPATCH */
 
 /* Function: init_unix_timer()
  *   is called to initialize the library 
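
Taken together, the ENABLE_SELECT_DISPATCH pieces above implement a cross-thread select(): an application thread (e.g. an MPI library) calling the overriding select() publishes its fd sets and timeout in fd_extra, wakes the NAL thread via procbridge_wakeup_nal(), and sleeps on a condition variable; the NAL thread's select_timer_block() merges those sets into its own via merge_fds(), issues the real syscall, hands the surviving bits and ready count back, and signals. The sketch below isolates just that handshake under simplified assumptions: hypothetical names, a single submitter, no timeout bookkeeping, and none of the liblustre_wait_event() pumping the real code does between waits.

    #include <pthread.h>
    #include <stddef.h>
    #include <sys/select.h>

    static struct {
            pthread_mutex_t mutex;
            pthread_cond_t  cond;
            int             submitted;      /* request outstanding? */
            int             nready;         /* result handed back */
            fd_set         *rset;           /* submitter's read set */
    } xtra = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0, NULL };

    /* runs in the application thread: publish a request, block for the answer */
    int submit_and_wait(fd_set *rset)
    {
            int nready;

            pthread_mutex_lock(&xtra.mutex);
            xtra.rset = rset;
            xtra.submitted = 1;
            /* ...wake the servicing thread here, as procbridge_wakeup_nal() does... */
            while (xtra.submitted)
                    pthread_cond_wait(&xtra.cond, &xtra.mutex);
            nready = xtra.nready;
            pthread_mutex_unlock(&xtra.mutex);
            return nready;
    }

    /* runs in the servicing thread after its own select() returns:
     * hand the merged results back and release the submitter */
    void service_one_pass(const fd_set *ready, int nready)
    {
            pthread_mutex_lock(&xtra.mutex);
            if (xtra.submitted && nready > 0) {
                    *xtra.rset = *ready;    /* fd_set struct copy */
                    xtra.nready = nready;
                    xtra.submitted = 0;
                    pthread_cond_signal(&xtra.cond);
            }
            pthread_mutex_unlock(&xtra.mutex);
    }

Note that the servicing thread in the real patch must issue the select via syscall(SYS_select, ...), because the wrapper in this file shadows the libc symbol and a plain call would recurse into itself.
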
index ed8dc08..b399fcf 100644 (file)
@@ -331,10 +331,17 @@ connection force_tcp_connection(manager m,
 {
     connection conn;
     struct sockaddr_in addr;
+    struct sockaddr_in locaddr; 
     unsigned int id[2];
     struct timeval tv;
     __u64 incarnation;
 
+    int fd;
+    int option;
+    int rc;
+    int rport;
+    ptl_nid_t peernid = PTL_NID_ANY;
+
     port = tcpnal_acceptor_port;
 
     id[0] = ip;
@@ -343,49 +350,82 @@ connection force_tcp_connection(manager m,
     pthread_mutex_lock(&m->conn_lock);
 
     conn = hash_table_find(m->connections, id);
-    if (!conn) {
-        int fd;
-        int option;
-        ptl_nid_t peernid = PTL_NID_ANY;
-
-        bzero((char *) &addr, sizeof(addr));
-        addr.sin_family      = AF_INET;
-        addr.sin_addr.s_addr = htonl(ip);
-        addr.sin_port        = htons(port);
-
-        if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { 
-            perror("tcpnal socket failed");
-            exit(-1);
-        }
-        if (connect(fd, (struct sockaddr *)&addr,
-                    sizeof(struct sockaddr_in))) {
-            perror("tcpnal connect");
-            return(0);
-        }
+    if (conn)
+            goto out;
 
+    memset(&addr, 0, sizeof(addr));
+    addr.sin_family      = AF_INET;
+    addr.sin_addr.s_addr = htonl(ip);
+    addr.sin_port        = htons(port);
+
+    memset(&locaddr, 0, sizeof(locaddr)); 
+    locaddr.sin_family = AF_INET; 
+    locaddr.sin_addr.s_addr = INADDR_ANY;
+
+    for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) {
+            fd = socket(AF_INET, SOCK_STREAM, 0);
+            if (fd < 0) {
+                    perror("tcpnal socket failed");
+                    goto out;
+            } 
+            
+            option = 1;
+            rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, 
+                            &option, sizeof(option));
+            if (rc != 0) {
+                    perror ("Can't set SO_REUSEADDR for socket"); 
+                    close(fd);
+                    goto out;
+            } 
+
+            locaddr.sin_port = htons(rport);
+            rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr));
+            if (rc == 0 || errno == EACCES) {
+                    rc = connect(fd, (struct sockaddr *)&addr,
+                                 sizeof(struct sockaddr_in));
+                    if (rc == 0) {
+                            break;
+                    } else if (errno != EADDRINUSE && errno != EADDRNOTAVAIL) {
+                            perror("Error connecting to remote host");
+                            close(fd);
+                            goto out;
+                    }
+            } else if (errno != EADDRINUSE) {
+                    perror("Error binding to privileged port");
+                    close(fd);
+                    goto out;
+            }
+            close(fd);
+    }
+    
+    if (rport == IPPORT_RESERVED / 2) {
+            fprintf(stderr, "Out of ports trying to bind to a reserved port\n");
+            goto out;
+    }
+    
 #if 1
-        option = 1;
-        setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option));
-        option = 1<<20;
-        setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option));
-        option = 1<<20;
-        setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option));
+    option = 1;
+    setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option));
+    option = 1<<20;
+    setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option));
+    option = 1<<20;
+    setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option));
 #endif
    
-        gettimeofday(&tv, NULL);
-        incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+    gettimeofday(&tv, NULL);
+    incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
 
-        /* say hello */
-        if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation))
+    /* say hello */
+    if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation))
             exit(-1);
+    
+    conn = allocate_connection(m, ip, port, fd);
+    
+    /* let nal thread know this event right away */
+    if (conn)
+            procbridge_wakeup_nal(pb);
 
-        conn = allocate_connection(m, ip, port, fd);
-
-        /* let nal thread know this event right away */
-        if (conn)
-                procbridge_wakeup_nal(pb);
-    }
-
+out:
     pthread_mutex_unlock(&m->conn_lock);
     return (conn);
 }
index 34dd070..a8f916d 100644 (file)
@@ -37,3 +37,10 @@ void remove_io_handler (io_handler i);
 void init_unix_timer(void);
 void select_timer_block(when until);
 when now(void);
+
+/*
+ * hack to support CFS-internal MPI testing
+ */
+#if !CRAY_PORTALS
+#define ENABLE_SELECT_DISPATCH
+#endif
index f3843d7..6b471c0 100644 (file)
@@ -107,6 +107,10 @@ nal_t procapi_nal = {
 
 ptl_nid_t tcpnal_mynid;
 
+#ifdef ENABLE_SELECT_DISPATCH
+procbridge __global_procbridge = NULL;
+#endif
+
 /* Function: procbridge_startup
  *
  * Arguments:  pid: requested process id (port offset)
@@ -163,6 +167,10 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid,
         return PTL_FAIL;
     }
 
+#ifdef ENABLE_SELECT_DISPATCH
+    __global_procbridge = p;
+#endif
+
     /* create nal thread */
     if (pthread_create(&p->t, NULL, nal_thread, &args)) {
         perror("nal_init: pthread_create");
index c4ccae1..09e1542 100644 (file)
 #include <sys/time.h>
 #include <sys/types.h>
 #include <stdlib.h>
+#include <syscall.h>
+#include <pthread.h>
+#include <errno.h>
 #include <pqtimer.h>
 #include <dispatch.h>
+#include <procbridge.h>
 
 
 static struct timeval beginning_of_epoch;
@@ -95,40 +99,22 @@ void remove_io_handler (io_handler i)
     i->disabled=1;
 }
 
-static void set_flag(io_handler n,fd_set *fds)
+static void set_flag(io_handler n,fd_set *r, fd_set *w, fd_set *e)
 {
-    if (n->type & READ_HANDLER) FD_SET(n->fd, &fds[0]);
-    if (n->type & WRITE_HANDLER) FD_SET(n->fd,&fds[1]);
-    if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, &fds[2]);
+    if (n->type & READ_HANDLER) FD_SET(n->fd, r);
+    if (n->type & WRITE_HANDLER) FD_SET(n->fd, w);
+    if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, e);
 }
 
-
-/* Function: select_timer_block
- * Arguments: until: an absolute time when the select should return
- * 
- *   This function dispatches the various file descriptors' handler
- *   functions, if the kernel indicates there is io available.
- */
-void select_timer_block(when until)
+static int prepare_fd_sets(fd_set *r, fd_set *w, fd_set *e)
 {
-    fd_set fds[3];
-    struct timeval timeout;
-    struct timeval *timeout_pointer;
-    int result;
     io_handler j;
     io_handler *k;
+    int max = 0;
 
-    /* TODO: loop until the entire interval is expired*/
-    if (until){
-       when interval=until-now();
-        timeout.tv_sec=(interval>>32);
-        timeout.tv_usec=((interval<<32)/1000000)>>32;
-        timeout_pointer=&timeout;
-    } else timeout_pointer=0;
-
-    FD_ZERO(&fds[0]);
-    FD_ZERO(&fds[1]);
-    FD_ZERO(&fds[2]);
+    FD_ZERO(r);
+    FD_ZERO(w);
+    FD_ZERO(e);
     for (k=&io_handlers;*k;){
         if ((*k)->disabled){
             j=*k;
@@ -136,24 +122,291 @@ void select_timer_block(when until)
             free(j);
         }
         if (*k) {
-           set_flag(*k,fds);
+           set_flag(*k,r,w,e);
+            if ((*k)->fd > max)
+                max = (*k)->fd;
            k=&(*k)->next;
        }
     }
+    return max + 1;
+}
+
+static int execute_callbacks(fd_set *r, fd_set *w, fd_set *e)
+{
+    io_handler j;
+    int n = 0, t;
+
+    for (j = io_handlers; j; j = j->next) {
+        if (j->disabled)
+            continue;
+
+        t = 0;
+        if (FD_ISSET(j->fd, r) && (j->type & READ_HANDLER)) {
+            FD_CLR(j->fd, r);
+            t++;
+        }
+        if (FD_ISSET(j->fd, w) && (j->type & WRITE_HANDLER)) {
+            FD_CLR(j->fd, w);
+            t++;
+        }
+        if (FD_ISSET(j->fd, e) && (j->type & EXCEPTION_HANDLER)) {
+            FD_CLR(j->fd, e);
+            t++;
+        }
+        if (t == 0)
+            continue;
+
+        if (!(*j->function)(j->argument))
+            j->disabled = 1;
+
+        n += t;
+    }
+
+    return n;
+}
 
-    result=select(FD_SETSIZE, &fds[0], &fds[1], &fds[2], timeout_pointer);
+#ifdef ENABLE_SELECT_DISPATCH
 
-    if (result > 0)
-        for (j=io_handlers;j;j=j->next){
-            if (!(j->disabled) && 
-                ((FD_ISSET(j->fd, &fds[0]) && (j->type & READ_HANDLER)) ||
-                 (FD_ISSET(j->fd, &fds[1]) && (j->type & WRITE_HANDLER)) ||
-                 (FD_ISSET(j->fd, &fds[2]) && (j->type & EXCEPTION_HANDLER)))){
-                if (!(*j->function)(j->argument))
-                    j->disabled=1;
+static struct {
+    pthread_mutex_t mutex;
+    pthread_cond_t  cond;
+    int             submitted;
+    int             nready;
+    int             maxfd;
+    fd_set         *rset;
+    fd_set         *wset;
+    fd_set         *eset;
+    struct timeval *timeout;
+    struct timeval  submit_time;
+} fd_extra = {
+    PTHREAD_MUTEX_INITIALIZER,
+    PTHREAD_COND_INITIALIZER,
+    0, 0, 0,
+    NULL, NULL, NULL, NULL,
+};
+
+extern int liblustre_wait_event(int timeout);
+extern procbridge __global_procbridge;
+
+/*
+ * this intercepts the select() syscall of user apps
+ * such as MPI libraries.
+ */
+int select(int n, fd_set *rset, fd_set *wset, fd_set *eset,
+           struct timeval *timeout)
+{
+    LASSERT(fd_extra.submitted == 0);
+
+    fd_extra.nready = 0;
+    fd_extra.maxfd = n;
+    fd_extra.rset = rset;
+    fd_extra.wset = wset;
+    fd_extra.eset = eset;
+    fd_extra.timeout = timeout;
+
+    liblustre_wait_event(0);
+    pthread_mutex_lock(&fd_extra.mutex);
+    gettimeofday(&fd_extra.submit_time, NULL);
+    fd_extra.submitted = 1;
+    LASSERT(__global_procbridge);
+    procbridge_wakeup_nal(__global_procbridge);
+
+again:
+    if (fd_extra.submitted)
+        pthread_cond_wait(&fd_extra.cond, &fd_extra.mutex);
+    pthread_mutex_unlock(&fd_extra.mutex);
+
+    liblustre_wait_event(0);
+
+    pthread_mutex_lock(&fd_extra.mutex);
+    if (fd_extra.submitted)
+        goto again;
+    pthread_mutex_unlock(&fd_extra.mutex);
+
+    LASSERT(fd_extra.nready >= 0);
+    LASSERT(fd_extra.submitted == 0);
+    return fd_extra.nready;
+}
+
+static int merge_fds(int max, fd_set *rset, fd_set *wset, fd_set *eset)
+{
+    int i;
+
+    LASSERT(rset);
+    LASSERT(wset);
+    LASSERT(eset);
+
+    for (i = 0; i < __FD_SETSIZE/__NFDBITS; i++) {
+        LASSERT(!fd_extra.rset ||
+                !(__FDS_BITS(rset)[i] & __FDS_BITS(fd_extra.rset)[i]));
+        LASSERT(!fd_extra.wset ||
+                !(__FDS_BITS(wset)[i] & __FDS_BITS(fd_extra.wset)[i]));
+        LASSERT(!fd_extra.eset ||
+                !(__FDS_BITS(eset)[i] & __FDS_BITS(fd_extra.eset)[i]));
+
+        if (fd_extra.rset && __FDS_BITS(fd_extra.rset)[i])
+            __FDS_BITS(rset)[i] |= __FDS_BITS(fd_extra.rset)[i];
+        if (fd_extra.wset && __FDS_BITS(fd_extra.wset)[i])
+            __FDS_BITS(wset)[i] |= __FDS_BITS(fd_extra.wset)[i];
+        if (fd_extra.eset && __FDS_BITS(fd_extra.eset)[i])
+            __FDS_BITS(eset)[i] |= __FDS_BITS(fd_extra.eset)[i];
+    }
+
+    return (fd_extra.maxfd > max ? fd_extra.maxfd : max);
+}
+
+static inline
+int timeval_ge(struct timeval *tv1, struct timeval *tv2)
+{
+    LASSERT(tv1 && tv2);
+    return ((tv1->tv_sec - tv2->tv_sec) * 1000000 +
+            (tv1->tv_usec - tv2->tv_usec) >= 0);
+}
+
+/*
+ * choose the timeout that expires sooner
+ */
+static struct timeval *choose_timeout(struct timeval *tv1,
+                                      struct timeval *tv2)
+{
+    if (!tv1)
+        return tv2;
+    else if (!tv2)
+        return tv1;
+
+    if (timeval_ge(tv1, tv2))
+        return tv2;
+    else
+        return tv1;
+}
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time when the select should return
+ * 
+ *   This function dispatches the various file descriptors' handler
+ *   functions, if the kernel indicates there is io available.
+ */
+void select_timer_block(when until)
+{
+    fd_set fds[3];
+    struct timeval timeout;
+    struct timeval *timeout_pointer, *select_timeout;
+    int max, nready, nexec;
+    int fd_handling;
+
+again:
+    if (until) {
+        when interval;
+
+        interval = until - now();
+        timeout.tv_sec = (interval >> 32);
+        timeout.tv_usec = ((interval << 32) / 1000000) >> 32;
+        timeout_pointer = &timeout;
+    } else
+        timeout_pointer = NULL;
+
+    fd_handling = 0;
+    max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]);
+    select_timeout = timeout_pointer;
+
+    pthread_mutex_lock(&fd_extra.mutex);
+    fd_handling = fd_extra.submitted;
+    pthread_mutex_unlock(&fd_extra.mutex);
+    if (fd_handling) {
+        max = merge_fds(max, &fds[0], &fds[1], &fds[2]);
+        select_timeout = choose_timeout(timeout_pointer, fd_extra.timeout);
+    }
+
+    /* XXX Linux-only: raw syscall avoids recursing into our select() wrapper */
+#if __WORDSIZE == 64
+    nready = syscall(SYS_select, max, &fds[0], &fds[1], &fds[2],
+                     select_timeout);
+#else
+    nready = syscall(SYS__newselect, max, &fds[0], &fds[1], &fds[2],
+                     select_timeout);
+#endif
+    if (nready < 0) {
+        CERROR("select returned err %d, errno %d\n", nready, errno);
+        return;
+    }
+
+    if (nready) {
+        nexec = execute_callbacks(&fds[0], &fds[1], &fds[2]);
+        nready -= nexec;
+    } else
+        nexec = 0;
+
+    /* even if both nready and nexec are 0, we still need to try to
+     * wake up the upper thread, since it may have timed out
+     */
+    if (fd_handling) {
+        LASSERT(nready >= 0);
+
+        pthread_mutex_lock(&fd_extra.mutex);
+        if (nready) {
+            if (fd_extra.rset)
+                *fd_extra.rset = fds[0];
+            if (fd_extra.wset)
+                *fd_extra.wset = fds[1];
+            if (fd_extra.eset)
+                *fd_extra.eset = fds[2];
+            fd_extra.nready = nready;
+            fd_extra.submitted = 0;
+        } else {
+            struct timeval t;
+
+            fd_extra.nready = 0;
+            if (fd_extra.timeout) {
+                gettimeofday(&t, NULL);
+                if (timeval_ge(&t, &fd_extra.submit_time))
+                    fd_extra.submitted = 0;
             }
         }
+
+        pthread_cond_signal(&fd_extra.cond);
+        pthread_mutex_unlock(&fd_extra.mutex);
+    }
+
+    /* no portals event was found; loop again if the time
+     * has not yet expired */
+    if (!nexec) {
+        if (timeout_pointer == NULL || now() < until)
+            goto again;
+    }
+}
+
+#else /* !ENABLE_SELECT_DISPATCH */
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time when the select should return
+ * 
+ *   This function dispatches the various file descriptors' handler
+ *   functions, if the kernel indicates there is io available.
+ */
+void select_timer_block(when until)
+{
+    fd_set fds[3];
+    struct timeval timeout;
+    struct timeval *timeout_pointer;
+    int max, nready;
+
+again:
+    if (until) {
+        when interval;
+        interval = until - now();
+        timeout.tv_sec = (interval >> 32);
+        timeout.tv_usec = ((interval << 32) / 1000000) >> 32;
+        timeout_pointer = &timeout;
+    } else
+        timeout_pointer = NULL;
+
+    max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]);
+
+    nready = select(max, &fds[0], &fds[1], &fds[2], timeout_pointer);
+    if (nready > 0)
+        execute_callbacks(&fds[0], &fds[1], &fds[2]);
 }
+#endif /* ENABLE_SELECT_DISPATCH */
 
 /* Function: init_unix_timer()
  *   is called to initialize the library 
index 6e9cca9..abb6d01 100644 (file)
@@ -251,8 +251,6 @@ int tcpnal_init(bridge b)
            newly created junk */
         return(PTL_NAL_FAILED);
     }
-    /* XXX cfs hack */
-//    b->lib_nal->libnal_ni.ni_pid.pid=0;
     b->lower=m;
     return(PTL_OK);
 }
index 6e9cca9..abb6d01 100644 (file)
@@ -251,8 +251,6 @@ int tcpnal_init(bridge b)
            newly created junk */
         return(PTL_NAL_FAILED);
     }
-    /* XXX cfs hack */
-//    b->lib_nal->libnal_ni.ni_pid.pid=0;
     b->lower=m;
     return(PTL_OK);
 }
index 8aea457..524d128 100644 (file)
@@ -89,7 +89,11 @@ show_connection (int fd, __u32 net_ip)
 void
 usage (char *myname)
 {
-        fprintf (stderr, "Usage: %s [-N nal_id] port\n", myname);
+        fprintf (stderr, 
+                 "Usage: %s [-N nal_id] [-p] [-l] port\n\n"
+                 " -l\tKeep stdin/stdout open\n"
+                 " -p\tAllow connections from non-privileged ports\n",
+                 myname);
         exit (1);
 }
 
@@ -100,24 +104,27 @@ int main(int argc, char **argv)
         int c;
         int noclose = 0;
         int nal = SOCKNAL;
+        int rport;
+        int require_privports = 1;
         
-        while ((c = getopt (argc, argv, "N:l")) != -1)
-                switch (c)
-                {
-                case 'l':
-                        noclose = 1;
-                        break;
-
+        while ((c = getopt (argc, argv, "N:lp")) != -1) {
+                switch (c) {
                 case 'N':
                         if (sscanf(optarg, "%d", &nal) != 1 ||
                             nal < 0 || nal > NAL_MAX_NR)
                                 usage(argv[0]);
                         break;
-                        
+                case 'l':
+                        noclose = 1;
+                        break;
+                case 'p':
+                        require_privports = 0;
+                        break;
                 default:
                         usage (argv[0]);
                         break;
                 }
+        }
 
         if (optind >= argc)
                 usage (argv[0]);
@@ -162,7 +169,7 @@ int main(int argc, char **argv)
                 exit(1);
         }
 
-        rc = daemon(1, noclose);
+        rc = daemon(0, noclose);
         if (rc < 0) {
                 perror("daemon(): ");
                 exit(1);
@@ -180,8 +187,8 @@ int main(int argc, char **argv)
                 struct portals_cfg pcfg;
 #ifdef HAVE_LIBWRAP
                 struct request_info request;
-                char addrstr[INET_ADDRSTRLEN];
 #endif
+                char addrstr[INET_ADDRSTRLEN];
                
                 cfd = accept(fd, (struct sockaddr *)&clntaddr, &len);
                 if ( cfd < 0 ) {
@@ -203,6 +210,18 @@ int main(int argc, char **argv)
                         continue;
                 }
 #endif
+
+                if (require_privports && ntohs(clntaddr.sin_port) >= IPPORT_RESERVED) {
+                        inet_ntop(AF_INET, &clntaddr.sin_addr,
+                                  addrstr, INET_ADDRSTRLEN);
+                        syslog(LOG_ERR, "Closing non-privileged connection from %s:%d\n",
+                               addrstr, ntohs(clntaddr.sin_port));
+                        rc = close(cfd);
+                        if (rc)
+                                perror ("close un-privileged client failed");
+                        continue;
+                }
+
                 show_connection (cfd, clntaddr.sin_addr.s_addr);
 
                 PCFG_INIT(pcfg, NAL_CMD_REGISTER_PEER_FD);
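
The new -p / require_privports logic above gives the acceptor a weak peer check: since only root may bind ports below IPPORT_RESERVED (1024), a client connecting from a low source port was presumably started by root, matching the privileged-port connect loops added elsewhere in this change. A hedged sketch of just that check, with a hypothetical helper name:

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <syslog.h>
    #include <unistd.h>

    /* Returns 0 if 'peer' connected from a privileged source port;
     * otherwise logs, closes the connection, and returns -1. */
    static int check_privileged_peer(int cfd, const struct sockaddr_in *peer)
    {
            char addrstr[INET_ADDRSTRLEN];

            if (ntohs(peer->sin_port) < IPPORT_RESERVED)
                    return 0;                       /* looks root-initiated */

            inet_ntop(AF_INET, &peer->sin_addr, addrstr, sizeof(addrstr));
            syslog(LOG_ERR, "closing non-privileged connection from %s:%d\n",
                   addrstr, ntohs(peer->sin_port));
            close(cfd);
            return -1;
    }
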
index 36d8a04..5b65f24 100644 (file)
 #include <portals/list.h>
 
 #include <stdio.h>
+#ifdef HAVE_NETDB_H
 #include <netdb.h>
+#endif
 #include <stdlib.h>
 #include <string.h>
+#include "ioctl.h"
 #include <fcntl.h>
 #include <errno.h>
 #include <unistd.h>
 #include <sys/stat.h>
 #include <sys/mman.h>
 
+#ifdef HAVE_LINUX_VERSION_H
 #include <linux/version.h>
 
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 #define BUG()                            /* workaround for module.h includes */
 #include <linux/module.h>
 #endif
+#endif /* HAVE_LINUX_VERSION_H */
+
 #include <sys/utsname.h>
 
 #include <portals/api-support.h>
@@ -62,7 +68,7 @@
 static char rawbuf[8192];
 static char *buf = rawbuf;
 static int max = 8192;
-//static int g_pfd = -1;
+/*static int g_pfd = -1;*/
 static int subsystem_mask = ~0;
 static int debug_mask = ~0;
 
@@ -72,7 +78,7 @@ static const char *portal_debug_subsystems[] =
         {"undefined", "mdc", "mds", "osc", "ost", "class", "log", "llite",
          "rpc", "mgmt", "portals", "libcfs", "socknal", "qswnal", "pinger",
          "filter", "ptlbd", "echo", "ldlm", "lov", "gmnal", "router", "cobd",
-         "openibnal", "lmv", "smfs", "cmobd", NULL};
+         "ibnal", NULL};
 static const char *portal_debug_masks[] =
         {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl",
          "blocks", "net", "warning", "buffs", "other", "dentry", "portals",
@@ -371,15 +377,24 @@ int jt_dbg_debug_kernel(int argc, char **argv)
                 fprintf(stderr, "usage: %s [file] [raw]\n", argv[0]);
                 return 0;
         }
-        sprintf(filename, "%s.%lu.%u", argc > 1 ? argv[1] : "/tmp/lustre-log",
-                time(NULL), getpid());
 
-        if (argc > 2)
+        if (argc > 2) {
                 raw = atoi(argv[2]);
+        } else if (argc > 1 && (argv[1][0] == '0' || argv[1][0] == '1')) {
+                raw = atoi(argv[1]);
+                argc--;
+        }
+
+        sprintf(filename, "%s.%lu.%u", argc > 1 ? argv[1] :
+                "/tmp/lustre-log", time(NULL), getpid());
+
         unlink(filename);
 
         fd = open("/proc/sys/portals/dump_kernel", O_WRONLY);
         if (fd < 0) {
+                if (errno == ENOENT) /* no dump file created */
+                        return 0;
+
                 fprintf(stderr, "open(dump_kernel) failed: %s\n",
                         strerror(errno));
                 return 1;
@@ -477,25 +492,25 @@ const char debug_daemon_usage[]="usage: debug_daemon {start file [MB]|stop}\n";
 int jt_dbg_debug_daemon(int argc, char **argv)
 {
         int rc, fd;
-                                                                                                                                                                                                     
+
         if (argc <= 1) {
                 fprintf(stderr, debug_daemon_usage);
                 return 0;
         }
-                                                                                                                                                                                                     
+
         fd = open("/proc/sys/portals/daemon_file", O_WRONLY);
         if (fd < 0) {
                 fprintf(stderr, "open(daemon_file) failed: %s\n",
                         strerror(errno));
                 return 1;
         }
-                                                                                                                                                                                                     
+
         if (strcasecmp(argv[1], "start") == 0) {
                 if (argc != 3) {
                         fprintf(stderr, debug_daemon_usage);
                         return 1;
                 }
-                                                                                                                                                                                                     
+
                 rc = write(fd, argv[2], strlen(argv[2]));
                 if (rc != strlen(argv[2])) {
                         fprintf(stderr, "write(%s) failed: %s\n", argv[2],
@@ -515,7 +530,7 @@ int jt_dbg_debug_daemon(int argc, char **argv)
                 fprintf(stderr, debug_daemon_usage);
                 return 1;
         }
-                                                                                                                                                                                                     
+
         close(fd);
         return 0;
 }
@@ -611,7 +626,6 @@ static struct mod_paths {
         {"obdfilter", "lustre/obdfilter"},
         {"extN", "lustre/extN"},
         {"lov", "lustre/lov"},
-        {"lmv", "lustre/lmv"},
         {"fsfilt_ext3", "lustre/lvfs"},
         {"fsfilt_extN", "lustre/lvfs"},
         {"fsfilt_reiserfs", "lustre/lvfs"},
@@ -623,13 +637,13 @@ static struct mod_paths {
         {"ptlbd", "lustre/ptlbd"},
         {"mgmt_svc", "lustre/mgmt"},
         {"mgmt_cli", "lustre/mgmt"},
-        {"cobd", "lustre/cobd"},
-        {"cmobd", "lustre/cmobd"},
+        {"conf_obd", "lustre/obdclass"},
         {NULL, NULL}
 };
 
 static int jt_dbg_modules_2_4(int argc, char **argv)
 {
+#ifdef HAVE_LINUX_VERSION_H
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         struct mod_paths *mp;
         char *path = "..";
@@ -665,9 +679,9 @@ static int jt_dbg_modules_2_4(int argc, char **argv)
         }
 
         return 0;
-#else /* Headers are 2.6-only */
+#endif /* Headers are 2.6-only */
+#endif /* HAVE_LINUX_VERSION_H */
         return -EINVAL;
-#endif
 }
 
 static int jt_dbg_modules_2_5(int argc, char **argv)
index 1bde59f..d5d29dc 100644 (file)
 
 #include <stdio.h>
 #include <sys/types.h>
+#ifdef HAVE_NETDB_H
 #include <netdb.h>
+#endif
 #include <sys/socket.h>
+#ifdef HAVE_NETINET_TCP_H
 #include <netinet/tcp.h>
-#include <netdb.h>
+#endif
 #include <stdlib.h>
 #include <string.h>
 #include <fcntl.h>
+#include "ioctl.h"
 #include <sys/ioctl.h>
 #include <errno.h>
 #include <unistd.h>
@@ -54,10 +58,6 @@ unsigned int portal_printk;
 
 static unsigned int g_nal = 0;
 
-static int g_socket_txmem = 0;
-static int g_socket_rxmem = 0;
-static int g_socket_nonagle = 1;
-
 typedef struct
 {
         char *name;
@@ -70,6 +70,7 @@ static name2num_t nalnames[] = {
         {"elan",       QSWNAL},
         {"gm",         GMNAL},
         {"openib",      OPENIBNAL},
+        {"iib",         IIBNAL},
         {NULL,         -1}
 };
 
@@ -209,6 +210,7 @@ nal2name (int nal)
         return ((e == NULL) ? "???" : e->name);
 }
 
+#ifdef HAVE_GETHOSTBYNAME
 static struct hostent *
 ptl_gethostbyname(char * hname) {
         struct hostent *he;
@@ -229,6 +231,7 @@ ptl_gethostbyname(char * hname) {
         }
         return he;
 }
+#endif
 
 int
 ptl_parse_port (int *port, char *str)
@@ -295,7 +298,9 @@ ptl_parse_ipquad (__u32 *ipaddrp, char *str)
 int
 ptl_parse_ipaddr (__u32 *ipaddrp, char *str)
 {
+#ifdef HAVE_GETHOSTBYNAME
         struct hostent *he;
+#endif
 
         if (!strcmp (str, "_all_")) 
         {
@@ -305,7 +310,8 @@ ptl_parse_ipaddr (__u32 *ipaddrp, char *str)
 
         if (ptl_parse_ipquad(ipaddrp, str) == 0)
                 return (0);
-        
+
+#ifdef HAVE_GETHOSTBYNAME
         if ((('a' <= str[0] && str[0] <= 'z') ||
              ('A' <= str[0] && str[0] <= 'Z')) &&
              (he = ptl_gethostbyname (str)) != NULL)
@@ -315,6 +321,7 @@ ptl_parse_ipaddr (__u32 *ipaddrp, char *str)
                 *ipaddrp = ntohl(addr);         /* HOST byte order */
                 return (0);
         }
+#endif
 
         return (-1);
 }
@@ -322,6 +329,7 @@ ptl_parse_ipaddr (__u32 *ipaddrp, char *str)
 char *
 ptl_ipaddr_2_str (__u32 ipaddr, char *str, int lookup)
 {
+#ifdef HAVE_GETHOSTBYNAME
         __u32           net_ip;
         struct hostent *he;
 
@@ -333,7 +341,8 @@ ptl_ipaddr_2_str (__u32 ipaddr, char *str, int lookup)
                         return (str);
                 }
         }
-        
+#endif
+
         sprintf (str, "%d.%d.%d.%d",
                  (ipaddr >> 24) & 0xff, (ipaddr >> 16) & 0xff,
                  (ipaddr >> 8) & 0xff, ipaddr & 0xff);
@@ -386,6 +395,7 @@ char *
 ptl_nid2str (char *buffer, ptl_nid_t nid)
 {
         __u64           nid64 = ptl_nid2u64(nid);
+#ifdef HAVE_GETHOSTBYNAME
         struct hostent *he = 0;
 
         /* Don't try to resolve NIDs that are e.g. Elan host IDs.  Assume
@@ -400,6 +410,7 @@ ptl_nid2str (char *buffer, ptl_nid_t nid)
         if (he != NULL)
                 sprintf(buffer, "%#x:%s", (int)(nid64 >> 32), he->h_name);
         else
+#endif /* HAVE_GETHOSTBYNAME */
                 sprintf(buffer, LPX64, nid64);
 
         return (buffer);
@@ -524,7 +535,6 @@ int jt_ptl_network(int argc, char **argv)
         return (-1);
 }
 
-
 int
 jt_ptl_print_interfaces (int argc, char **argv)
 {
@@ -563,6 +573,9 @@ jt_ptl_add_interface (int argc, char **argv)
         __u32                    ipaddr;
         int                      rc;
         __u32                    netmask = 0xffffff00;
+        int                      i;
+        int                      count;
+        char                    *end;
 
         if (argc < 2 || argc > 3) {
                 fprintf (stderr, "usage: %s ipaddr [netmask]\n", argv[0]);
@@ -576,13 +589,19 @@ jt_ptl_add_interface (int argc, char **argv)
                 fprintf (stderr, "Can't parse ip: %s\n", argv[1]);
                 return -1;
         }
-        
-        if (argc > 2 &&
-            ptl_parse_ipquad(&netmask, argv[2]) != 0) {
-                fprintf (stderr, "Can't parse netmask: %s\n", argv[2]);
-                return -1;
+
+        if (argc > 2) {
+                count = strtol(argv[2], &end, 0);
+                if (count > 0 && count < 32 && *end == 0) {
+                        netmask = 0;
+                        for (i = count; i > 0; i--)
+                                netmask = netmask|(1<<(32-i));
+                } else if (ptl_parse_ipquad(&netmask, argv[2]) != 0) {
+                        fprintf (stderr, "Can't parse netmask: %s\n", argv[2]);
+                        return -1;
+                }
         }
-        
+
         PCFG_INIT(pcfg, NAL_CMD_ADD_INTERFACE);
         pcfg.pcfg_id     = ipaddr;
         pcfg.pcfg_misc   = netmask;
@@ -593,7 +612,7 @@ jt_ptl_add_interface (int argc, char **argv)
                          strerror (errno));
                 return -1;
         }
-        
+
         return 0;
 }
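
The netmask argument above now accepts either a dotted quad or a bare prefix length: for a count of n (0 < n < 32) the loop sets bit (32 - i) for i = n..1, i.e. exactly the top n bits of the mask. A small self-contained check of that arithmetic follows; prefix2mask is a hypothetical name, and an unsigned constant is used so the shift never lands in the sign bit.

    #include <stdio.h>

    /* Convert a CIDR prefix length (1..31) to a host-order IPv4 netmask
     * by setting the top 'count' bits, mirroring the loop in the patch. */
    static unsigned int prefix2mask(int count)
    {
            unsigned int mask = 0;
            int i;

            for (i = count; i > 0; i--)
                    mask |= 1u << (32 - i);
            return mask;
    }

    int main(void)
    {
            printf("/24 -> %#010x\n", prefix2mask(24));  /* 0xffffff00 */
            printf("/16 -> %#010x\n", prefix2mask(16));  /* 0xffff0000 */
            return 0;
    }
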
 
@@ -627,11 +646,11 @@ jt_ptl_del_interface (int argc, char **argv)
                          strerror (errno));
                 return -1;
         }
-        
+
         return 0;
 }
 
-int 
+int
 jt_ptl_print_peers (int argc, char **argv)
 {
         struct portals_cfg       pcfg;
@@ -639,7 +658,7 @@ jt_ptl_print_peers (int argc, char **argv)
         int                      index;
         int                      rc;
 
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0))
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0))
                 return -1;
 
         for (index = 0;;index++) {
@@ -675,7 +694,7 @@ jt_ptl_add_peer (int argc, char **argv)
         int                      port = 0;
         int                      rc;
 
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0))
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0))
                 return -1;
 
         if (g_nal_is_compatible(NULL, SOCKNAL, 0)) {
@@ -685,7 +704,7 @@ jt_ptl_add_peer (int argc, char **argv)
                         return 0;
                 }
         } else if (argc != 2) {
-                fprintf (stderr, "usage(openib): %s nid\n", argv[0]);
+                fprintf (stderr, "usage(openib,iib): %s nid\n", argv[0]);
                 return 0;
         }
 
@@ -732,7 +751,7 @@ jt_ptl_del_peer (int argc, char **argv)
         int                      argidx;
         int                      rc;
 
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0))
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0))
                 return -1;
 
         if (g_nal_is_compatible(NULL, SOCKNAL, 0)) {
@@ -764,7 +783,7 @@ jt_ptl_del_peer (int argc, char **argv)
         }
         
         if (argc > argidx) {
-                if (!strcmp (argv[3], "single_share")) {
+                if (!strcmp (argv[argidx], "single_share")) {
                         single_share = 1;
                 } else {
                         fprintf (stderr, "Unrecognised arg %s'\n", argv[3]);
@@ -795,7 +814,7 @@ jt_ptl_print_connections (int argc, char **argv)
         int                      index;
         int                      rc;
 
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0))
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0))
                 return -1;
 
         for (index = 0;;index++) {
@@ -832,13 +851,19 @@ jt_ptl_print_connections (int argc, char **argv)
 
 int jt_ptl_connect(int argc, char **argv)
 {
+#ifndef HAVE_CONNECT
+        /* no connect() support */
+        return -1;
+#else /* HAVE_CONNECT */
         struct portals_cfg pcfg;
         struct sockaddr_in srvaddr;
+        struct sockaddr_in locaddr;
         __u32 ipaddr;
         char *flag;
         int fd, rc;
         int type = SOCKNAL_CONN_ANY;
-        int port;
+        int port, rport;
+        int o;
 
         if (argc < 3) {
                 fprintf(stderr, "usage: %s ip port [type]\n", argv[0]);
@@ -893,20 +918,48 @@ int jt_ptl_connect(int argc, char **argv)
                                 return (-1);
                         }
 
+        memset(&locaddr, 0, sizeof(locaddr)); 
+        locaddr.sin_family = AF_INET; 
+        locaddr.sin_addr.s_addr = INADDR_ANY;
+
         memset(&srvaddr, 0, sizeof(srvaddr));
         srvaddr.sin_family = AF_INET;
         srvaddr.sin_port = htons(port);
         srvaddr.sin_addr.s_addr = htonl(ipaddr);
 
-        fd = socket(PF_INET, SOCK_STREAM, 0);
-        if ( fd < 0 ) {
-                fprintf(stderr, "socket() failed: %s\n", strerror(errno));
-                return -1;
+
+        for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) {
+                fd = socket(PF_INET, SOCK_STREAM, 0); 
+                if ( fd < 0 ) { 
+                        fprintf(stderr, "socket() failed: %s\n", strerror(errno)); 
+                        return -1; 
+                }
+
+                o = 1;
+                rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, 
+                                &o, sizeof(o));
+                
+                locaddr.sin_port = htons(rport);
+                rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr)); 
+                if (rc == 0 || errno == EACCES) {
+                        rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
+                        if (rc == 0) {
+                                break;
+                        } else if (errno != EADDRINUSE) {
+                                fprintf(stderr, "Error connecting to host: %s\n", strerror(errno));
+                                close(fd);
+                                return -1;
+                        }
+                } else if (errno != EADDRINUSE) {
+                        fprintf(stderr, "Error binding to port %d: %d: %s\n", port, errno, strerror(errno));
+                        close(fd);
+                        return -1;
+                }
         }
 
-        rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
-        if ( rc == -1 ) { 
-                fprintf(stderr, "connect() failed: %s\n", strerror(errno));
+        if (rport == IPPORT_RESERVED / 2) {
+                fprintf(stderr,
+                        "Warning: all privileged ports are in use.\n"); 
                 return -1;
         }
 
@@ -937,6 +990,7 @@ int jt_ptl_connect(int argc, char **argv)
                 fprintf(stderr, "close failed: %d\n", rc);
 
         return 0;
+#endif /* HAVE_CONNECT */
 }
 
 int jt_ptl_disconnect(int argc, char **argv)
@@ -951,7 +1005,7 @@ int jt_ptl_disconnect(int argc, char **argv)
                 return 0;
         }
 
-        if (!g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, 0))
+        if (!g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, IIBNAL, 0))
                 return 0;
 
         if (argc >= 2 &&
@@ -1491,11 +1545,11 @@ lwt_snapshot(cycles_t *now, int *ncpu, int *totalsize,
         }
 
         /* crappy overloads */
-        if (data.ioc_nid != sizeof(lwt_event_t) ||
-            data.ioc_nid2 != offsetof(lwt_event_t, lwte_where)) {
+        if (data.ioc_nid2 != sizeof(lwt_event_t) ||
+            data.ioc_nid3 != offsetof(lwt_event_t, lwte_where)) {
                 fprintf(stderr,"kernel/user LWT event mismatch %d(%d),%d(%d)\n",
-                        (int)data.ioc_nid, sizeof(lwt_event_t),
-                        (int)data.ioc_nid2,
+                        (int)data.ioc_nid2, sizeof(lwt_event_t),
+                        (int)data.ioc_nid3,
                         (int)offsetof(lwt_event_t, lwte_where));
                 return (-1);
         }
@@ -1573,12 +1627,21 @@ lwt_put_string(char *ustr)
 static int
 lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t *e)
 {
+#ifndef __WORDSIZE
+# error "__WORDSIZE not defined"
+#elif __WORDSIZE == 32
+# define XFMT "%#010lx"
+#elif __WORDSIZE == 64
+# define XFMT "%#018lx"
+#else
+# error "Unexpected __WORDSIZE"
+#endif
         char           *where = lwt_get_string(e->lwte_where);
 
         if (where == NULL)
                 return (-1);
 
-        fprintf(f, "%#010lx %#010lx %#010lx %#010lx: %#010lx %1d %10.6f %10.2f %s\n",
+        fprintf(f, XFMT" "XFMT" "XFMT" "XFMT": "XFMT" %2d %10.6f %10.2f %s\n",
                 e->lwte_p1, e->lwte_p2, e->lwte_p3, e->lwte_p4,
                 (long)e->lwte_task, cpu, (e->lwte_when - t0) / (mhz * 1000000.0),
                 (t0 == e->lwte_when) ? 0.0 : (e->lwte_when - tlast) / mhz,
@@ -1587,6 +1650,7 @@ lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t
         lwt_put_string(where);
 
         return (0);
+#undef XFMT
 }
 
 double
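
The XFMT block above picks a hex field width matching the machine word so LWT event words line up in columns on both 32- and 64-bit hosts: "%#010lx" prints 10 characters ("0x" plus 8 zero-padded digits), "%#018lx" prints 18. A minimal sketch, assuming a glibc-style <limits.h> that defines __WORDSIZE:

    #include <limits.h>
    #include <stdio.h>

    #if __WORDSIZE == 64
    # define XFMT "%#018lx"         /* "0x" + 16 zero-padded hex digits */
    #else
    # define XFMT "%#010lx"         /* "0x" + 8 zero-padded hex digits */
    #endif

    int main(void)
    {
            unsigned long word = 0xdeadbeefUL;

            printf(XFMT "\n", word);    /* 0x00000000deadbeef on 64-bit */
            return 0;
    }
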
index e2fca96..fe97687 100644 (file)
@@ -218,6 +218,8 @@ portals/knals/Makefile
 portals/knals/autoMakefile
 portals/knals/gmnal/Makefile
 portals/knals/gmnal/autoMakefile
+portals/knals/iibnal/Makefile
+portals/knals/iibnal/autoMakefile
 portals/knals/openibnal/Makefile
 portals/knals/openibnal/autoMakefile
 portals/knals/qswnal/Makefile
index c47d1ac..5ea2c92 100644 (file)
@@ -170,7 +170,7 @@ Index: linux-2.4.24-b1_4/include/linux/kallsyms.h
 +   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 + */
 +
-+#ident "$Id: kksymoops-2.4.24.vanilla.patch,v 1.9 2004/10/24 17:00:18 yury Exp $"
++#ident "$Id: kksymoops-2.4.24.vanilla.patch,v 1.10 2004/10/29 15:04:35 eeb Exp $"
 +
 +#ifndef MODUTILS_KALLSYMS_H
 +#define MODUTILS_KALLSYMS_H 1
index d2bd1a1..021fa68 100644 (file)
@@ -14,26 +14,107 @@ AC_MSG_RESULT([$enable_inkernel])
 AM_CONDITIONAL(INKERNEL, test x$enable_inkernel = xyes)
 
 # -------- are we building against an external portals? -------
-AC_MSG_CHECKING([if Cray portals should be used])
+AC_MSG_CHECKING([for Cray portals])
 AC_ARG_WITH([cray-portals],
        AC_HELP_STRING([--with-cray-portals=path],
                       [path to cray portals]),
        [
                if test "$with_cray_portals" != no; then
-                       if test -r $with_cray_portals/include/portals/api.h ; then
-                               CRAY_PORTALS_PATH=$with_cray_portals
-                               CRAY_PORTALS_INCLUDE="-I$with_cray_portals/include"
-                               AC_DEFINE(CRAY_PORTALS, 1, [Building with Cray Portals])
-                       else
-                               AC_MSG_ERROR([--with-cray-portals specified badly])
-                       fi
-               fi
+                       CRAY_PORTALS_PATH=$with_cray_portals
+                       CRAY_PORTALS_INCLUDES="$with_cray_portals/include"
+                       CRAY_PORTALS_LIBS="$with_cray_portals"
+                fi
        ],[with_cray_portals=no])
 AC_SUBST(CRAY_PORTALS_PATH)
-AC_MSG_RESULT([$with_cray_portals])
+AC_MSG_RESULT([$CRAY_PORTALS_PATH])
+
+AC_MSG_CHECKING([for Cray portals includes])
+AC_ARG_WITH([cray-portals-includes],
+       AC_HELP_STRING([--with-cray-portals-includes=path],
+                      [path to cray portals includes]),
+       [
+               if test "$with_cray_portals_includes" != no; then
+                       CRAY_PORTALS_INCLUDES="$with_cray_portals_includes"
+                fi
+       ])
+AC_SUBST(CRAY_PORTALS_INCLUDES)
+AC_MSG_RESULT([$CRAY_PORTALS_INCLUDES])
+
+AC_MSG_CHECKING([for Cray portals libs])
+AC_ARG_WITH([cray-portals-libs],
+       AC_HELP_STRING([--with-cray-portals-libs=path],
+                      [path to cray portals libs]),
+       [
+               if test "$with_cray_portals_libs" != no; then
+                       CRAY_PORTALS_LIBS="$with_cray_portals_libs"
+                fi
+       ])
+AC_SUBST(CRAY_PORTALS_LIBS)
+AC_MSG_RESULT([$CRAY_PORTALS_LIBS])
+
+if test x$CRAY_PORTALS_INCLUDES != x ; then
+       if test ! -r $CRAY_PORTALS_INCLUDES/portals/api.h ; then
+               AC_MSG_ERROR([Cray portals headers were not found in $CRAY_PORTALS_INCLUDES.  Please check the paths passed to --with-cray-portals or --with-cray-portals-includes.])
+       fi
+fi
+if test x$CRAY_PORTALS_LIBS != x ; then
+       if test ! -r $CRAY_PORTALS_LIBS/libportals.a ; then
+               AC_MSG_ERROR([Cray portals libraries were not found in $CRAY_PORTALS_LIBS.  Please check the paths passed to --with-cray-portals or --with-cray-portals-libs.])
+       fi
+fi
 
+AC_MSG_CHECKING([whether to use Cray portals])
+if test x$CRAY_PORTALS_INCLUDES != x -a x$CRAY_PORTALS_LIBS != x ; then
+       with_cray_portals=yes
+       AC_DEFINE(CRAY_PORTALS, 1, [Building with Cray Portals])
+       CRAY_PORTALS_INCLUDES="-I$CRAY_PORTALS_INCLUDES"
+else
+       with_cray_portals=no
+fi
+AC_MSG_RESULT([$with_cray_portals])
 AM_CONDITIONAL(CRAY_PORTALS, test x$with_cray_portals != xno)
 
+# ----------------------------------------
+# some tests for catamount-like systems
+# ----------------------------------------
+AC_ARG_ENABLE([sysio_init],
+       AC_HELP_STRING([--disable-sysio-init],
+               [do not call sysio init functions when initializing liblustre]),
+       [],[enable_sysio_init=yes])
+AC_MSG_CHECKING([whether to initialize libsysio])
+AC_MSG_RESULT([$enable_sysio_init])
+if test x$enable_sysio_init != xno ; then
+       AC_DEFINE([INIT_SYSIO], 1, [call sysio init functions])
+fi
+
+AC_ARG_ENABLE([urandom],
+       AC_HELP_STRING([--disable-urandom],
+               [disable use of /dev/urandom for liblustre]),
+       [],[enable_urandom=yes])
+AC_MSG_CHECKING([whether to use /dev/urandom for liblustre])
+AC_MSG_RESULT([$enable_urandom])
+if test x$enable_urandom != xno ; then
+       AC_DEFINE([LIBLUSTRE_USE_URANDOM], 1, [use /dev/urandom for random data])
+fi
+
+# -------- check for -lcap and -lpthread ----
+if test x$enable_liblustre = xyes ; then
+       AC_CHECK_LIB([cap], [cap_get_proc],
+               [
+                       CAP_LIBS="-lcap"
+                       AC_DEFINE([HAVE_LIBCAP], 1, [use libcap])
+               ],
+               [CAP_LIBS=""])
+       AC_SUBST(CAP_LIBS)
+       AC_CHECK_LIB([pthread], [pthread_create],
+               [
+                       PTHREAD_LIBS="-lpthread"
+                       AC_DEFINE([HAVE_LIBPTHREAD], 1, [use libpthread])
+               ],
+               [PTHREAD_LIBS=""])
+       AC_SUBST(PTHREAD_LIBS)
+fi
+
 # -------- enable tests and utils? -------
 if test x$enable_tests = xno ; then
        AC_MSG_NOTICE([disabling tests])
@@ -128,7 +209,7 @@ AM_CONDITIONAL(USE_QUILT, test x$QUILT != xno)
 
 # -------  Makeflags ------------------
 
-CPPFLAGS="$CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include"
+CPPFLAGS="$CPPFLAGS $CRAY_PORTALS_INCLUDES -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include"
 
 # liblustre are all the same
 LLCPPFLAGS="-D__arch_lib__ -D_LARGEFILE64_SOURCE=1"
@@ -146,7 +227,7 @@ if test x$enable_ldiskfs = xyes ; then
        AC_DEFINE(CONFIG_LDISKFS_FS_SECURITY, 1, [enable fs security])
 fi
 
-EXTRA_KCFLAGS="-g $CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I$PWD/portals/include -I$PWD/include"
+EXTRA_KCFLAGS="-g $CRAY_PORTALS_INCLUDES -I$PWD/portals/include -I$PWD/include"
 
 # these are like AC_TRY_COMPILE, but try to build modules against the
 # kernel, inside the kernel-tests directory
@@ -408,6 +489,35 @@ if test x$enable_modules != xno ; then
        AC_SUBST(OPENIBCPPFLAGS)
        AC_SUBST(OPENIBNAL)
 
+       #### Infinicon IB
+       AC_MSG_CHECKING([if Infinicon IB kernel headers are present])
+       # for how the only infinicon ib build has headers in /usr/include/iba
+       IIBCPPFLAGS="-I/usr/include -DIN_TREE_BUILD"
+       EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
+       EXTRA_KCFLAGS="$EXTRA_KCFLAGS $IIBCPPFLAGS"
+       LUSTRE_MODULE_TRY_COMPILE(
+               [
+                       #include <linux/iba/ibt.h>
+               ],[
+                       IBT_INTERFACE_UNION interfaces;
+                       FSTATUS             rc;
+
+                       rc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2,
+                                                     &interfaces);
+
+                       return rc == FSUCCESS ? 0 : 1;
+               ],[
+                       AC_MSG_RESULT([yes])
+                       IIBNAL="iibnal"
+               ],[
+                       AC_MSG_RESULT([no])
+                       IIBNAL=""
+                       IIBCPPFLAGS=""
+               ])
+       EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
+       AC_SUBST(IIBCPPFLAGS)
+       AC_SUBST(IIBNAL)
+
        # ---------- Red Hat 2.4.18 has iobuf->dovary --------------
        # But other kernels don't
 
@@ -667,15 +777,34 @@ fi
 AM_CONDITIONAL(BUILD_QSWNAL, test x$QSWNAL = "xqswnal")
 AM_CONDITIONAL(BUILD_GMNAL, test x$GMNAL = "xgmnal")
 AM_CONDITIONAL(BUILD_OPENIBNAL, test x$OPENIBNAL = "xopenibnal")
+AM_CONDITIONAL(BUILD_IIBNAL, test x$IIBNAL = "xiibnal")
+
+# portals/utils/portals.c
+AC_CHECK_HEADERS([netdb.h netinet/tcp.h asm/types.h])
+AC_CHECK_FUNCS([gethostbyname socket connect])
+
+# portals/utils/debug.c
+AC_CHECK_HEADERS([linux/version.h])
+
+# include/liblustre.h
+AC_CHECK_HEADERS([asm/page.h sys/user.h stdint.h])
+
+# liblustre/llite_lib.h
+AC_CHECK_HEADERS([xtio.h file.h])
+
+# liblustre/dir.c
+AC_CHECK_HEADERS([linux/types.h sys/types.h linux/unistd.h unistd.h])
+
+# liblustre/lutil.c
+AC_CHECK_HEADERS([netinet/in.h arpa/inet.h catamount/data.h])
+AC_CHECK_FUNCS([inet_ntoa])
 
 CPPFLAGS="-include \$(top_builddir)/include/config.h $CPPFLAGS"
 EXTRA_KCFLAGS="-include $PWD/include/config.h $EXTRA_KCFLAGS"
 AC_SUBST(EXTRA_KCFLAGS)
 
-#echo "KCPPFLAGS: $KCPPFLAGS"
-#echo "KCFLAGS: $KCFLAGS"
-#echo "LLCPPFLAGS: $LLCPPFLAGS"
-#echo "LLCFLAGS: $LLCFLAGS"
-#echo "MOD_LINK: $MOD_LINK"
-#echo "CFLAGS: $CFLAGS"
-#echo "CPPFLAGS: $CPPFLAGS"
+echo "CPPFLAGS: $CPPFLAGS"
+echo "LLCPPFLAGS: $LLCPPFLAGS"
+echo "CFLAGS: $CFLAGS"
+echo "EXTRA_KCFLAGS: $EXTRA_KCFLAGS"
+echo "LLCFLAGS: $LLCFLAGS"
index 861bb4a..f158396 100644 (file)
@@ -61,6 +61,13 @@ case "$CC_VERSION" in
        "gcc version 2.96 20000731 (Mandrake Linux 8.1 2.96-0.62mdk)")
                bad_cc
                ;;
+       # unpatched 'gcc' on rh9.  miscompiles a
+       #        struct = (type) { .member = value, };
+       # assignment in the iibnal where the struct is a mix
+       # of u64 and u32 bit-fields.
+       "gcc version 3.2.2 20030222 (Red Hat Linux 3.2.2-5)")
+               bad_cc
+               ;;
        *)
                AC_MSG_RESULT([no known problems])
                ;;
@@ -116,3 +123,5 @@ else
        LIBWRAP=""
 fi
 AC_SUBST(LIBWRAP)
+
+AC_SUBST(LIBS)
diff --git a/lustre/portals/include/linux/.cvsignore b/lustre/portals/include/linux/.cvsignore
new file mode 100644
index 0000000..282522d
--- /dev/null
@@ -0,0 +1,2 @@
+Makefile
+Makefile.in
index db63a08..4e24c71 100644
@@ -294,7 +294,6 @@ extern void kportal_blockallsigs (void);
 # include <unistd.h>
 # include <time.h>
 # include <limits.h>
-# include <asm/types.h>
 # ifndef DEBUG_SUBSYSTEM
 #  define DEBUG_SUBSYSTEM S_UNDEFINED
 # endif
@@ -320,6 +319,11 @@ void portals_debug_dumplog(void);
     printf("%02x:%06x (@%lu %s:%s,l. %d %d %lu): " format,                    \
            (subsys), (mask), (long)time(0), file, fn, line,                   \
            getpid() , stack, ## a);
+
+#undef CWARN
+#undef CERROR
+#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a)
+#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a)
 #endif
 
 /* support decl needed both by kernel and liblustre */
@@ -338,6 +342,16 @@ char *portals_id2str(int nal, ptl_process_id_t nid, char *str);
 #define LWT_MEMORY   (16<<20)
 
 #if !KLWT_SUPPORT
+# if defined(__KERNEL__)
+#  if !defined(BITS_PER_LONG)
+#   error "BITS_PER_LONG not defined"
+#  endif
+# elif !defined(__WORDSIZE)
+#  error "__WORDSIZE not defined"
+# else
+#  define BITS_PER_LONG __WORDSIZE
+# endif
+
 /* kernel hasn't defined this? */
 typedef struct {
         long long   lwte_when;
@@ -572,49 +586,42 @@ static inline int portal_ioctl_getdata(char *buf, char *end, void *arg)
         data = (struct portal_ioctl_data *)buf;
 
         err = copy_from_user(buf, (void *)arg, sizeof(*hdr));
-        if ( err ) {
-                EXIT;
-                return err;
-        }
+        if (err)
+                RETURN(err);
 
         if (hdr->ioc_version != PORTAL_IOCTL_VERSION) {
-                CERROR ("PORTALS: version mismatch kernel vs application\n");
-                return -EINVAL;
+                CERROR("PORTALS: version mismatch kernel vs application\n");
+                RETURN(-EINVAL);
         }
 
         if (hdr->ioc_len + buf >= end) {
-                CERROR ("PORTALS: user buffer exceeds kernel buffer\n");
-                return -EINVAL;
+                CERROR("PORTALS: user buffer exceeds kernel buffer\n");
+                RETURN(-EINVAL);
         }
 
 
         if (hdr->ioc_len < sizeof(struct portal_ioctl_data)) {
-                CERROR ("PORTALS: user buffer too small for ioctl\n");
-                return -EINVAL;
+                CERROR("PORTALS: user buffer too small for ioctl\n");
+                RETURN(-EINVAL);
         }
 
         err = copy_from_user(buf, (void *)arg, hdr->ioc_len);
-        if ( err ) {
-                EXIT;
-                return err;
-        }
+        if (err)
+                RETURN(err);
 
         if (portal_ioctl_is_invalid(data)) {
-                CERROR ("PORTALS: ioctl not correctly formatted\n");
-                return -EINVAL;
+                CERROR("PORTALS: ioctl not correctly formatted\n");
+                RETURN(-EINVAL);
         }
 
-        if (data->ioc_inllen1) {
+        if (data->ioc_inllen1)
                 data->ioc_inlbuf1 = &data->ioc_bulk[0];
-        }
 
-        if (data->ioc_inllen2) {
+        if (data->ioc_inllen2)
                 data->ioc_inlbuf2 = &data->ioc_bulk[0] +
                         size_round(data->ioc_inllen1);
-        }
 
-        EXIT;
-        return 0;
+        RETURN(0);
 }
 #endif
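
The hunk above folds each explicit "EXIT; return ..." pair into the RETURN()
helper. A minimal userspace sketch of how ENTRY/EXIT/RETURN-style tracing
macros compose (the macro bodies and getdata() below are illustrative
assumptions, not the tree's definitions):

    #include <stdio.h>

    /* Stand-ins for the portals tracing macros (assumed shapes). */
    #define ENTRY        printf("enter %s\n", __FUNCTION__)
    #define EXIT         printf("leave %s\n", __FUNCTION__)
    #define RETURN(rc)   do { EXIT; return (rc); } while (0)

    static int getdata(int len)
    {
            ENTRY;
            if (len < 0)
                    RETURN(-22);    /* one step for "EXIT; return -EINVAL;" */
            RETURN(0);
    }

    int main(void)
    {
            return getdata(1);
    }
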
 
@@ -645,10 +652,11 @@ enum {
         TCPNAL    = 5,
         ROUTER    = 6,
         OPENIBNAL = 7,
+        IIBNAL    = 8,
         NAL_ENUM_END_MARKER
 };
 
-#define PTL_NALFMT_SIZE              30 /* %u:%u.%u.%u.%u,%u (10+4+4+4+3+4+1) */
+#define PTL_NALFMT_SIZE             32 /* %u:%u.%u.%u.%u,%u (10+4+4+4+3+5+1) */
 
 #define NAL_MAX_NR (NAL_ENUM_END_MARKER - 1)
 
index d1a5c44..8317f14 100644
@@ -4,7 +4,11 @@
 #ifndef _LIBCFS_H
 #define _LIBCFS_H
 
+#ifdef HAVE_ASM_TYPES_H
 #include <asm/types.h>
+#else
+#include "types.h"
+#endif
 
 #ifdef __KERNEL__
 # include <linux/time.h>
@@ -62,7 +66,6 @@ extern unsigned int portal_stack;
 extern unsigned int portal_debug;
 extern unsigned int portal_printk;
 
-#include <asm/types.h>
 struct ptldebug_header {
         __u32 ph_len;
         __u32 ph_flags;
@@ -102,7 +105,7 @@ struct ptldebug_header {
 #define S_GMNAL       0x00080000
 #define S_PTLROUTER   0x00100000
 #define S_COBD        0x00200000
-#define S_OPENIBNAL   0x00400000
+#define S_IBNAL       0x00400000 /* All IB NALs */
 #define S_SM          0x00800000
 #define S_ASOBD       0x01000000
 #define S_LMV         0x02000000
@@ -185,8 +188,40 @@ do {                                                                          \
                                   CDEBUG_STACK, format, ## a);                \
 } while (0)
 
-#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a)
-#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a)
+#define CDEBUG_MAX_LIMIT 600
+#define CDEBUG_LIMIT(cdebug_mask, cdebug_format, a...)                        \
+do {                                                                          \
+        static unsigned long cdebug_next;                                     \
+        static int cdebug_count, cdebug_delay = 1;                            \
+                                                                              \
+        CHECK_STACK(CDEBUG_STACK);                                            \
+        if (time_after(jiffies, cdebug_next)) {                               \
+                portals_debug_msg(DEBUG_SUBSYSTEM, cdebug_mask, __FILE__,     \
+                                  __FUNCTION__, __LINE__, CDEBUG_STACK,       \
+                                  cdebug_format, ## a);                       \
+                if (cdebug_count) {                                           \
+                        portals_debug_msg(DEBUG_SUBSYSTEM, cdebug_mask,       \
+                                          __FILE__, __FUNCTION__, __LINE__,   \
+                                          CDEBUG_STACK, cdebug_format, ## a); \
+                        cdebug_count = 0;                                     \
+                }                                                             \
+                if (time_after(jiffies, cdebug_next+(CDEBUG_MAX_LIMIT+10)*HZ))\
+                        cdebug_delay = cdebug_delay > 8 ? cdebug_delay/8 : 1; \
+                else                                                          \
+                        cdebug_delay = cdebug_delay*2 >= CDEBUG_MAX_LIMIT*HZ ?\
+                                        CDEBUG_MAX_LIMIT*HZ : cdebug_delay*2; \
+                cdebug_next = jiffies + cdebug_delay;                         \
+        } else {                                                              \
+                portals_debug_msg(DEBUG_SUBSYSTEM,                            \
+                                  portal_debug & ~(D_EMERG|D_ERROR|D_WARNING),\
+                                  __FILE__, __FUNCTION__, __LINE__,           \
+                                  CDEBUG_STACK, cdebug_format, ## a);         \
+                cdebug_count++;                                               \
+        }                                                                     \
+} while (0)
+
+#define CWARN(format, a...) CDEBUG_LIMIT(D_WARNING, format, ## a)
+#define CERROR(format, a...) CDEBUG_LIMIT(D_ERROR, format, ## a)
 #define CEMERG(format, a...) CDEBUG(D_EMERG, format, ## a)
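
CDEBUG_LIMIT above rate-limits console warnings and errors: each message that
gets through doubles the quiet period, up to CDEBUG_MAX_LIMIT (600) seconds,
while a long silence shrinks the period again by a factor of 8; suppressed
messages are still logged at a reduced mask and counted. A standalone model of
the backoff arithmetic (a sketch only; the real macro re-emits the message
itself rather than printing a count):

    #include <stdio.h>

    #define HZ        100   /* ticks per second, as on many 2.4 kernels */
    #define MAX_LIMIT 600   /* CDEBUG_MAX_LIMIT, in seconds */

    static unsigned long next_ok, delay = 1;    /* in ticks */
    static int suppressed;

    static void limited_log(unsigned long now, const char *msg)
    {
            if (now <= next_ok) {   /* inside the quiet period */
                    suppressed++;   /* real macro still logs at low priority */
                    return;
            }
            if (suppressed) {
                    printf("t=%lu: (%d suppressed)\n", now, suppressed);
                    suppressed = 0;
            }
            printf("t=%lu: %s\n", now, msg);
            if (now > next_ok + (MAX_LIMIT + 10) * HZ)
                    delay = delay > 8 ? delay / 8 : 1;  /* quiet: relax */
            else
                    delay = delay * 2 >= MAX_LIMIT * HZ ?
                            MAX_LIMIT * HZ : delay * 2; /* noisy: back off */
            next_ok = now + delay;
    }

    int main(void)
    {
            unsigned long t;

            for (t = 1; t <= 5 * HZ; t++)   /* a five-second burst */
                    limited_log(t, "console warning");
            return 0;
    }
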
 
 #define GOTO(label, rc)                                                 \
@@ -229,14 +264,13 @@ do {                                                                    \
 /* initial pid  */
 # if CRAY_PORTALS
 /* 
+ * 1) ptl_pid_t in cray portals is only 16 bits, not 32 bits, therefore this
+ *    is too big.
  *
- * 1) ptl_pid_t in cray portals is only 16 bits, not 32 bits, therefore this is too
- * big.
- *
- * 2) the implementation of ernal in cray portals further restricts the pid space
- * that may be used to 0 <= pid <= 255 (an 8 bit value).  Returns an error at nal
- * init time for any pid outside this range.  Other nals in cray portals don't have
- * this restriction.
+ * 2) the implementation of ernal in cray portals further restricts the pid
+ *    space that may be used to 0 <= pid <= 255 (an 8 bit value).  Returns
+ *    an error at nal init time for any pid outside this range.  Other nals
+ *    in cray portals don't have this restriction.
  * */
 #define LUSTRE_PTL_PID          9
 # else
index 7fe6dfc..5a43a45 100644
@@ -28,6 +28,8 @@
   call_usermodehelper(path, argv, envp, 1)
 # define RECALC_SIGPENDING         recalc_sigpending()
 # define CURRENT_SECONDS           get_seconds()
+# define smp_num_cpus              NR_CPUS
+
 
 #elif defined(CONFIG_RH_2_4_20) /* RH 2.4.x */
 
diff --git a/lustre/portals/include/portals/.cvsignore b/lustre/portals/include/portals/.cvsignore
new file mode 100644
index 0000000..282522d
--- /dev/null
@@ -0,0 +1,2 @@
+Makefile
+Makefile.in
index 5db1352..c219d2a 100644
@@ -1,7 +1,7 @@
 #ifndef _BUILD_CHECK_H
 #define _BUILD_CHECK_H
 
-#ifdef CRAY_PORTALS
+#if CRAY_PORTALS
 #error "an application got to me instead of cray's includes"
 #endif
 
index a81a371..cfddde2 100644
@@ -31,8 +31,6 @@
 #define PORTALS_DEV_PATH "/dev/portals"
 #define OBD_DEV_ID 1
 #define OBD_DEV_PATH "/dev/obd"
-#define SMFS_DEV_ID  2
-#define SMFS_DEV_PATH "/dev/snapdev"
 
 int ptl_name2nal(char *str);
 int ptl_parse_ipaddr (__u32 *ipaddrp, char *str);
@@ -41,9 +39,6 @@ char * ptl_nid2str (char *buffer, ptl_nid_t nid);
 
 int ptl_initialize(int argc, char **argv);
 int jt_ptl_network(int argc, char **argv);
-int jt_ptl_print_autoconnects (int argc, char **argv);
-int jt_ptl_add_autoconnect (int argc, char **argv);
-int jt_ptl_del_autoconnect (int argc, char **argv);
 int jt_ptl_print_interfaces(int argc, char **argv);
 int jt_ptl_add_interface(int argc, char **argv);
 int jt_ptl_del_interface(int argc, char **argv);
@@ -62,9 +57,6 @@ int jt_ptl_add_uuid(int argc, char **argv);
 int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility  */
 int jt_ptl_close_uuid(int argc, char **argv);
 int jt_ptl_del_uuid(int argc, char **argv);
-int jt_ptl_rxmem (int argc, char **argv);
-int jt_ptl_txmem (int argc, char **argv);
-int jt_ptl_nagle (int argc, char **argv);
 int jt_ptl_add_route (int argc, char **argv);
 int jt_ptl_del_route (int argc, char **argv);
 int jt_ptl_notify_router (int argc, char **argv);
index 2a01119..9763d14 100644
@@ -1,5 +1,6 @@
 @BUILD_GMNAL_TRUE@subdir-m += gmnal
 @BUILD_OPENIBNAL_TRUE@subdir-m += openibnal
+@BUILD_IIBNAL_TRUE@subdir-m += iibnal
 @BUILD_QSWNAL_TRUE@subdir-m += qswnal
 subdir-m += socknal
 
index 002c169..0090364 100644
@@ -3,4 +3,4 @@
 # This code is issued under the GNU General Public License.
 # See the file COPYING in this distribution
 
-SUBDIRS = gmnal openibnal qswnal socknal 
+SUBDIRS = gmnal iibnal openibnal qswnal socknal 
diff --git a/lustre/portals/knals/iibnal/.cvsignore b/lustre/portals/knals/iibnal/.cvsignore
new file mode 100644
index 0000000..5ed596b
--- /dev/null
@@ -0,0 +1,10 @@
+.deps
+Makefile
+.*.cmd
+autoMakefile.in
+autoMakefile
+*.ko
+*.mod.c
+.*.flags
+.tmp_versions
+.depend
diff --git a/lustre/portals/knals/iibnal/Makefile.in b/lustre/portals/knals/iibnal/Makefile.in
new file mode 100644
index 0000000..e7934e2
--- /dev/null
@@ -0,0 +1,6 @@
+MODULES := kiibnal
+kiibnal-objs := iibnal.o iibnal_cb.o
+
+EXTRA_POST_CFLAGS := @IIBCPPFLAGS@
+
+@INCLUDE_RULES@
diff --git a/lustre/portals/knals/iibnal/Makefile.mk b/lustre/portals/knals/iibnal/Makefile.mk
new file mode 100644
index 0000000..0459a20
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include $(src)/../../Kernelenv
+
+obj-y += kiibnal.o
+kiibnal-objs := iibnal.o iibnal_cb.o
+
diff --git a/lustre/portals/knals/iibnal/autoMakefile.am b/lustre/portals/knals/iibnal/autoMakefile.am
new file mode 100644
index 0000000..251df66
--- /dev/null
@@ -0,0 +1,15 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+if MODULES
+if !CRAY_PORTALS
+if BUILD_IIBNAL
+modulenet_DATA = kiibnal$(KMODEXT)
+endif
+endif
+endif
+
+MOSTLYCLEANFILES = *.o *.ko *.mod.c
+DIST_SOURCES = $(kiibnal-objs:%.o=%.c) iibnal.h
diff --git a/lustre/portals/knals/iibnal/iibnal.c b/lustre/portals/knals/iibnal/iibnal.c
new file mode 100644
index 0000000..09908c9
--- /dev/null
@@ -0,0 +1,1713 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "iibnal.h"
+
+nal_t                   kibnal_api;
+ptl_handle_ni_t         kibnal_ni;
+kib_tunables_t          kibnal_tunables;
+
+kib_data_t              kibnal_data = {
+        .kib_service_id = IBNAL_SERVICE_NUMBER,
+};
+
+#ifdef CONFIG_SYSCTL
+#define IBNAL_SYSCTL             202
+
+#define IBNAL_SYSCTL_TIMEOUT     1
+
+static ctl_table kibnal_ctl_table[] = {
+        {IBNAL_SYSCTL_TIMEOUT, "timeout", 
+         &kibnal_tunables.kib_io_timeout, sizeof (int),
+         0644, NULL, &proc_dointvec},
+        { 0 }
+};
+
+static ctl_table kibnal_top_ctl_table[] = {
+        {IBNAL_SYSCTL, "iibnal", NULL, 0, 0555, kibnal_ctl_table},
+        { 0 }
+};
+#endif
+
+#ifdef unused
+void
+print_service(IB_SERVICE_RECORD *service, char *tag, int rc)
+{
+        char name[32];
+
+        if (service == NULL) 
+        {
+                CWARN("tag       : %s\n"
+                      "status    : %d (NULL)\n", tag, rc);
+                return;
+        }
+        strncpy (name, service->ServiceName, sizeof(name)-1);
+        name[sizeof(name)-1] = 0;
+        
+        CWARN("tag       : %s\n"
+              "status    : %d\n"
+              "service id: "LPX64"\n"
+              "name      : %s\n"
+              "NID       : "LPX64"\n", tag, rc,
+              service->RID.ServiceID, name,
+              *kibnal_service_nid_field(service));
+}
+#endif
+
+static void
+kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
+                              FSTATUS frc, uint32 madrc)
+{
+        *(FSTATUS *)arg = frc;
+        up (&kibnal_data.kib_nid_signal);
+}
+
+#if IBNAL_CHECK_ADVERT
+static void
+kibnal_service_query_done (void *arg, QUERY *qry, 
+                           QUERY_RESULT_VALUES *qry_result)
+{
+        FSTATUS frc = qry_result->Status;
+
+        if (frc != FSUCCESS &&
+            qry_result->ResultDataSize == 0)
+                frc = FERROR;
+        
+        *(FSTATUS *)arg = frc;
+        up (&kibnal_data.kib_nid_signal);
+}
+
+static void
+kibnal_check_advert (void)
+{
+        QUERY                  *qry;
+        IB_SERVICE_RECORD      *svc;
+        FSTATUS                 frc;
+        FSTATUS                 frc2;
+
+        PORTAL_ALLOC(qry, sizeof(*qry));
+        if (qry == NULL)
+                return;
+
+        memset (qry, 0, sizeof(*qry));
+        qry->InputType = InputTypeServiceRecord;
+        qry->OutputType = OutputTypeServiceRecord;
+        qry->InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
+        svc = &qry->InputValue.ServiceRecordValue.ServiceRecord;
+        kibnal_set_service_keys(svc, kibnal_data.kib_nid);
+
+        frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
+                                                    kibnal_data.kib_port_guid,
+                                                    qry,
+                                                    kibnal_service_query_done,
+                                                    NULL, &frc2);
+        if (frc != FSUCCESS && frc != FPENDING) {
+                CERROR ("Immediate error %d checking SM service\n", frc);
+        } else {
+                down (&kibnal_data.kib_nid_signal);
+                frc = frc2;
+
+                if (frc != FSUCCESS)
+                        CERROR ("Error %d checking SM service\n", frc);
+        }
+
+        return;
+}
+#endif
+
+static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
+{
+        IB_SERVICE_RECORD     *svc;
+
+        memset (fod, 0, sizeof(*fod));
+        fod->Type = type;
+
+        svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+        svc->RID.ServiceID = kibnal_data.kib_service_id;
+        svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid;
+        svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX;
+        svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey;
+        svc->ServiceLease = 0xffffffff;
+
+        kibnal_set_service_keys(svc, kibnal_data.kib_nid);
+}
+
+static int
+kibnal_advertise (void)
+{
+        FABRIC_OPERATION_DATA *fod;
+        IB_SERVICE_RECORD     *svc;
+        FSTATUS                frc;
+        FSTATUS                frc2;
+
+        LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+
+        PORTAL_ALLOC(fod, sizeof(*fod));
+        if (fod == NULL)
+                return (-ENOMEM);
+
+        fill_fod(fod, FabOpSetServiceRecord);
+        svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+
+        CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n", 
+               svc->RID.ServiceID, 
+               svc->ServiceName, *kibnal_service_nid_field(svc));
+
+        frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
+                                            kibnal_data.kib_port_guid,
+                                            fod, kibnal_service_setunset_done, 
+                                            NULL, &frc2);
+
+        if (frc != FSUCCESS && frc != FPENDING) {
+                CERROR ("Immediate error %d advertising NID "LPX64"\n",
+                        frc, kibnal_data.kib_nid);
+                goto out;
+        }
+
+        down (&kibnal_data.kib_nid_signal);
+
+        frc = frc2;
+        if (frc != FSUCCESS)
+                CERROR ("Error %d advertising NID "LPX64"\n",
+                        frc, kibnal_data.kib_nid);
+out:
+        PORTAL_FREE(fod, sizeof(*fod));
+        return (frc == FSUCCESS) ? 0 : -EINVAL;
+}
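
kibnal_advertise() gives an asynchronous fabric call a synchronous shape: it
posts the operation, then sleeps on kib_nid_signal until the completion
callback (kibnal_service_setunset_done) stores the status and ups the
semaphore. A simplified userspace model of that handshake, with POSIX
primitives standing in for the kernel semaphore and all names invented:

    #include <stdio.h>
    #include <pthread.h>
    #include <semaphore.h>

    static sem_t done;

    static void completion_cb(void *arg, int status)
    {
            *(int *)arg = status;   /* hand the result back */
            sem_post(&done);        /* up(&kib_nid_signal) analogue */
    }

    static void *fabric_engine(void *arg)   /* stands in for the SD layer */
    {
            completion_cb(arg, 0);  /* pretend the op succeeded */
            return NULL;
    }

    int main(void)
    {
            pthread_t t;
            int       status = -1;

            sem_init(&done, 0, 0);
            pthread_create(&t, NULL, fabric_engine, &status);
            sem_wait(&done);        /* down(&kib_nid_signal) analogue */
            pthread_join(t, NULL);
            printf("async status: %d\n", status);
            return 0;
    }
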
+
+static void
+kibnal_unadvertise (int expect_success)
+{
+        FABRIC_OPERATION_DATA *fod;
+        IB_SERVICE_RECORD     *svc;
+        FSTATUS                frc;
+        FSTATUS                frc2;
+
+        LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+
+        PORTAL_ALLOC(fod, sizeof(*fod));
+        if (fod == NULL)
+                return;
+
+        fill_fod(fod, FabOpDeleteServiceRecord);
+        svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+
+        CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n",
+               svc->ServiceName, *kibnal_service_nid_field(svc));
+        
+        frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
+                                            kibnal_data.kib_port_guid,
+                                            fod, kibnal_service_setunset_done, 
+                                            NULL, &frc2);
+
+        if (frc != FSUCCESS && frc != FPENDING) {
+                CERROR ("Immediate error %d unadvertising NID "LPX64"\n",
+                        frc, kibnal_data.kib_nid);
+                goto out;
+        }
+
+        down (&kibnal_data.kib_nid_signal);
+
+        if ((frc2 == FSUCCESS) == !!expect_success)
+                goto out;
+
+        if (expect_success)
+                CERROR("Error %d unadvertising NID "LPX64"\n",
+                       frc2, kibnal_data.kib_nid);
+        else
+                CWARN("Removed conflicting NID "LPX64"\n",
+                      kibnal_data.kib_nid);
+ out:
+        PORTAL_FREE(fod, sizeof(*fod));
+}
+
+static int
+kibnal_set_mynid(ptl_nid_t nid)
+{
+        struct timeval tv;
+        lib_ni_t      *ni = &kibnal_lib.libnal_ni;
+        int            rc;
+        FSTATUS        frc;
+
+        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
+               nid, ni->ni_pid.nid);
+
+        do_gettimeofday(&tv);
+
+        down (&kibnal_data.kib_nid_mutex);
+
+        if (nid == kibnal_data.kib_nid) {
+                /* no change of NID */
+                up (&kibnal_data.kib_nid_mutex);
+                return (0);
+        }
+
+        CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
+               kibnal_data.kib_nid, nid);
+        
+        if (kibnal_data.kib_nid != PTL_NID_ANY) {
+
+                kibnal_unadvertise (1);
+
+                frc = iibt_cm_cancel(kibnal_data.kib_cep);
+                if (frc != FSUCCESS && frc != FPENDING)
+                        CERROR ("Error %d stopping listener\n", frc);
+
+                frc = iibt_cm_destroy_cep(kibnal_data.kib_cep);
+                if (frc != FSUCCESS)
+                        CERROR ("Error %d destroying CEP\n", frc);
+
+                kibnal_data.kib_cep = NULL;
+        }
+        
+        kibnal_data.kib_nid = ni->ni_pid.nid = nid;
+        kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+        
+        /* Delete all existing peers and their connections after new
+         * NID/incarnation set to ensure no old connections in our brave
+         * new world. */
+        kibnal_del_peer (PTL_NID_ANY, 0);
+
+        if (kibnal_data.kib_nid == PTL_NID_ANY) {
+                /* No new NID to install */
+                up (&kibnal_data.kib_nid_mutex);
+                return (0);
+        }
+
+        /* remove any previous advert (crashed node etc) */
+        kibnal_unadvertise(0);
+
+        kibnal_data.kib_cep = iibt_cm_create_cep(CM_RC_TYPE);
+        if (kibnal_data.kib_cep == NULL) {
+                CERROR ("Can't create CEP\n");
+                rc = -ENOMEM;
+        } else {
+                CM_LISTEN_INFO info;
+                memset (&info, 0, sizeof(info));
+                info.ListenAddr.EndPt.SID = kibnal_data.kib_service_id;
+
+                frc = iibt_cm_listen(kibnal_data.kib_cep, &info,
+                                     kibnal_listen_callback, NULL);
+                if (frc != FSUCCESS && frc != FPENDING) {
+                        CERROR ("iibt_cm_listen error: %d\n", frc);
+                        rc = -EINVAL;
+                } else {
+                        rc = 0;
+                }
+        }
+        
+        if (rc == 0) {
+                rc = kibnal_advertise();
+                if (rc == 0) {
+#if IBNAL_CHECK_ADVERT
+                        kibnal_check_advert();
+#endif
+                        up (&kibnal_data.kib_nid_mutex);
+                        return (0);
+                }
+                
+                iibt_cm_cancel (kibnal_data.kib_cep);
+                iibt_cm_destroy_cep (kibnal_data.kib_cep);
+                /* remove any peers that sprung up while I failed to
+                 * advertise myself */
+                kibnal_del_peer (PTL_NID_ANY, 0);
+        }
+
+        kibnal_data.kib_nid = PTL_NID_ANY;
+        up (&kibnal_data.kib_nid_mutex);
+        return (rc);
+}
+
+kib_peer_t *
+kibnal_create_peer (ptl_nid_t nid)
+{
+        kib_peer_t *peer;
+
+        LASSERT (nid != PTL_NID_ANY);
+
+        PORTAL_ALLOC (peer, sizeof (*peer));
+        if (peer == NULL)
+                return (NULL);
+
+        memset(peer, 0, sizeof(*peer));         /* zero flags etc */
+
+        peer->ibp_nid = nid;
+        atomic_set (&peer->ibp_refcount, 1);    /* 1 ref for caller */
+
+        INIT_LIST_HEAD (&peer->ibp_list);       /* not in the peer table yet */
+        INIT_LIST_HEAD (&peer->ibp_conns);
+        INIT_LIST_HEAD (&peer->ibp_tx_queue);
+
+        peer->ibp_reconnect_time = jiffies;
+        peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+
+        atomic_inc (&kibnal_data.kib_npeers);
+        return (peer);
+}
+
+void
+kibnal_destroy_peer (kib_peer_t *peer)
+{
+
+        LASSERT (atomic_read (&peer->ibp_refcount) == 0);
+        LASSERT (peer->ibp_persistence == 0);
+        LASSERT (!kibnal_peer_active(peer));
+        LASSERT (peer->ibp_connecting == 0);
+        LASSERT (list_empty (&peer->ibp_conns));
+        LASSERT (list_empty (&peer->ibp_tx_queue));
+
+        PORTAL_FREE (peer, sizeof (*peer));
+
+        /* NB a peer's connections keep a reference on their peer until
+         * they are destroyed, so we can be assured that _all_ state to do
+         * with this peer has been cleaned up when its refcount drops to
+         * zero. */
+        atomic_dec (&kibnal_data.kib_npeers);
+}
+
+/* the caller is responsible for accounting for the additional reference
+ * that this creates */
+kib_peer_t *
+kibnal_find_peer_locked (ptl_nid_t nid)
+{
+        struct list_head *peer_list = kibnal_nid2peerlist (nid);
+        struct list_head *tmp;
+        kib_peer_t       *peer;
+
+        list_for_each (tmp, peer_list) {
+
+                peer = list_entry (tmp, kib_peer_t, ibp_list);
+
+                LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
+                         peer->ibp_connecting != 0 || /* creating conns */
+                         !list_empty (&peer->ibp_conns));  /* active conn */
+
+                if (peer->ibp_nid != nid)
+                        continue;
+
+                CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
+                       peer, nid, atomic_read (&peer->ibp_refcount));
+                return (peer);
+        }
+        return (NULL);
+}
+
+kib_peer_t *
+kibnal_get_peer (ptl_nid_t nid)
+{
+        kib_peer_t     *peer;
+
+        read_lock (&kibnal_data.kib_global_lock);
+        peer = kibnal_find_peer_locked (nid);
+        if (peer != NULL)                       /* +1 ref for caller */
+                kib_peer_addref(peer);
+        read_unlock (&kibnal_data.kib_global_lock);
+
+        return (peer);
+}
+
+void
+kibnal_unlink_peer_locked (kib_peer_t *peer)
+{
+        LASSERT (peer->ibp_persistence == 0);
+        LASSERT (list_empty(&peer->ibp_conns));
+
+        LASSERT (kibnal_peer_active(peer));
+        list_del_init (&peer->ibp_list);
+        /* lose peerlist's ref */
+        kib_peer_decref(peer);
+}
+
+static int
+kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
+{
+        kib_peer_t        *peer;
+        struct list_head  *ptmp;
+        int                i;
+
+        read_lock (&kibnal_data.kib_global_lock);
+
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+
+                list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
+
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
+                        LASSERT (peer->ibp_persistence != 0 ||
+                                 peer->ibp_connecting != 0 ||
+                                 !list_empty (&peer->ibp_conns));
+
+                        if (index-- > 0)
+                                continue;
+
+                        *nidp = peer->ibp_nid;
+                        *persistencep = peer->ibp_persistence;
+
+                        read_unlock (&kibnal_data.kib_global_lock);
+                        return (0);
+                }
+        }
+
+        read_unlock (&kibnal_data.kib_global_lock);
+        return (-ENOENT);
+}
+
+static int
+kibnal_add_persistent_peer (ptl_nid_t nid)
+{
+        unsigned long      flags;
+        kib_peer_t        *peer;
+        kib_peer_t        *peer2;
+        
+        if (nid == PTL_NID_ANY)
+                return (-EINVAL);
+
+        peer = kibnal_create_peer (nid);
+        if (peer == NULL)
+                return (-ENOMEM);
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        peer2 = kibnal_find_peer_locked (nid);
+        if (peer2 != NULL) {
+                kib_peer_decref (peer);
+                peer = peer2;
+        } else {
+                /* peer table takes existing ref on peer */
+                list_add_tail (&peer->ibp_list,
+                               kibnal_nid2peerlist (nid));
+        }
+
+        peer->ibp_persistence++;
+        
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+        return (0);
+}
+
+static void
+kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
+{
+        struct list_head *ctmp;
+        struct list_head *cnxt;
+        kib_conn_t       *conn;
+
+        if (!single_share)
+                peer->ibp_persistence = 0;
+        else if (peer->ibp_persistence > 0)
+                peer->ibp_persistence--;
+
+        if (peer->ibp_persistence != 0)
+                return;
+
+        list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+                conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+                kibnal_close_conn_locked (conn, 0);
+        }
+
+        /* NB peer unlinks itself when last conn is closed */
+}
+
+int
+kibnal_del_peer (ptl_nid_t nid, int single_share)
+{
+        unsigned long      flags;
+        struct list_head  *ptmp;
+        struct list_head  *pnxt;
+        kib_peer_t        *peer;
+        int                lo;
+        int                hi;
+        int                i;
+        int                rc = -ENOENT;
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        if (nid != PTL_NID_ANY)
+                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
+        else {
+                lo = 0;
+                hi = kibnal_data.kib_peer_hash_size - 1;
+        }
+
+        for (i = lo; i <= hi; i++) {
+                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
+                        LASSERT (peer->ibp_persistence != 0 ||
+                                 peer->ibp_connecting != 0 ||
+                                 !list_empty (&peer->ibp_conns));
+
+                        if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
+                                continue;
+
+                        kibnal_del_peer_locked (peer, single_share);
+                        rc = 0;         /* matched something */
+
+                        if (single_share)
+                                goto out;
+                }
+        }
+ out:
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+        return (rc);
+}
+
+static kib_conn_t *
+kibnal_get_conn_by_idx (int index)
+{
+        kib_peer_t        *peer;
+        struct list_head  *ptmp;
+        kib_conn_t        *conn;
+        struct list_head  *ctmp;
+        int                i;
+
+        read_lock (&kibnal_data.kib_global_lock);
+
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+                list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
+
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
+                        LASSERT (peer->ibp_persistence > 0 ||
+                                 peer->ibp_connecting != 0 ||
+                                 !list_empty (&peer->ibp_conns));
+
+                        list_for_each (ctmp, &peer->ibp_conns) {
+                                if (index-- > 0)
+                                        continue;
+
+                                conn = list_entry (ctmp, kib_conn_t, ibc_list);
+                                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                                       atomic_read (&conn->ibc_refcount));
+                                atomic_inc (&conn->ibc_refcount);
+                                read_unlock (&kibnal_data.kib_global_lock);
+                                return (conn);
+                        }
+                }
+        }
+
+        read_unlock (&kibnal_data.kib_global_lock);
+        return (NULL);
+}
+
+kib_conn_t *
+kibnal_create_conn (void)
+{
+        kib_conn_t  *conn;
+        int          i;
+        __u64        vaddr = 0;
+        __u64        vaddr_base;
+        int          page_offset;
+        int          ipage;
+        int          rc;
+        FSTATUS      frc;
+        union {
+                IB_QP_ATTRIBUTES_CREATE    qp_create;
+                IB_QP_ATTRIBUTES_MODIFY    qp_attr;
+        } params;
+        
+        PORTAL_ALLOC (conn, sizeof (*conn));
+        if (conn == NULL) {
+                CERROR ("Can't allocate connection\n");
+                return (NULL);
+        }
+
+        /* zero flags, NULL pointers etc... */
+        memset (conn, 0, sizeof (*conn));
+
+        INIT_LIST_HEAD (&conn->ibc_tx_queue);
+        INIT_LIST_HEAD (&conn->ibc_active_txs);
+        spin_lock_init (&conn->ibc_lock);
+        
+        atomic_inc (&kibnal_data.kib_nconns);
+        /* well not really, but I call destroy() on failure, which decrements */
+
+        PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
+        if (conn->ibc_rxs == NULL)
+                goto failed;
+        memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
+
+        rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1);
+        if (rc != 0)
+                goto failed;
+
+        vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
+
+        for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
+                struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
+                kib_rx_t   *rx = &conn->ibc_rxs[i];
+
+                rx->rx_conn = conn;
+                rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
+                             page_offset);
+
+                if (kibnal_whole_mem()) 
+                        rx->rx_vaddr = kibnal_page2phys(page) + 
+                                       page_offset + 
+                                       kibnal_data.kib_md.md_addr;
+                else
+                        rx->rx_vaddr = vaddr;
+                
+                vaddr += IBNAL_MSG_SIZE;
+                LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
+                
+                page_offset += IBNAL_MSG_SIZE;
+                LASSERT (page_offset <= PAGE_SIZE);
+
+                if (page_offset == PAGE_SIZE) {
+                        page_offset = 0;
+                        ipage++;
+                        LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
+                }
+        }
+
+        params.qp_create = (IB_QP_ATTRIBUTES_CREATE) {
+                .Type                    = QPTypeReliableConnected,
+                .SendQDepth              = IBNAL_TX_MAX_SG * 
+                                           IBNAL_MSG_QUEUE_SIZE,
+                .RecvQDepth              = IBNAL_MSG_QUEUE_SIZE,
+                .SendDSListDepth         = 1,
+                .RecvDSListDepth         = 1,
+                .SendCQHandle            = kibnal_data.kib_cq,
+                .RecvCQHandle            = kibnal_data.kib_cq,
+                .PDHandle                = kibnal_data.kib_pd,
+                .SendSignaledCompletions = TRUE,
+        };
+        frc = iibt_qp_create(kibnal_data.kib_hca, &params.qp_create, NULL,
+                             &conn->ibc_qp, &conn->ibc_qp_attrs);
+        if (frc != FSUCCESS) {
+                CERROR ("Failed to create queue pair: %d\n", frc);
+                goto failed;
+        }
+
+        /* Mark QP created */
+        conn->ibc_state = IBNAL_CONN_INIT_QP;
+
+        params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+                .RequestState             = QPStateInit,
+                .Attrs                    = (IB_QP_ATTR_PORTGUID |
+                                             IB_QP_ATTR_PKEYINDEX |
+                                             IB_QP_ATTR_ACCESSCONTROL),
+                .PortGUID                 = kibnal_data.kib_port_guid,
+                .PkeyIndex                = 0,
+                .AccessControl = {
+                        .s = {
+                                .RdmaWrite = 1,
+                                .RdmaRead  = 1,
+                        },
+                },
+        };
+        frc = iibt_qp_modify(conn->ibc_qp, &params.qp_attr, NULL);
+        if (frc != FSUCCESS) {
+                CERROR ("Failed to modify queue pair: %d\n", frc);
+                goto failed;
+        }
+
+        /* 1 ref for caller */
+        atomic_set (&conn->ibc_refcount, 1);
+        return (conn);
+        
+ failed:
+        kibnal_destroy_conn (conn);
+        return (NULL);
+}
+
+void
+kibnal_destroy_conn (kib_conn_t *conn)
+{
+        int    rc;
+        FSTATUS frc;
+        
+        CDEBUG (D_NET, "connection %p\n", conn);
+
+        LASSERT (atomic_read (&conn->ibc_refcount) == 0);
+        LASSERT (list_empty(&conn->ibc_tx_queue));
+        LASSERT (list_empty(&conn->ibc_active_txs));
+        LASSERT (conn->ibc_nsends_posted == 0);
+        LASSERT (conn->ibc_connreq == NULL);
+
+        switch (conn->ibc_state) {
+        case IBNAL_CONN_DISCONNECTED:
+                /* called after connection sequence initiated */
+                /* fall through */
+
+        case IBNAL_CONN_INIT_QP:
+                /* _destroy includes an implicit Reset of the QP which 
+                 * discards posted work */
+                rc = iibt_qp_destroy(conn->ibc_qp);
+                if (rc != 0)
+                        CERROR("Can't destroy QP: %d\n", rc);
+                /* fall through */
+                
+        case IBNAL_CONN_INIT_NOTHING:
+                break;
+
+        default:
+                LASSERT (0);
+        }
+
+        if (conn->ibc_cep != NULL) {
+                frc = iibt_cm_destroy_cep(conn->ibc_cep);
+                if (frc != 0)
+                        CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep, 
+                               frc);
+        }
+
+        if (conn->ibc_rx_pages != NULL) 
+                kibnal_free_pages(conn->ibc_rx_pages);
+        
+        if (conn->ibc_rxs != NULL)
+                PORTAL_FREE(conn->ibc_rxs, 
+                            IBNAL_RX_MSGS * sizeof(kib_rx_t));
+
+        if (conn->ibc_peer != NULL)
+                kib_peer_decref(conn->ibc_peer);
+
+        PORTAL_FREE(conn, sizeof (*conn));
+
+        atomic_dec(&kibnal_data.kib_nconns);
+        
+        if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
+            kibnal_data.kib_shutdown) {
+                /* I just nuked the last connection on shutdown; wake up
+                 * everyone so they can exit. */
+                wake_up_all(&kibnal_data.kib_sched_waitq);
+                wake_up_all(&kibnal_data.kib_connd_waitq);
+        }
+}
+
+void
+kibnal_put_conn (kib_conn_t *conn)
+{
+        unsigned long flags;
+
+        CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n",
+                conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                atomic_read (&conn->ibc_refcount));
+
+        LASSERT (atomic_read (&conn->ibc_refcount) > 0);
+        if (!atomic_dec_and_test (&conn->ibc_refcount))
+                return;
+
+        /* must disconnect before dropping the final ref */
+        LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED);
+
+        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+
+        list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+        wake_up (&kibnal_data.kib_connd_waitq);
+
+        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+}
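
kibnal_put_conn() never destroys the connection in the caller's context (the
final reference can drop with locks held or in interrupt context); instead the
last decrement queues the conn for the connd thread to reap. The
last-reference test in miniature, with C11 atomics standing in for the
kernel's atomic_dec_and_test():

    #include <stdatomic.h>
    #include <stdio.h>

    struct conn {
            atomic_int refcount;
    };

    /* Returns nonzero when the caller dropped the final reference and the
     * object must be handed off for cleanup (connd in the NAL). */
    static int conn_put(struct conn *c)
    {
            return atomic_fetch_sub(&c->refcount, 1) == 1;
    }

    int main(void)
    {
            struct conn c = { .refcount = 2 };

            printf("%d\n", conn_put(&c));   /* 0: references remain */
            printf("%d\n", conn_put(&c));   /* 1: queue for cleanup */
            return 0;
    }
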
+
+static int
+kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
+{
+        kib_conn_t         *conn;
+        struct list_head   *ctmp;
+        struct list_head   *cnxt;
+        int                 count = 0;
+
+        list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+                conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+                count++;
+                kibnal_close_conn_locked (conn, why);
+        }
+
+        return (count);
+}
+
+int
+kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
+{
+        kib_conn_t         *conn;
+        struct list_head   *ctmp;
+        struct list_head   *cnxt;
+        int                 count = 0;
+
+        list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+                conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+                if (conn->ibc_incarnation == incarnation)
+                        continue;
+
+                CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
+                       peer->ibp_nid, conn->ibc_incarnation, incarnation);
+                
+                count++;
+                kibnal_close_conn_locked (conn, -ESTALE);
+        }
+
+        return (count);
+}
+
+static int
+kibnal_close_matching_conns (ptl_nid_t nid)
+{
+        unsigned long       flags;
+        kib_peer_t         *peer;
+        struct list_head   *ptmp;
+        struct list_head   *pnxt;
+        int                 lo;
+        int                 hi;
+        int                 i;
+        int                 count = 0;
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        if (nid != PTL_NID_ANY)
+                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
+        else {
+                lo = 0;
+                hi = kibnal_data.kib_peer_hash_size - 1;
+        }
+
+        for (i = lo; i <= hi; i++) {
+                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
+
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
+                        LASSERT (peer->ibp_persistence != 0 ||
+                                 peer->ibp_connecting != 0 ||
+                                 !list_empty (&peer->ibp_conns));
+
+                        if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
+                                continue;
+
+                        count += kibnal_close_peer_conns_locked (peer, 0);
+                }
+        }
+
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+        /* wildcards always succeed */
+        if (nid == PTL_NID_ANY)
+                return (0);
+        
+        return (count == 0 ? -ENOENT : 0);
+}
+
+static int
+kibnal_cmd(struct portals_cfg *pcfg, void * private)
+{
+        int rc = -EINVAL;
+        ENTRY;
+
+        LASSERT (pcfg != NULL);
+
+        switch(pcfg->pcfg_command) {
+        case NAL_CMD_GET_PEER: {
+                ptl_nid_t   nid = 0;
+                int         share_count = 0;
+
+                rc = kibnal_get_peer_info(pcfg->pcfg_count,
+                                          &nid, &share_count);
+                pcfg->pcfg_nid   = nid;
+                pcfg->pcfg_size  = 0;
+                pcfg->pcfg_id    = 0;
+                pcfg->pcfg_misc  = 0;
+                pcfg->pcfg_count = 0;
+                pcfg->pcfg_wait  = share_count;
+                break;
+        }
+        case NAL_CMD_ADD_PEER: {
+                rc = kibnal_add_persistent_peer (pcfg->pcfg_nid);
+                break;
+        }
+        case NAL_CMD_DEL_PEER: {
+                rc = kibnal_del_peer (pcfg->pcfg_nid, 
+                                       /* flags == single_share */
+                                       pcfg->pcfg_flags != 0);
+                break;
+        }
+        case NAL_CMD_GET_CONN: {
+                kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
+
+                if (conn == NULL)
+                        rc = -ENOENT;
+                else {
+                        rc = 0;
+                        pcfg->pcfg_nid   = conn->ibc_peer->ibp_nid;
+                        pcfg->pcfg_id    = 0;
+                        pcfg->pcfg_misc  = 0;
+                        pcfg->pcfg_flags = 0;
+                        kibnal_put_conn (conn);
+                }
+                break;
+        }
+        case NAL_CMD_CLOSE_CONNECTION: {
+                rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
+                break;
+        }
+        case NAL_CMD_REGISTER_MYNID: {
+                if (pcfg->pcfg_nid == PTL_NID_ANY)
+                        rc = -EINVAL;
+                else
+                        rc = kibnal_set_mynid (pcfg->pcfg_nid);
+                break;
+        }
+        }
+
+        RETURN(rc);
+}
+
+void
+kibnal_free_pages (kib_pages_t *p)
+{
+        int     npages = p->ibp_npages;
+        int     rc;
+        int     i;
+        
+        if (p->ibp_mapped) {
+                rc = iibt_deregister_memory(p->ibp_handle);
+                if (rc != 0)
+                        CERROR ("Deregister error: %d\n", rc);
+        }
+        
+        for (i = 0; i < npages; i++)
+                if (p->ibp_pages[i] != NULL)
+                        __free_page(p->ibp_pages[i]);
+        
+        PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
+}
+
+int
+kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
+{
+        kib_pages_t                *p;
+        __u64                      *phys_pages;
+        int                         i;
+        FSTATUS                     frc;
+        IB_ACCESS_CONTROL           access;
+
+        memset(&access, 0, sizeof(access));
+        access.s.MWBindable = 1;
+        access.s.LocalWrite = 1;
+        access.s.RdmaRead = 1;
+        access.s.RdmaWrite = 1;
+
+        PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
+        if (p == NULL) {
+                CERROR ("Can't allocate buffer %d\n", npages);
+                return (-ENOMEM);
+        }
+
+        memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
+        p->ibp_npages = npages;
+        
+        for (i = 0; i < npages; i++) {
+                p->ibp_pages[i] = alloc_page (GFP_KERNEL);
+                if (p->ibp_pages[i] == NULL) {
+                        CERROR ("Can't allocate page %d of %d\n", i, npages);
+                        kibnal_free_pages(p);
+                        return (-ENOMEM);
+                }
+        }
+
+        if (kibnal_whole_mem())
+                goto out;
+
+        PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
+        if (phys_pages == NULL) {
+                CERROR ("Can't allocate physarray for %d pages\n", npages);
+                /* kibnal_free_pages() frees the pages allocated so far */
+                kibnal_free_pages(p);
+                return (-ENOMEM);
+        }
+
+        /* if we were using the _contig_ registration variant we would have
+         * an array of PhysAddr/Length pairs, but the discontiguous variant
+         * just takes the PhysAddr */
+        for (i = 0; i < npages; i++)
+                phys_pages[i] = kibnal_page2phys(p->ibp_pages[i]);
+
+        frc = iibt_register_physical_memory(kibnal_data.kib_hca,
+                                            0,          /* requested vaddr */
+                                            phys_pages, npages,
+                                            0,          /* offset */
+                                            kibnal_data.kib_pd,
+                                            access,
+                                            &p->ibp_handle, &p->ibp_vaddr,
+                                            &p->ibp_lkey, &p->ibp_rkey);
+        
+        PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
+        
+        if (frc != FSUCCESS) {
+                CERROR ("Error %d mapping %d pages\n", frc, npages);
+                kibnal_free_pages(p);
+                return (-ENOMEM);
+        }
+
+        CDEBUG(D_NET, "registered %d pages; handle: %p vaddr "LPX64" "
+                      "lkey %x rkey %x\n", npages, p->ibp_handle,
+                      p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
+        
+        p->ibp_mapped = 1;
+out:
+        *pp = p;
+        return (0);
+}
+
+static int
+kibnal_setup_tx_descs (void)
+{
+        int           ipage = 0;
+        int           page_offset = 0;
+        __u64         vaddr;
+        __u64         vaddr_base;
+        struct page  *page;
+        kib_tx_t     *tx;
+        int           i;
+        int           rc;
+
+        /* pre-mapped messages are not bigger than 1 page */
+        LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
+
+        /* No fancy arithmetic when we do the buffer calculations */
+        LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
+
+        rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, 
+                                0);
+        if (rc != 0)
+                return (rc);
+
+        /* ignored for the whole_mem case */
+        vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
+
+        for (i = 0; i < IBNAL_TX_MSGS; i++) {
+                page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
+                tx = &kibnal_data.kib_tx_descs[i];
+
+                memset (tx, 0, sizeof(*tx));    /* zero flags etc */
+                
+                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
+                                            page_offset);
+
+                if (kibnal_whole_mem()) 
+                        tx->tx_vaddr = kibnal_page2phys(page) + 
+                                       page_offset + 
+                                       kibnal_data.kib_md.md_addr;
+                else
+                        tx->tx_vaddr = vaddr;
+
+                tx->tx_isnblk = (i >= IBNAL_NTX);
+                tx->tx_mapped = KIB_TX_UNMAPPED;
+
+                CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", 
+                       i, tx, tx->tx_msg, tx->tx_vaddr);
+
+                if (tx->tx_isnblk)
+                        list_add (&tx->tx_list, 
+                                  &kibnal_data.kib_idle_nblk_txs);
+                else
+                        list_add (&tx->tx_list, 
+                                  &kibnal_data.kib_idle_txs);
+
+                vaddr += IBNAL_MSG_SIZE;
+                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
+
+                page_offset += IBNAL_MSG_SIZE;
+                LASSERT (page_offset <= PAGE_SIZE);
+
+                if (page_offset == PAGE_SIZE) {
+                        page_offset = 0;
+                        ipage++;
+                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
+                }
+        }
+        
+        return (0);
+}
+
+static void
+kibnal_api_shutdown (nal_t *nal)
+{
+        int   i;
+        int   rc;
+
+        if (nal->nal_refct != 0) {
+                /* NI still has users; just drop this module's ref */
+                PORTAL_MODULE_UNUSE;
+                return;
+        }
+
+        CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+
+        LASSERT(nal == &kibnal_api);
+
+        switch (kibnal_data.kib_init) {
+        default:
+                CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
+                LBUG();
+
+        case IBNAL_INIT_ALL:
+                /* stop calls to nal_cmd */
+                libcfs_nal_cmd_unregister(IIBNAL);
+                /* No new peers */
+
+                /* resetting my NID to unadvertises me, removes my
+                 * listener and nukes all current peers */
+                kibnal_set_mynid (PTL_NID_ANY);
+
+                /* Wait for all peer state to clean up (crazy) */
+                i = 2;
+                while (atomic_read (&kibnal_data.kib_npeers) != 0) {
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                               "waiting for %d peers to disconnect (can take a few seconds)\n",
+                               atomic_read (&kibnal_data.kib_npeers));
+                        set_current_state (TASK_UNINTERRUPTIBLE);
+                        schedule_timeout (HZ);
+                }
+                /* fall through */
+
+        case IBNAL_INIT_CQ:
+                rc = iibt_cq_destroy(kibnal_data.kib_cq);
+                if (rc != 0)
+                        CERROR ("Destroy CQ error: %d\n", rc);
+                /* fall through */
+
+        case IBNAL_INIT_TXD:
+                kibnal_free_pages (kibnal_data.kib_tx_pages);
+                /* fall through */
+
+        case IBNAL_INIT_MR:
+                if (kibnal_data.kib_md.md_handle != NULL) {
+                        rc = iibt_deregister_memory(kibnal_data.kib_md.md_handle);
+                        if (rc != FSUCCESS)
+                                CERROR ("Deregister memory: %d\n", rc);
+                }
+                /* fall through */
+
+#if IBNAL_FMR
+        case IBNAL_INIT_FMR:
+                rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
+                if (rc != 0)
+                        CERROR ("Destroy FMR pool error: %d\n", rc);
+                /* fall through */
+#endif
+        case IBNAL_INIT_PD:
+                rc = iibt_pd_free(kibnal_data.kib_pd);
+                if (rc != 0)
+                        CERROR ("Destroy PD error: %d\n", rc);
+                /* fall through */
+
+        case IBNAL_INIT_SD:
+                rc = iibt_sd_deregister(kibnal_data.kib_sd);
+                if (rc != 0)
+                        CERROR ("Deregister SD error: %d\n", rc);
+                /* fall through */
+
+        case IBNAL_INIT_PORT:
+                /* XXX ??? */
+                /* fall through */
+
+        case IBNAL_INIT_PORTATTRS:
+                PORTAL_FREE(kibnal_data.kib_hca_attrs.PortAttributesList,
+                            kibnal_data.kib_hca_attrs.PortAttributesListSize);
+                /* fall through */
+
+        case IBNAL_INIT_HCA:
+                rc = iibt_close_hca(kibnal_data.kib_hca);
+                if (rc != 0)
+                        CERROR ("Close HCA  error: %d\n", rc);
+                /* fall through */
+
+        case IBNAL_INIT_LIB:
+                lib_fini(&kibnal_lib);
+                /* fall through */
+
+        case IBNAL_INIT_DATA:
+                /* Module refcount only gets to zero when all peers
+                 * have been closed so all lists must be empty */
+                LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
+                LASSERT (kibnal_data.kib_peers != NULL);
+                for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+                        LASSERT (list_empty (&kibnal_data.kib_peers[i]));
+                }
+                LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
+                LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
+                LASSERT (list_empty (&kibnal_data.kib_sched_txq));
+                LASSERT (list_empty (&kibnal_data.kib_connd_conns));
+                LASSERT (list_empty (&kibnal_data.kib_connd_peers));
+
+                /* flag threads to terminate; wake and wait for them to die */
+                kibnal_data.kib_shutdown = 1;
+                wake_up_all (&kibnal_data.kib_sched_waitq);
+                wake_up_all (&kibnal_data.kib_connd_waitq);
+
+                i = 2;
+                while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                               "Waiting for %d threads to terminate\n",
+                               atomic_read (&kibnal_data.kib_nthreads));
+                        set_current_state (TASK_INTERRUPTIBLE);
+                        schedule_timeout (HZ);
+                }
+                /* fall through */
+                
+        case IBNAL_INIT_NOTHING:
+                break;
+        }
+
+        if (kibnal_data.kib_tx_descs != NULL)
+                PORTAL_FREE (kibnal_data.kib_tx_descs,
+                             IBNAL_TX_MSGS * sizeof(kib_tx_t));
+
+        if (kibnal_data.kib_peers != NULL)
+                PORTAL_FREE (kibnal_data.kib_peers,
+                             sizeof (struct list_head) * 
+                             kibnal_data.kib_peer_hash_size);
+
+        CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+        printk(KERN_INFO "Lustre: Infinicon IB NAL unloaded (final mem %d)\n",
+               atomic_read(&portal_kmemory));
+
+        kibnal_data.kib_init = IBNAL_INIT_NOTHING;
+}
+
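+/* rounds val up to a multiple of power (power must be a power of two);
+ * e.g. roundup_power(0x9000000ULL, 128*1024*1024) == 0x10000000ULL */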
+#define roundup_power(val, power) \
+        ( ((val) + (__u64)((power) - 1)) & ~((__u64)((power) - 1)) )
+
+/* this isn't very portable or sturdy in the face of funny mem/bus configs */
+static __u64 max_phys_mem(IB_CA_ATTRIBUTES *ca_attr)
+{
+        struct sysinfo si;
+        __u64 ret;
+
+        /* XXX we don't bother with first-gen cards */
+        if (ca_attr->VendorId == 0xd0b7 && ca_attr->DeviceId == 0x3101)
+                return 0ULL;
+
+        si_meminfo(&si);
+        ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit;
+        return roundup_power(ret, 128 * 1024 * 1024);
+} 
+#undef roundup_power
+
+static int
+kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
+                     ptl_ni_limits_t *requested_limits,
+                     ptl_ni_limits_t *actual_limits)
+{
+        ptl_process_id_t    process_id;
+        int                 pkmem = atomic_read(&portal_kmemory);
+        IB_PORT_ATTRIBUTES *pattr;
+        FSTATUS             frc;
+        int                 rc;
+        int                 n;
+        int                 i;
+
+        LASSERT (nal == &kibnal_api);
+
+        if (nal->nal_refct != 0) {
+                if (actual_limits != NULL)
+                        *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
+                /* NI already initialised: just take another module ref */
+                PORTAL_MODULE_USE;
+                return (PTL_OK);
+        }
+
+        LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
+
+        frc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2, 
+                                       &kibnal_data.kib_interfaces);
+        if (frc != FSUCCESS) {
+                CERROR("IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2) = %d\n",
+                        frc);
+                return PTL_FAIL;
+        }
+
+        init_MUTEX (&kibnal_data.kib_nid_mutex);
+        init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal);
+        kibnal_data.kib_nid = PTL_NID_ANY;
+
+        rwlock_init(&kibnal_data.kib_global_lock);
+
+        kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
+        PORTAL_ALLOC (kibnal_data.kib_peers,
+                      sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
+        if (kibnal_data.kib_peers == NULL) {
+                goto failed;
+        }
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
+                INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
+
+        spin_lock_init (&kibnal_data.kib_connd_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
+        INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
+        init_waitqueue_head (&kibnal_data.kib_connd_waitq);
+
+        spin_lock_init (&kibnal_data.kib_sched_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
+        INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
+        init_waitqueue_head (&kibnal_data.kib_sched_waitq);
+
+        spin_lock_init (&kibnal_data.kib_tx_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
+        INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
+        init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
+
+        PORTAL_ALLOC (kibnal_data.kib_tx_descs,
+                      IBNAL_TX_MSGS * sizeof(kib_tx_t));
+        if (kibnal_data.kib_tx_descs == NULL) {
+                CERROR ("Can't allocate tx descs\n");
+                goto failed;
+        }
+
+        /* lists/ptrs/locks initialised */
+        kibnal_data.kib_init = IBNAL_INIT_DATA;
+        /*****************************************************/
+
+        process_id.pid = 0;
+        process_id.nid = kibnal_data.kib_nid;
+        
+        rc = lib_init(&kibnal_lib, nal, process_id,
+                      requested_limits, actual_limits);
+        if (rc != PTL_OK) {
+                CERROR("lib_init failed: error %d\n", rc);
+                goto failed;
+        }
+
+        /* lib interface initialised */
+        kibnal_data.kib_init = IBNAL_INIT_LIB;
+        /*****************************************************/
+
+        for (i = 0; i < IBNAL_N_SCHED; i++) {
+                rc = kibnal_thread_start (kibnal_scheduler, (void *)((long)i));
+                if (rc != 0) {
+                        CERROR("Can't spawn iibnal scheduler[%d]: %d\n",
+                               i, rc);
+                        goto failed;
+                }
+        }
+
+        rc = kibnal_thread_start (kibnal_connd, NULL);
+        if (rc != 0) {
+                CERROR ("Can't spawn iibnal connd: %d\n", rc);
+                goto failed;
+        }
+
+        n = sizeof(kibnal_data.kib_hca_guids) /
+            sizeof(kibnal_data.kib_hca_guids[0]);
+        frc = iibt_get_hca_guids(&n, kibnal_data.kib_hca_guids);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't get channel adapter guids: %d\n", frc);
+                goto failed;
+        }
+        if (n == 0) {
+                CERROR ("No channel adapters found\n");
+                goto failed;
+        }
+
+        /* Infinicon has per-HCA rather than per-CQ completion handlers */
+        frc = iibt_open_hca(kibnal_data.kib_hca_guids[0],
+                            kibnal_ca_callback,
+                            kibnal_ca_async_callback,
+                            &kibnal_data.kib_hca,
+                            &kibnal_data.kib_hca);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't open CA[0]: %d\n", frc);
+                goto failed;
+        }
+        
+        /* Channel Adapter opened */
+        kibnal_data.kib_init = IBNAL_INIT_HCA;
+        /*****************************************************/
+
+        kibnal_data.kib_hca_attrs.PortAttributesList = NULL;
+        kibnal_data.kib_hca_attrs.PortAttributesListSize = 0;
+        frc = iibt_query_hca(kibnal_data.kib_hca,
+                             &kibnal_data.kib_hca_attrs, NULL);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't size port attrs: %d\n", frc);
+                goto failed;
+        }
+        
+        PORTAL_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList,
+                     kibnal_data.kib_hca_attrs.PortAttributesListSize);
+        if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL)
+                goto failed;
+
+        /* Port attrs allocated */
+        kibnal_data.kib_init = IBNAL_INIT_PORTATTRS;
+        /*****************************************************/
+        
+        frc = iibt_query_hca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs,
+                             NULL);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't get port attrs for CA 0: %d\n", frc);
+                goto failed;
+        }
+
+        for (i = 0, pattr = kibnal_data.kib_hca_attrs.PortAttributesList;
+             pattr != NULL;
+             i++, pattr = pattr->Next) {
+                switch (pattr->PortState) {
+                default:
+                        CERROR("Unexpected port[%d] state %d\n",
+                               i, pattr->PortState);
+                        continue;
+                case PortStateDown:
+                        CDEBUG(D_NET, "port[%d] Down\n", i);
+                        continue;
+                case PortStateInit:
+                        CDEBUG(D_NET, "port[%d] Init\n", i);
+                        continue;
+                case PortStateArmed:
+                        CDEBUG(D_NET, "port[%d] Armed\n", i);
+                        continue;
+                        
+                case PortStateActive:
+                        CDEBUG(D_NET, "port[%d] Active\n", i);
+                        kibnal_data.kib_port = i;
+                        kibnal_data.kib_port_guid = pattr->GUID;
+                        kibnal_data.kib_port_pkey = pattr->PkeyTable[0];
+                        break;
+                }
+                break;
+        }
+
+        if (pattr == NULL) {
+                CERROR ("Can't find an active port\n");
+                goto failed;
+        }
+
+        CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid);
+        
+        /* Active port found */
+        kibnal_data.kib_init = IBNAL_INIT_PORT;
+        /*****************************************************/
+
+        frc = iibt_sd_register(&kibnal_data.kib_sd, NULL);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't register with SD: %d\n", frc);
+                goto failed;
+        }
+        
+        /* Registered with SD OK */
+        kibnal_data.kib_init = IBNAL_INIT_SD;
+        /*****************************************************/
+
+        frc = iibt_pd_allocate(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't create PD: %d\n", rc);
+                goto failed;
+        }
+        
+        /* flag PD initialised */
+        kibnal_data.kib_init = IBNAL_INIT_PD;
+        /*****************************************************/
+
+#if IBNAL_FMR
+        {
+                const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
+                struct ib_fmr_pool_param params = {
+                        .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
+                        .access            = (IB_ACCESS_LOCAL_WRITE |
+                                              IB_ACCESS_REMOTE_WRITE |
+                                              IB_ACCESS_REMOTE_READ),
+                        .pool_size         = pool_size,
+                        .dirty_watermark   = (pool_size * 3)/4,
+                        .flush_function    = NULL,
+                        .flush_arg         = NULL,
+                        .cache             = 1,
+                };
+                rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params,
+                                        &kibnal_data.kib_fmr_pool);
+                if (rc != 0) {
+                        CERROR ("Can't create FMR pool size %d: %d\n", 
+                                pool_size, rc);
+                        goto failed;
+                }
+        }
+
+        /* flag FMR pool initialised */
+        kibnal_data.kib_init = IBNAL_INIT_FMR;
+#endif
+        /*****************************************************/
+        if (IBNAL_WHOLE_MEM) {
+                IB_MR_PHYS_BUFFER phys;
+                IB_ACCESS_CONTROL access;
+                kib_md_t *md = &kibnal_data.kib_md;
+
+                memset(&access, 0, sizeof(access));
+                access.s.MWBindable = 1;
+                access.s.LocalWrite = 1;
+                access.s.RdmaRead = 1;
+                access.s.RdmaWrite = 1;
+
+                phys.PhysAddr = 0;
+                phys.Length = max_phys_mem(&kibnal_data.kib_hca_attrs);
+                if (phys.Length == 0) {
+                        CERROR ("couldn't determine the end of phys mem\n");
+                        goto failed;
+                }
+       
+                frc = iibt_register_contig_physical_memory(kibnal_data.kib_hca,
+                                                           0,
+                                                           &phys, 1,
+                                                           0,
+                                                           kibnal_data.kib_pd,
+                                                           access,
+                                                           &md->md_handle,
+                                                           &md->md_addr,
+                                                           &md->md_lkey,
+                                                           &md->md_rkey);
+                if (frc != FSUCCESS) {
+                        CERROR("registering physical memory failed: %d\n",
+                               frc);
+                        CERROR("falling back to registration per-rdma\n");
+                        md->md_handle = NULL;
+                } else {
+                        CDEBUG(D_NET, "registered "LPU64" bytes of mem\n",
+                               phys.Length);
+                        kibnal_data.kib_init = IBNAL_INIT_MR;
+                }
+        }
+
+        /*****************************************************/
+
+        rc = kibnal_setup_tx_descs();
+        if (rc != 0) {
+                CERROR ("Can't register tx descs: %d\n", rc);
+                goto failed;
+        }
+        
+        /* flag TX descs initialised */
+        kibnal_data.kib_init = IBNAL_INIT_TXD;
+        /*****************************************************/
+        
+        {
+                uint32 nentries;
+
+                frc = iibt_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
+                                     &kibnal_data.kib_cq, &kibnal_data.kib_cq,
+                                     &nentries);
+                if (frc != FSUCCESS) {
+                        CERROR ("Can't create RX CQ: %d\n", frc);
+                        goto failed;
+                }
+
+                /* flag CQ initialised */
+                kibnal_data.kib_init = IBNAL_INIT_CQ;
+
+                if (nentries < IBNAL_CQ_ENTRIES) {
+                        CERROR ("CQ only has %d entries, need %d\n", 
+                                nentries, IBNAL_CQ_ENTRIES);
+                        goto failed;
+                }
+
+                frc = iibt_cq_rearm(kibnal_data.kib_cq, CQEventSelNextWC);
+                if (frc != FSUCCESS) {
+                        CERROR ("Failed to re-arm completion queue: %d\n", frc);
+                        goto failed;
+                }
+        }
+        
+        /*****************************************************/
+
+        rc = libcfs_nal_cmd_register(IIBNAL, &kibnal_cmd, NULL);
+        if (rc != 0) {
+                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
+                goto failed;
+        }
+
+        /* flag everything initialised */
+        kibnal_data.kib_init = IBNAL_INIT_ALL;
+        /*****************************************************/
+
+        printk(KERN_INFO "Lustre: Infinicon IB NAL loaded "
+               "(initial mem %d)\n", pkmem);
+
+        return (PTL_OK);
+
+ failed:
+        kibnal_api_shutdown (&kibnal_api);    
+        return (PTL_FAIL);
+}
+
+void __exit
+kibnal_module_fini (void)
+{
+#ifdef CONFIG_SYSCTL
+        if (kibnal_tunables.kib_sysctl != NULL)
+                unregister_sysctl_table (kibnal_tunables.kib_sysctl);
+#endif
+        PtlNIFini(kibnal_ni);
+
+        ptl_unregister_nal(IIBNAL);
+}
+
+int __init
+kibnal_module_init (void)
+{
+        int    rc;
+
+        if (sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN) {
+                CERROR("sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN\n");
+                return -EINVAL;
+        }
+
+        /* the following must be sizeof(int) for proc_dointvec() */
+        if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) {
+                CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n");
+                return -EINVAL;
+        }
+
+        kibnal_api.nal_ni_init = kibnal_api_startup;
+        kibnal_api.nal_ni_fini = kibnal_api_shutdown;
+
+        /* Initialise dynamic tunables to defaults once only */
+        kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
+
+        rc = ptl_register_nal(IIBNAL, &kibnal_api);
+        if (rc != PTL_OK) {
+                CERROR("Can't register IBNAL: %d\n", rc);
+                return (-ENOMEM);               /* no closer errno for this */
+        }
+
+        /* Pure gateways want the NAL started up at module load time... */
+        rc = PtlNIInit(IIBNAL, 0, NULL, NULL, &kibnal_ni);
+        if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
+                ptl_unregister_nal(IIBNAL);
+                return (-ENODEV);
+        }
+        
+#ifdef CONFIG_SYSCTL
+        /* Press on regardless even if registering sysctl doesn't work */
+        kibnal_tunables.kib_sysctl = 
+                register_sysctl_table (kibnal_top_ctl_table, 0);
+#endif
+        return (0);
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Kernel Infinicon IB NAL v0.01");
+MODULE_LICENSE("GPL");
+
+module_init(kibnal_module_init);
+module_exit(kibnal_module_fini);
+
diff --git a/lustre/portals/knals/iibnal/iibnal.h b/lustre/portals/knals/iibnal/iibnal.h
new file mode 100644 (file)
index 0000000..0a25a9a
--- /dev/null
@@ -0,0 +1,892 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#define DEBUG_SUBSYSTEM S_IBNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+#include <portals/nal.h>
+
+#include <linux/iba/ibt.h>
+
+#define GCC_VERSION (__GNUC__ * 10000 \
+                + __GNUC_MINOR__ * 100 \
+                + __GNUC_PATCHLEVEL__)
+
+/* Test for GCC > 3.2.2 */
+#if GCC_VERSION <= 30202
+/* GCC 3.2.2, and presumably several versions before it, will
+ * miscompile this driver. See
+ * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9853. */
+#error Invalid GCC version. Must use GCC >= 3.2.3
+#endif
+
+#define IBNAL_SERVICE_NAME   "iibnal"
+#define IBNAL_SERVICE_NUMBER 0x11b9a1
+
+#if CONFIG_SMP
+# define IBNAL_N_SCHED      num_online_cpus()   /* # schedulers */
+#else
+# define IBNAL_N_SCHED      1                   /* # schedulers */
+#endif
+
+#define IBNAL_MIN_RECONNECT_INTERVAL HZ         /* first failed connection retry... */
+#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ)    /* ...exponentially increasing to this */
+
+#define IBNAL_MSG_SIZE       (4<<10)            /* max size of queued messages (inc hdr) */
+
+#define IBNAL_MSG_QUEUE_SIZE   8                /* # messages/RDMAs in-flight */
+#define IBNAL_CREDIT_HIGHWATER 7                /* when to eagerly return credits */
+#define IBNAL_RETRY            5                /* # times to retry */
+/* an RNR retry count of 7 means retry forever; Infinicon recommended 5 */
+#define IBNAL_RNR_RETRY        5                /* # receiver-not-ready retries */
+#define IBNAL_CM_RETRY         5                /* # times to retry connection */
+#define IBNAL_FLOW_CONTROL     1
+#define IBNAL_ACK_TIMEOUT       20              /* 4.096us * 2^20 ~= 4 secs */
+
+#define IBNAL_NTX             64                /* # tx descs */
+/* this had to be dropped down so that we only register < 255 pages per
+ * region.  this will change if we register all memory. */
+#define IBNAL_NTX_NBLK        128               /* # reserved tx descs */
+
+#define IBNAL_PEER_HASH_SIZE  101               /* # peer lists */
+
+#define IBNAL_RESCHED         100               /* # scheduler loops before reschedule */
+
+#define IBNAL_CONCURRENT_PEERS 1000             /* # nodes all talking at once to me */
+
+/* default vals for runtime tunables */
+#define IBNAL_IO_TIMEOUT      50                /* default comms timeout (seconds) */
+
+/************************/
+/* derived constants... */
+
+/* TX messages (shared by all connections) */
+#define IBNAL_TX_MSGS       (IBNAL_NTX + IBNAL_NTX_NBLK)
+#define IBNAL_TX_MSG_BYTES  (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_TX_MSG_PAGES  ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
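+/* e.g. with 4K pages: 192 4K messages -> 768K of buffers == 192 pages */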
+
+#define IBNAL_TX_MAX_SG (PTL_MD_MAX_IOV + 1)
+
+/* RX messages (per connection) */
+#define IBNAL_RX_MSGS       IBNAL_MSG_QUEUE_SIZE
+#define IBNAL_RX_MSG_BYTES  (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_RX_MSG_PAGES  ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+
+
+/* we may have up to 2 completions per transmit +
+   1 completion per receive, per connection */
+#define IBNAL_CQ_ENTRIES  ((2*IBNAL_TX_MSGS) +                          \
+                           (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS))
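+/* with the defaults above: (2 * 192) + (8 * 1000) = 8384 CQ entries */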
+
+#define IBNAL_RDMA_BASE  0x0eeb0000
+#define IBNAL_FMR        0
+#define IBNAL_WHOLE_MEM  1
+#define IBNAL_CKSUM      0
+//#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_PROCESS
+#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_INTERRUPT
+
+/* XXX I have no idea. */
+#define IBNAL_STARTING_PSN 1
+
+typedef struct 
+{
+        int               kib_io_timeout;       /* comms timeout (seconds) */
+        struct ctl_table_header *kib_sysctl;    /* sysctl interface */
+} kib_tunables_t;
+
+/* some of these have specific types in the stack that just map back
+ * to the uFOO types, like IB_{L,R}_KEY. */
+typedef struct
+{
+        int               ibp_npages;           /* # pages */
+        int               ibp_mapped;           /* mapped? */
+        __u64             ibp_vaddr;            /* mapped region vaddr */
+        __u32             ibp_lkey;             /* mapped region lkey */
+        __u32             ibp_rkey;             /* mapped region rkey */
+        IB_HANDLE         ibp_handle;           /* mapped region handle */
+        struct page      *ibp_pages[0];
+} kib_pages_t;
+
+typedef struct
+{
+        IB_HANDLE         md_handle;
+        __u32             md_lkey;
+        __u32             md_rkey;
+        __u64             md_addr;
+} kib_md_t __attribute__((packed));
+        
+typedef struct 
+{
+        int               kib_init;             /* initialisation state */
+        __u64             kib_incarnation;      /* which one am I */
+        int               kib_shutdown;         /* shut down? */
+        atomic_t          kib_nthreads;         /* # live threads */
+
+        __u64             kib_service_id;       /* service number I listen on */
+        __u64             kib_port_guid;        /* my GUID (lo 64 of GID)*/
+        __u16             kib_port_pkey;        /* my pkey, whatever that is */
+        ptl_nid_t         kib_nid;              /* my NID */
+        struct semaphore  kib_nid_mutex;        /* serialise NID ops */
+        struct semaphore  kib_nid_signal;       /* signal completion */
+        IB_HANDLE         kib_cep;              /* connection end point */
+
+        rwlock_t          kib_global_lock;      /* stabilize peer/conn ops */
+
+        struct list_head *kib_peers;            /* hash table of all my known peers */
+        int               kib_peer_hash_size;   /* size of kib_peers */
+        atomic_t          kib_npeers;           /* # peers extant */
+        atomic_t          kib_nconns;           /* # connections extant */
+
+        struct list_head  kib_connd_conns;      /* connections to progress */
+        struct list_head  kib_connd_peers;      /* peers waiting for a connection */
+        wait_queue_head_t kib_connd_waitq;      /* connection daemons sleep here */
+        unsigned long     kib_connd_waketime;   /* when connd will wake */
+        spinlock_t        kib_connd_lock;       /* serialise */
+
+        wait_queue_head_t kib_sched_waitq;      /* schedulers sleep here */
+        struct list_head  kib_sched_txq;        /* tx requiring attention */
+        struct list_head  kib_sched_rxq;        /* rx requiring attention */
+        spinlock_t        kib_sched_lock;       /* serialise */
+        
+        struct kib_tx    *kib_tx_descs;         /* all the tx descriptors */
+        kib_pages_t      *kib_tx_pages;         /* premapped tx msg pages */
+
+        struct list_head  kib_idle_txs;         /* idle tx descriptors */
+        struct list_head  kib_idle_nblk_txs;    /* idle reserved tx descriptors */
+        wait_queue_head_t kib_idle_tx_waitq;    /* block here for tx descriptor */
+        __u64             kib_next_tx_cookie;   /* RDMA completion cookie */
+        spinlock_t        kib_tx_lock;          /* serialise */
+        
+        IB_HANDLE         kib_hca;              /* The HCA */
+        int               kib_port;             /* port on the device */
+        IB_HANDLE         kib_pd;               /* protection domain */
+        IB_HANDLE         kib_sd;               /* SD handle */
+        IB_HANDLE         kib_cq;               /* completion queue */
+        kib_md_t          kib_md;               /* full-mem registration */
+
+        void             *kib_listen_handle;    /* where I listen for connections */
+
+        IBT_INTERFACE_UNION kib_interfaces;     /* The Infinicon IBT interface */
+
+        uint64              kib_hca_guids[8];   /* all the HCA guids */
+        IB_CA_ATTRIBUTES    kib_hca_attrs;      /* where to get HCA attrs */
+        FABRIC_OPERATION_DATA kib_fabopdata;    /* (un)advertise service record */
+} kib_data_t;
+
+#define IBNAL_INIT_NOTHING         0
+#define IBNAL_INIT_DATA            1
+#define IBNAL_INIT_LIB             2
+#define IBNAL_INIT_HCA             3
+#define IBNAL_INIT_PORTATTRS       4
+#define IBNAL_INIT_PORT            5
+#define IBNAL_INIT_SD              6
+#define IBNAL_INIT_PD              7
+#define IBNAL_INIT_FMR             8
+#define IBNAL_INIT_MR              9
+#define IBNAL_INIT_TXD             10 
+#define IBNAL_INIT_CQ              11 
+#define IBNAL_INIT_ALL             12 
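+
+/* kib_init climbs this ladder as kibnal_api_startup() progresses;
+ * kibnal_api_shutdown() switches on the value reached and falls through
+ * the cases in reverse order, undoing the steps that completed. */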
+
+/************************************************************************
+ * Wire message structs.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ * CAVEAT EMPTOR: other structs communicated between nodes (e.g. MAD
+ * private data and SM service info) are LE on the wire.
+ */
+
+/* also kib_md_t above */
+
+typedef struct
+{
+        __u32                 rd_key;           /* remote key */
+        __u32                 rd_nob;           /* # of bytes */
+        __u64                 rd_addr;          /* remote io vaddr */
+} kib_rdma_desc_t __attribute__((packed));
+
+typedef struct
+{
+        ptl_hdr_t         ibim_hdr;             /* portals header */
+        char              ibim_payload[0];      /* piggy-backed payload */
+} kib_immediate_msg_t __attribute__((packed));
+
+/* These arrays serve two purposes during rdma.  They are built on the
+ * passive side and sent to the active side as remote arguments; there the
+ * descs are walked to construct the local gather items.  The two roles
+ * give desc->rd_key a split local/remote meaning. */
+typedef struct
+{
+        ptl_hdr_t         ibrm_hdr;             /* portals header */
+        __u64             ibrm_cookie;          /* opaque completion cookie */
+        __u32             ibrm_num_descs;       /* how many descs */
+        kib_rdma_desc_t   ibrm_desc[0];         /* where to suck/blow */
+} kib_rdma_msg_t __attribute__((packed));
+
+#define kib_rdma_msg_len(num_descs) \
+        offsetof(kib_msg_t, ibm_u.rdma.ibrm_desc[num_descs])
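+/* each packed kib_rdma_desc_t is 16 bytes, so e.g. a one-desc rdma msg
+ * is just the fixed kib_msg_t header plus one 16-byte descriptor */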
+
+typedef struct
+{
+        __u64             ibcm_cookie;          /* opaque completion cookie */
+        __u32             ibcm_status;          /* completion status */
+} kib_completion_msg_t __attribute__((packed));
+
+typedef struct
+{
+        __u32              ibm_magic;           /* I'm an openibnal message */
+        __u16              ibm_version;         /* this is my version number */
+        __u8               ibm_type;            /* msg type */
+        __u8               ibm_credits;         /* returned credits */
+#if IBNAL_CKSUM
+        __u32              ibm_nob;
+        __u32              ibm_cksum;
+#endif
+        union {
+                kib_immediate_msg_t   immediate;
+                kib_rdma_msg_t        rdma;
+                kib_completion_msg_t  completion;
+        } ibm_u __attribute__((packed));
+} kib_msg_t __attribute__((packed));
+
+#define IBNAL_MSG_MAGIC       0x0be91b91        /* unique magic */
+#define IBNAL_MSG_VERSION              1        /* current protocol version */
+
+#define IBNAL_MSG_NOOP              0xd0        /* nothing (just credits) */
+#define IBNAL_MSG_IMMEDIATE         0xd1        /* portals hdr + payload */
+#define IBNAL_MSG_PUT_RDMA          0xd2        /* portals PUT hdr + source rdma desc */
+#define IBNAL_MSG_PUT_DONE          0xd3        /* signal PUT rdma completion */
+#define IBNAL_MSG_GET_RDMA          0xd4        /* portals GET hdr + sink rdma desc */
+#define IBNAL_MSG_GET_DONE          0xd5        /* signal GET rdma completion */
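+
+/* Roughly: bulk PUTs send PUT_RDMA (hdr + source desc) and the peer
+ * RDMAs the data before acknowledging with PUT_DONE; GETs send GET_RDMA
+ * (hdr + sink desc) and the peer replies with GET_DONE.  Anything that
+ * fits in IBNAL_MSG_SIZE travels as IMMEDIATE with the payload inline. */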
+
+/***********************************************************************/
+
+typedef struct kib_rx                           /* receive message */
+{
+        struct list_head          rx_list;      /* queue for attention */
+        struct kib_conn          *rx_conn;      /* owning conn */
+        int                       rx_rdma;      /* RDMA completion posted? */
+        int                       rx_posted;    /* posted? */
+        __u64                     rx_vaddr;     /* pre-mapped buffer (hca vaddr) */
+        kib_msg_t                *rx_msg;       /* pre-mapped buffer (host vaddr) */
+        IB_WORK_REQ               rx_wrq;
+        IB_LOCAL_DATASEGMENT      rx_gl;        /* and its memory */
+} kib_rx_t;
+
+typedef struct kib_tx                           /* transmit message */
+{
+        struct list_head          tx_list;      /* queue on idle_txs, ibc_tx_queue etc. */
+        int                       tx_isnblk;    /* I'm reserved for non-blocking sends */
+        struct kib_conn          *tx_conn;      /* owning conn */
+        int                       tx_mapped;    /* mapped for RDMA? */
+        int                       tx_sending;   /* # tx callbacks outstanding */
+        int                       tx_status;    /* completion status */
+        unsigned long             tx_deadline;  /* completion deadline */
+        int                       tx_passive_rdma; /* peer sucks/blows */
+        int                       tx_passive_rdma_wait; /* waiting for peer to complete */
+        __u64                     tx_passive_rdma_cookie; /* completion cookie */
+        lib_msg_t                *tx_libmsg[2]; /* lib msgs to finalize on completion */
+        kib_md_t                  tx_md;        /* RDMA mapping (active/passive) */
+        __u64                     tx_vaddr;     /* pre-mapped buffer (hca vaddr) */
+        kib_msg_t                *tx_msg;       /* pre-mapped buffer (host vaddr) */
+        int                       tx_nsp;       /* # send work items */
+        IB_WORK_REQ               tx_wrq[IBNAL_TX_MAX_SG];    /* send work items... */
+        IB_LOCAL_DATASEGMENT      tx_gl[IBNAL_TX_MAX_SG];     /* ...and their memory */
+} kib_tx_t;
+
+#define KIB_TX_UNMAPPED       0
+#define KIB_TX_MAPPED         1
+#define KIB_TX_MAPPED_FMR     2
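+
+/* kibnal_tx_done() switches on tx_mapped to pick the deregistration path */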
+
+typedef struct kib_wire_connreq
+{
+        __u32        wcr_magic;                 /* I'm an openibnal connreq */
+        __u16        wcr_version;               /* this is my version number */
+        __u16        wcr_queue_depth;           /* this is my receive queue size */
+        __u64        wcr_nid;                   /* peer's NID */
+        __u64        wcr_incarnation;           /* peer's incarnation */
+} kib_wire_connreq_t;
+
+typedef struct kib_gid
+{
+        __u64   hi, lo;
+} kib_gid_t;
+
+typedef struct kib_connreq
+{
+        /* connection-in-progress */
+        struct kib_conn                    *cr_conn;
+        kib_wire_connreq_t                  cr_wcr;
+        __u64                               cr_tid;
+        IB_SERVICE_RECORD                   cr_service;
+        kib_gid_t                           cr_gid;
+        IB_PATH_RECORD                      cr_path;
+        CM_REQUEST_INFO                     cr_cmreq;
+        CM_CONN_INFO                        cr_discarded;
+        CM_REJECT_INFO                      cr_rej_info;
+} kib_connreq_t;
+
+typedef struct kib_conn
+{ 
+        struct kib_peer    *ibc_peer;           /* owning peer */
+        struct list_head    ibc_list;           /* stash on peer's conn list */
+        __u64               ibc_incarnation;    /* which instance of the peer */
+        atomic_t            ibc_refcount;       /* # users */
+        int                 ibc_state;          /* what's happening */
+        atomic_t            ibc_nob;            /* # bytes buffered */
+        int                 ibc_nsends_posted;  /* # uncompleted sends */
+        int                 ibc_credits;        /* # credits I have */
+        int                 ibc_outstanding_credits; /* # credits to return */
+        int                 ibc_rcvd_disconnect;/* received discon request */
+        int                 ibc_sent_disconnect;/* sent discon request */
+        struct list_head    ibc_tx_queue;       /* send queue */
+        struct list_head    ibc_active_txs;     /* active tx awaiting completion */
+        spinlock_t          ibc_lock;           /* serialise */
+        kib_rx_t           *ibc_rxs;            /* the rx descs */
+        kib_pages_t        *ibc_rx_pages;       /* premapped rx msg pages */
+        IB_HANDLE           ibc_qp;             /* queue pair */
+        IB_HANDLE           ibc_cep;            /* connection ID? */
+        IB_QP_ATTRIBUTES_QUERY ibc_qp_attrs;    /* QP attrs */
+        kib_connreq_t      *ibc_connreq;        /* connection request state */
+} kib_conn_t;
+
+#define IBNAL_CONN_INIT_NOTHING      0          /* initial state */
+#define IBNAL_CONN_INIT_QP           1          /* ibc_qp set up */
+#define IBNAL_CONN_CONNECTING        2          /* started to connect */
+#define IBNAL_CONN_ESTABLISHED       3          /* connection established */
+#define IBNAL_CONN_SEND_DREQ         4          /* to send disconnect req */
+#define IBNAL_CONN_DREQ              5          /* sent disconnect req */
+#define IBNAL_CONN_DREP              6          /* sent disconnect rep */
+#define IBNAL_CONN_DISCONNECTED      7          /* no more QP or CM traffic */
+
+#define KIB_ASSERT_CONN_STATE(conn, state) do {                         \
+        LASSERTF((conn)->ibc_state == state, "%d\n", conn->ibc_state);  \
+} while (0)
+
+#define KIB_ASSERT_CONN_STATE_RANGE(conn, low, high) do {               \
+        LASSERTF(low <= high, "%d %d\n", low, high);                    \
+        LASSERTF((conn)->ibc_state >= low && (conn)->ibc_state <= high, \
+                 "%d\n", conn->ibc_state);                              \
+} while (0)
+
+typedef struct kib_peer
+{
+        struct list_head    ibp_list;           /* stash on global peer list */
+        struct list_head    ibp_connd_list;     /* schedule on kib_connd_peers */
+        ptl_nid_t           ibp_nid;            /* who's on the other end(s) */
+        atomic_t            ibp_refcount;       /* # users */
+        int                 ibp_persistence;    /* "known" peer refs */
+        struct list_head    ibp_conns;          /* all active connections */
+        struct list_head    ibp_tx_queue;       /* msgs waiting for a conn */
+        int                 ibp_connecting;     /* connecting+accepting */
+        unsigned long       ibp_reconnect_time; /* when reconnect may be attempted */
+        unsigned long       ibp_reconnect_interval; /* exponential backoff */
+} kib_peer_t;
+
+
+extern lib_nal_t       kibnal_lib;
+extern kib_data_t      kibnal_data;
+extern kib_tunables_t  kibnal_tunables;
+
+/******************************************************************************/
+/* Infinicon IBT interface wrappers */
+#define IIBT_IF (kibnal_data.kib_interfaces.ver2)
+
+static inline FSTATUS
+iibt_get_hca_guids(uint32 *hca_count, EUI64 *hca_guid_list)
+{
+        return IIBT_IF.GetCaGuids(hca_count, hca_guid_list);
+}
+
+static inline FSTATUS
+iibt_open_hca(EUI64                    hca_guid, 
+             IB_COMPLETION_CALLBACK   completion_callback,
+             IB_ASYNC_EVENT_CALLBACK  async_event_callback,
+             void                    *arg,
+             IB_HANDLE               *handle)
+{
+        return IIBT_IF.Vpi.OpenCA(hca_guid, completion_callback,
+                                  async_event_callback, arg, handle);
+}
+
+static inline FSTATUS
+iibt_query_hca(IB_HANDLE hca_handle, IB_CA_ATTRIBUTES *attrs, void **argp)
+{
+        return IIBT_IF.Vpi.QueryCA(hca_handle, attrs, argp);
+}
+
+static inline FSTATUS
+iibt_close_hca(IB_HANDLE hca_handle)
+{
+        return IIBT_IF.Vpi.CloseCA(hca_handle);
+}
+
+static inline FSTATUS
+iibt_pd_allocate(IB_HANDLE hca_handle, __u32 max_avs, IB_HANDLE *pd_handle)
+{
+        return IIBT_IF.Vpi.AllocatePD(hca_handle, max_avs, pd_handle);
+}
+
+static inline FSTATUS
+iibt_pd_free(IB_HANDLE pd_handle)
+{
+        return IIBT_IF.Vpi.FreePD(pd_handle);
+}
+
+static inline FSTATUS
+iibt_register_physical_memory(IB_HANDLE hca_handle, 
+                              IB_VIRT_ADDR requested_io_va,
+                              void *phys_buffers, uint64 nphys_buffers,
+                              uint32 io_va_offset, IB_HANDLE pd_handle,
+                              IB_ACCESS_CONTROL access,
+                              IB_HANDLE *mem_handle, 
+                              IB_VIRT_ADDR *actual_io_va,
+                              IB_L_KEY *lkey, IB_R_KEY *rkey)
+{
+        return IIBT_IF.Vpi.RegisterPhysMemRegion(hca_handle, requested_io_va,
+                                                 phys_buffers, nphys_buffers,
+                                                 io_va_offset, pd_handle, 
+                                                 access,
+                                                 mem_handle, actual_io_va,
+                                                 lkey, rkey);
+}
+
+static inline FSTATUS
+iibt_register_contig_physical_memory(IB_HANDLE hca_handle, 
+                                     IB_VIRT_ADDR requested_io_va,
+                                     IB_MR_PHYS_BUFFER *phys_buffers, 
+                                     uint64 nphys_buffers,
+                                     uint32 io_va_offset, IB_HANDLE pd_handle,
+                                     IB_ACCESS_CONTROL access,
+                                     IB_HANDLE *mem_handle, 
+                                     IB_VIRT_ADDR *actual_io_va,
+                                     IB_L_KEY *lkey, IB_R_KEY *rkey)
+{
+        return IIBT_IF.Vpi.RegisterContigPhysMemRegion(hca_handle, 
+                                                       requested_io_va,
+                                                       phys_buffers, 
+                                                       nphys_buffers,
+                                                       io_va_offset, pd_handle, 
+                                                       access,
+                                                       mem_handle, actual_io_va,
+                                                       lkey, rkey);
+}
+
+static inline FSTATUS
+iibt_register_memory(IB_HANDLE hca_handle, 
+                     void *virt_addr, unsigned int length,
+                     IB_HANDLE pd_handle,
+                     IB_ACCESS_CONTROL access,
+                     IB_HANDLE *mem_handle, 
+                     IB_L_KEY *lkey, IB_R_KEY *rkey)
+{
+        return IIBT_IF.Vpi.RegisterMemRegion(hca_handle, 
+                                             virt_addr, length,
+                                             pd_handle, 
+                                             access,
+                                             mem_handle,
+                                             lkey, rkey);
+}
+
+static inline FSTATUS
+iibt_deregister_memory(IB_HANDLE mem_handle)
+{
+        return IIBT_IF.Vpi.DeregisterMemRegion(mem_handle);
+}
+
+static inline FSTATUS
+iibt_cq_create(IB_HANDLE hca_handle, uint32 requested_size,
+              void *arg, IB_HANDLE *cq_handle, uint32 *actual_size)
+{
+        return IIBT_IF.Vpi.CreateCQ(hca_handle, requested_size,
+                                   arg, cq_handle, actual_size);
+}
+
+static inline FSTATUS
+iibt_cq_poll(IB_HANDLE cq_handle, IB_WORK_COMPLETION *wc)
+{
+        return IIBT_IF.Vpi.PollCQ(cq_handle, wc);
+}
+
+static inline FSTATUS
+iibt_cq_rearm(IB_HANDLE cq_handle, IB_CQ_EVENT_SELECT select)
+{
+        return IIBT_IF.Vpi.RearmCQ(cq_handle, select);
+}
+
+static inline FSTATUS
+iibt_cq_destroy(IB_HANDLE cq_handle)
+{
+        return IIBT_IF.Vpi.DestroyCQ(cq_handle);
+}
+
+static inline FSTATUS
+iibt_qp_create(IB_HANDLE hca_handle, IB_QP_ATTRIBUTES_CREATE *create_attr,
+              void *arg, IB_HANDLE *qp_handle,
+              IB_QP_ATTRIBUTES_QUERY *query_attr)
+{
+        return IIBT_IF.Vpi.CreateQP(hca_handle, create_attr, arg, qp_handle,
+                                    query_attr);
+}
+
+static inline FSTATUS
+iibt_qp_query(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_QUERY *query_attr,
+              void **arg_ptr)
+{
+        return IIBT_IF.Vpi.QueryQP(qp_handle, query_attr, arg_ptr);
+}
+
+static inline FSTATUS
+iibt_qp_modify(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_MODIFY *modify_attr,
+               IB_QP_ATTRIBUTES_QUERY *query_attr)
+{
+        return IIBT_IF.Vpi.ModifyQP(qp_handle, modify_attr, query_attr);
+}
+
+static inline FSTATUS
+iibt_qp_destroy(IB_HANDLE qp_handle)
+{
+        return IIBT_IF.Vpi.DestroyQP(qp_handle);
+}
+
+static inline FSTATUS
+iibt_postrecv(IB_HANDLE qp_handle, IB_WORK_REQ *work_req)
+{
+        return IIBT_IF.Vpi.PostRecv(qp_handle, work_req);
+}
+
+static inline FSTATUS
+iibt_postsend(IB_HANDLE qp_handle, IB_WORK_REQ *work_req)
+{
+        return IIBT_IF.Vpi.PostSend(qp_handle, work_req);
+}
+
+static inline FSTATUS
+iibt_sd_register(IB_HANDLE *sd_handle, CLIENT_CONTROL_PARAMETERS *p)
+{
+        return IIBT_IF.Sdi.Register(sd_handle, p);
+}
+
+static inline FSTATUS
+iibt_sd_deregister(IB_HANDLE sd_handle)
+{
+        return IIBT_IF.Sdi.Deregister(sd_handle);
+}
+
+static inline FSTATUS
+iibt_sd_port_fabric_operation(IB_HANDLE sd_handle, EUI64 port_guid,
+                              FABRIC_OPERATION_DATA *fod,
+                              PFABRIC_OPERATION_CALLBACK callback,
+                              COMMAND_CONTROL_PARAMETERS *p, void *arg)
+{
+        return IIBT_IF.Sdi.PortFabricOperation(sd_handle, port_guid,
+                                               fod, callback, p, arg);
+}
+
+static inline FSTATUS
+iibt_sd_query_port_fabric_information(IB_HANDLE sd_handle, EUI64 port_guid,
+                                      QUERY *qry,
+                                      PQUERY_CALLBACK callback,
+                                      COMMAND_CONTROL_PARAMETERS *p, void *arg)
+{
+        return IIBT_IF.Sdi.QueryPortFabricInformation(sd_handle, port_guid,
+                                                      qry, callback, p, arg);
+}
+
+static inline IB_HANDLE
+iibt_cm_create_cep(CM_CEP_TYPE type)
+{
+        return IIBT_IF.Cmi.CmCreateCEP(type);
+}
+
+static inline FSTATUS
+iibt_cm_modify_cep(IB_HANDLE cep, uint32 attr, char* value, uint32 len,
+                   uint32 offset)
+{
+        return IIBT_IF.Cmi.CmModifyCEP(cep, attr, value, len, offset);
+}
+
+static inline FSTATUS
+iibt_cm_destroy_cep(IB_HANDLE cep_handle)
+{
+        return IIBT_IF.Cmi.CmDestroyCEP(cep_handle);
+}
+
+static inline FSTATUS
+iibt_cm_listen(IB_HANDLE cep, CM_LISTEN_INFO *info,
+               PFN_CM_CALLBACK callback, void *arg)
+{
+        return IIBT_IF.Cmi.CmListen(cep, info, callback, arg);
+}
+
+static inline FSTATUS
+iibt_cm_cancel(IB_HANDLE cep)
+{
+        return IIBT_IF.Cmi.CmCancel(cep);
+}
+
+static inline FSTATUS
+iibt_cm_accept(IB_HANDLE cep, 
+               CM_CONN_INFO *send_info, CM_CONN_INFO *recv_info,
+               PFN_CM_CALLBACK callback, void *arg,
+               IB_HANDLE *new_cep)
+{
+        return IIBT_IF.Cmi.CmAccept(cep,
+                                    send_info, recv_info,
+                                    callback, arg, new_cep);
+}
+
+static inline FSTATUS
+iibt_cm_reject(IB_HANDLE cep, CM_REJECT_INFO *rej)
+{
+        return IIBT_IF.Cmi.CmReject(cep, rej);
+}
+
+static inline FSTATUS
+iibt_cm_disconnect(IB_HANDLE cep, CM_DREQUEST_INFO *req,
+                   CM_DREPLY_INFO *reply)
+{
+        return IIBT_IF.Cmi.CmDisconnect(cep, req, reply);
+}
+
+static inline FSTATUS
+iibt_cm_connect (IB_HANDLE cep, CM_REQUEST_INFO *req,
+                 PFN_CM_CALLBACK callback, void *arg)
+{
+        return IIBT_IF.Cmi.CmConnect (cep, req, callback, arg);
+}
+
+static inline int wrq_signals_completion(IB_WORK_REQ *wrq)
+{
+        return wrq->Req.SendRC.Options.s.SignaledCompletion == 1;
+}
+
+
+/******************************************************************************/
+
+/* these are purposely avoiding using local vars so they don't increase
+ * stack consumption. */
+
+#define kib_peer_addref(peer) do {                                      \
+        LASSERTF(atomic_read(&(peer)->ibp_refcount) > 0, "%d\n",        \
+                 atomic_read(&(peer)->ibp_refcount));                   \
+        CDEBUG(D_NET, "++peer[%p] -> "LPX64" (%d)\n",                   \
+               (peer), (peer)->ibp_nid,                                 \
+               atomic_read(&(peer)->ibp_refcount));                     \
+        atomic_inc(&(peer)->ibp_refcount);                              \
+} while (0)
+
+#define kib_peer_decref(peer) do {                                      \
+        LASSERTF(atomic_read(&(peer)->ibp_refcount) > 0, "%d\n",        \
+                 atomic_read(&(peer)->ibp_refcount));                   \
+        CDEBUG(D_NET, "--peer[%p] -> "LPX64" (%d)\n",                   \
+               (peer), (peer)->ibp_nid,                                 \
+               atomic_read(&(peer)->ibp_refcount));                     \
+        if (atomic_dec_and_test(&(peer)->ibp_refcount)) {               \
+                CDEBUG(D_NET, "destroying peer "LPX64" %p\n",           \
+                       (peer)->ibp_nid, (peer));                        \
+                kibnal_destroy_peer(peer);                              \
+        }                                                               \
+} while (0)
+
+/******************************************************************************/
+
+static inline struct list_head *
+kibnal_nid2peerlist (ptl_nid_t nid) 
+{
+        unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size;
+        
+        return (&kibnal_data.kib_peers [hash]);
+}
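+/* i.e. the low 32 bits of the NID, mod kib_peer_hash_size (101 by
+ * default), select the bucket */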
+
+static inline int
+kibnal_peer_active(kib_peer_t *peer)
+{
+        /* Am I in the peer hash table? */
+        return (!list_empty(&peer->ibp_list));
+}
+
+static inline void
+kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
+{
+        /* CAVEAT EMPTOR: tx takes caller's ref on conn */
+
+        LASSERT (tx->tx_nsp > 0);               /* work items set up */
+        LASSERT (tx->tx_conn == NULL);          /* only set here */
+
+        tx->tx_conn = conn;
+        tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
+        list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+}
+
+#define KIBNAL_SERVICE_KEY_MASK  (IB_SERVICE_RECORD_COMP_SERVICENAME |          \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_1 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_2 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_3 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_4 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_5 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_6 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_7 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_8)
+
+static inline __u64*
+kibnal_service_nid_field(IB_SERVICE_RECORD *srv)
+{
+        /* must be consistent with KIBNAL_SERVICE_KEY_MASK */
+        return (__u64 *)srv->ServiceData8;
+}
+
+
+static inline void
+kibnal_set_service_keys(IB_SERVICE_RECORD *srv, ptl_nid_t nid)
+{
+        LASSERT (strlen(IBNAL_SERVICE_NAME) < sizeof(srv->ServiceName));
+        memset (srv->ServiceName, 0, sizeof(srv->ServiceName));
+        strcpy (srv->ServiceName, IBNAL_SERVICE_NAME);
+
+        *kibnal_service_nid_field(srv) = cpu_to_le64(nid);
+}
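+
+/* the service name plus the little-endian NID in ServiceData8[] cover
+ * exactly the components of KIBNAL_SERVICE_KEY_MASK, so SA queries keyed
+ * by that mask should match a single NID's service record */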
+
+#if 0
+static inline void
+kibnal_show_rdma_attr (kib_conn_t *conn)
+{
+        struct ib_qp_attribute qp_attr;
+        int                    rc;
+        
+        memset (&qp_attr, 0, sizeof(qp_attr));
+        rc = ib_qp_query(conn->ibc_qp, &qp_attr);
+        if (rc != 0) {
+                CERROR ("Can't get qp attrs: %d\n", rc);
+                return;
+        }
+        
+        CWARN ("RDMA CAPABILITY: write %s read %s\n",
+               (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ?
+               (qp_attr.enable_rdma_write ? "enabled" : "disabled") : "invalid",
+               (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ?
+               (qp_attr.enable_rdma_read ? "enabled" : "disabled") : "invalid");
+}
+#endif
+
+#if CONFIG_X86
+static inline __u64
+kibnal_page2phys (struct page *p)
+{
+        __u64 page_number = p - mem_map;
+        
+        return (page_number << PAGE_SHIFT);
+}
+#else
+# error "no page->phys"
+#endif
+
+/* CAVEAT EMPTOR:
+ * We rely on tx/rx descriptor alignment to allow us to use the lowest bit
+ * of the work request id as a flag to determine if the completion is for a
+ * transmit or a receive.  It seems that the CQ entry's 'op' field
+ * isn't always set correctly on completions that occur after QP teardown. */
+
+static inline __u64
+kibnal_ptr2wreqid (void *ptr, int isrx)
+{
+        unsigned long lptr = (unsigned long)ptr;
+
+        LASSERT ((lptr & 1) == 0);
+        return (__u64)(lptr | (isrx ? 1 : 0));
+}
+
+static inline void *
+kibnal_wreqid2ptr (__u64 wreqid)
+{
+        return (void *)(((unsigned long)wreqid) & ~1UL);
+}
+
+static inline int
+kibnal_wreqid_is_rx (__u64 wreqid)
+{
+        return (wreqid & 1) != 0;
+}
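+
+/* round trip: provided ptr is at least 2-aligned,
+ * kibnal_wreqid2ptr(kibnal_ptr2wreqid(p, isrx)) == p and
+ * kibnal_wreqid_is_rx() recovers the isrx flag */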
+
+static inline int
+kibnal_whole_mem(void)
+{
+        return kibnal_data.kib_md.md_handle != NULL;
+}
+
+extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid);
+extern void kibnal_destroy_peer (kib_peer_t *peer);
+extern int kibnal_del_peer (ptl_nid_t nid, int single_share);
+extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid);
+extern void kibnal_unlink_peer_locked (kib_peer_t *peer);
+extern int  kibnal_close_stale_conns_locked (kib_peer_t *peer, 
+                                              __u64 incarnation);
+extern kib_conn_t *kibnal_create_conn (void);
+extern void kibnal_put_conn (kib_conn_t *conn);
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+void kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg);
+
+extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access);
+extern void kibnal_free_pages (kib_pages_t *p);
+
+extern void kibnal_check_sends (kib_conn_t *conn);
+extern void kibnal_close_conn_locked (kib_conn_t *conn, int error);
+extern int  kibnal_thread_start (int (*fn)(void *arg), void *arg);
+extern int  kibnal_scheduler(void *arg);
+extern int  kibnal_connd (void *arg);
+extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob);
+extern void kibnal_close_conn (kib_conn_t *conn, int why);
+extern void kibnal_start_active_rdma (int type, int status, 
+                                      kib_rx_t *rx, lib_msg_t *libmsg, 
+                                      unsigned int niov, 
+                                      struct iovec *iov, ptl_kiov_t *kiov,
+                                      size_t offset, size_t nob);
+
+void kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev);
+void kibnal_ca_callback (void *ca_arg, void *cq_arg);
diff --git a/lustre/portals/knals/iibnal/iibnal_cb.c b/lustre/portals/knals/iibnal/iibnal_cb.c
new file mode 100644 (file)
index 0000000..a827ba5
--- /dev/null
@@ -0,0 +1,3018 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "iibnal.h"
+
+/*
+ *  LIB functions follow
+ *
+ */
+static void
+kibnal_schedule_tx_done (kib_tx_t *tx)
+{
+        unsigned long flags;
+
+        spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);
+
+        list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
+        wake_up (&kibnal_data.kib_sched_waitq);
+
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+}
+
+static void
+kibnal_tx_done (kib_tx_t *tx)
+{
+        ptl_err_t        ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
+        unsigned long    flags;
+        int              i;
+        FSTATUS          frc;
+#if IBNAL_FMR
+        int              rc;                    /* used by the FMR teardown path */
+#endif
+
+        LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting callback */
+        LASSERT (!tx->tx_passive_rdma_wait);    /* mustn't be awaiting RDMA */
+
+        switch (tx->tx_mapped) {
+        default:
+                LBUG();
+
+        case KIB_TX_UNMAPPED:
+                break;
+
+        case KIB_TX_MAPPED:
+                if (in_interrupt()) {
+                        /* can't deregister memory in IRQ context... */
+                        kibnal_schedule_tx_done(tx);
+                        return;
+                }
+                frc = iibt_deregister_memory(tx->tx_md.md_handle);
+                LASSERT (frc == FSUCCESS);
+                tx->tx_mapped = KIB_TX_UNMAPPED;
+                break;
+
+#if IBNAL_FMR
+        case KIB_TX_MAPPED_FMR:
+                if (in_interrupt() && tx->tx_status != 0) {
+                        /* can't flush FMRs in IRQ context... */
+                        kibnal_schedule_tx_done(tx);
+                        return;
+                }              
+
+                rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr);
+                LASSERT (rc == 0);
+
+                if (tx->tx_status != 0)
+                        ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
+                tx->tx_mapped = KIB_TX_UNMAPPED;
+                break;
+#endif
+        }
+
+        for (i = 0; i < 2; i++) {
+                /* tx may have up to 2 libmsgs to finalise */
+                if (tx->tx_libmsg[i] == NULL)
+                        continue;
+
+                lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
+                tx->tx_libmsg[i] = NULL;
+        }
+        
+        if (tx->tx_conn != NULL) {
+                kibnal_put_conn (tx->tx_conn);
+                tx->tx_conn = NULL;
+        }
+
+        tx->tx_nsp = 0;
+        tx->tx_passive_rdma = 0;
+        tx->tx_status = 0;
+
+        spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+
+        if (tx->tx_isnblk) {
+                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
+        } else {
+                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
+                wake_up (&kibnal_data.kib_idle_tx_waitq);
+        }
+
+        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+}
+
+static kib_tx_t *
+kibnal_get_idle_tx (int may_block) 
+{
+        unsigned long  flags;
+        kib_tx_t      *tx = NULL;
+        ENTRY;
+        
+        for (;;) {
+                spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+
+                /* "normal" descriptor is free */
+                if (!list_empty (&kibnal_data.kib_idle_txs)) {
+                        tx = list_entry (kibnal_data.kib_idle_txs.next,
+                                         kib_tx_t, tx_list);
+                        break;
+                }
+
+                if (!may_block) {
+                        /* may dip into reserve pool */
+                        if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
+                                CERROR ("reserved tx desc pool exhausted\n");
+                                break;
+                        }
+
+                        tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
+                                         kib_tx_t, tx_list);
+                        break;
+                }
+
+                /* block for idle tx */
+                spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+
+                wait_event (kibnal_data.kib_idle_tx_waitq,
+                            !list_empty (&kibnal_data.kib_idle_txs) ||
+                            kibnal_data.kib_shutdown);
+        }
+
+        if (tx != NULL) {
+                list_del (&tx->tx_list);
+
+                /* Allocate a new passive RDMA completion cookie.  It might
+                 * not be needed, but we've got a lock right now and we're
+                 * unlikely to wrap... */
+                tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
+
+                LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+                LASSERT (tx->tx_nsp == 0);
+                LASSERT (tx->tx_sending == 0);
+                LASSERT (tx->tx_status == 0);
+                LASSERT (tx->tx_conn == NULL);
+                LASSERT (!tx->tx_passive_rdma);
+                LASSERT (!tx->tx_passive_rdma_wait);
+                LASSERT (tx->tx_libmsg[0] == NULL);
+                LASSERT (tx->tx_libmsg[1] == NULL);
+        }
+
+        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+        
+        RETURN(tx);
+}
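+
+/* Usage sketch for kibnal_get_idle_tx(): callers that may sleep (e.g.
+ * an app thread in kibnal_start_passive_rdma()) pass may_block != 0
+ * and wait on kib_idle_tx_waitq; callers in scheduler or interrupt
+ * context pass 0 and may dip into the reserved kib_idle_nblk_txs pool
+ * instead:
+ *
+ *         tx = kibnal_get_idle_tx(0);     // mustn't block
+ *         if (tx == NULL)
+ *                 ...complete locally with failure...
+ */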
+
+static int
+kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* I would guess that if kibnal_get_peer (nid) == NULL,
+           and we're not routing, then 'nid' is very distant :) */
+        if (nal->libnal_ni.ni_pid.nid == nid) {
+                *dist = 0;
+        } else {
+                *dist = 1;
+        }
+
+        return 0;
+}
+
+static void
+kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
+{
+        struct list_head *ttmp;
+        unsigned long     flags;
+        int               idle;
+
+        spin_lock_irqsave (&conn->ibc_lock, flags);
+
+        list_for_each (ttmp, &conn->ibc_active_txs) {
+                kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);
+
+                LASSERT (tx->tx_passive_rdma ||
+                         !tx->tx_passive_rdma_wait);
+
+                LASSERT (tx->tx_passive_rdma_wait ||
+                         tx->tx_sending != 0);
+
+                if (!tx->tx_passive_rdma_wait ||
+                    tx->tx_passive_rdma_cookie != cookie)
+                        continue;
+
+                CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);
+
+                tx->tx_status = status;
+                tx->tx_passive_rdma_wait = 0;
+                idle = (tx->tx_sending == 0);
+
+                if (idle)
+                        list_del (&tx->tx_list);
+
+                spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+                /* I could be racing with tx callbacks.  It's whoever
+                 * _makes_ tx idle that frees it */
+                if (idle)
+                        kibnal_tx_done (tx);
+                return;
+        }
+                
+        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+        CERROR ("Unmatched (late?) RDMA completion "LPX64" from "LPX64"\n",
+                cookie, conn->ibc_peer->ibp_nid);
+}
+
+static __u32
+kibnal_lkey(kib_pages_t *ibp)
+{
+        if (kibnal_whole_mem())
+                return kibnal_data.kib_md.md_lkey;
+
+        return ibp->ibp_lkey;
+}
+
+static void
+kibnal_post_rx (kib_rx_t *rx, int do_credits)
+{
+        kib_conn_t   *conn = rx->rx_conn;
+        int           rc = 0;
+        unsigned long flags;
+        FSTATUS       frc;
+        ENTRY;
+
+        rx->rx_gl = (IB_LOCAL_DATASEGMENT) {
+                .Address = rx->rx_vaddr,
+                .Length  = IBNAL_MSG_SIZE,
+                .Lkey    = kibnal_lkey(conn->ibc_rx_pages),
+        };
+
+        rx->rx_wrq = (IB_WORK_REQ) {
+                .Operation              = WROpRecv,
+                .DSListDepth            = 1,
+                .MessageLen             = IBNAL_MSG_SIZE,
+                .WorkReqId              = kibnal_ptr2wreqid(rx, 1),
+                .DSList                 = &rx->rx_gl,
+        };
+
+        KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED,
+                                    IBNAL_CONN_DREP);
+        LASSERT (!rx->rx_posted);
+        rx->rx_posted = 1;
+        mb();
+
+        if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
+                rc = -ECONNABORTED;
+        else {
+                frc = iibt_postrecv(conn->ibc_qp, &rx->rx_wrq);
+                if (frc != FSUCCESS) {
+                        CDEBUG(D_NET, "post failed %d\n", frc);
+                        rc = -EINVAL;
+                } else {
+                        CDEBUG(D_NET, "posted rx %p\n", &rx->rx_wrq);
+                }
+        }
+
+        if (rc == 0) {
+                if (do_credits) {
+                        spin_lock_irqsave(&conn->ibc_lock, flags);
+                        conn->ibc_outstanding_credits++;
+                        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+                        kibnal_check_sends(conn);
+                }
+                EXIT;
+                return;
+        }
+
+        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+                CERROR ("Error posting receive -> "LPX64": %d\n",
+                        conn->ibc_peer->ibp_nid, rc);
+                kibnal_close_conn (rx->rx_conn, rc);
+        } else {
+                CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n",
+                        conn->ibc_peer->ibp_nid, rc);
+        }
+
+        /* Drop rx's ref */
+        kibnal_put_conn (conn);
+        EXIT;
+}
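+
+/* The do_credits flag above is the receiver half of flow control:
+ * re-posting an rx buffer frees a slot the peer may send into, so
+ * ibc_outstanding_credits is bumped and kibnal_check_sends() gets a
+ * chance to piggyback the returned credit on an outgoing message (or
+ * on a NOOP if nothing is queued). */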
+
+#if IBNAL_CKSUM
+static inline __u32 kibnal_cksum (void *ptr, int nob)
+{
+        char  *c  = ptr;
+        __u32  sum = 0;
+
+        while (nob-- > 0)
+                sum = ((sum << 1) | (sum >> 31)) + *c++;
+        
+        return (sum);
+}
+#endif
+
+static void hexdump(char *string, void *ptr, int len)
+{
+        unsigned char *c = ptr;
+        int i;
+
+        /* dumping is disabled by default; remove this return to enable it */
+        return;
+
+        if (len < 0 || len > 2048) {
+                printk("hexdump: implausible length %d\n", len);
+                return;
+        }
+        }
+
+        printk("%d bytes of '%s' from 0x%p\n", len, string, ptr);
+
+        for (i = 0; i < len;) {
+                printk("%02x",*(c++));
+                i++;
+                if (!(i & 15)) {
+                        printk("\n");
+                } else if (!(i&1)) {
+                        printk(" ");
+                }
+        }
+
+        if(len & 15) {
+                printk("\n");
+        }
+}
+
+static void
+kibnal_rx_callback (IB_WORK_COMPLETION *wc)
+{
+        kib_rx_t     *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->WorkReqId);
+        kib_msg_t    *msg = rx->rx_msg;
+        kib_conn_t   *conn = rx->rx_conn;
+        int           nob = wc->Length;
+        const int     base_nob = offsetof(kib_msg_t, ibm_u);
+        int           credits;
+        int           flipped;
+        unsigned long flags;
+        __u32         i;
+#if IBNAL_CKSUM
+        __u32         msg_cksum;
+        __u32         computed_cksum;
+#endif
+
+        /* we set the QP to erroring after we've finished disconnecting, 
+         * maybe we should do so sooner. */
+        KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, 
+                                    IBNAL_CONN_DISCONNECTED);
+
+        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+        LASSERT (rx->rx_posted);
+        rx->rx_posted = 0;
+        mb();
+
+        /* receives complete with error in any case after we've started
+         * disconnecting */
+        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
+                goto failed;
+
+        if (wc->Status != WRStatusSuccess) {
+                CERROR("Rx from "LPX64" failed: %d\n", 
+                       conn->ibc_peer->ibp_nid, wc->Status);
+                goto failed;
+        }
+
+        if (nob < base_nob) {
+                CERROR ("Short rx from "LPX64": %d < expected %d\n",
+                        conn->ibc_peer->ibp_nid, nob, base_nob);
+                goto failed;
+        }
+
+        hexdump("rx", rx->rx_msg, sizeof(kib_msg_t));
+
+        /* Receiver does any byte flipping if necessary... */
+
+        if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
+                flipped = 0;
+        } else {
+                if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
+                        CERROR ("Unrecognised magic: %08x from "LPX64"\n", 
+                                msg->ibm_magic, conn->ibc_peer->ibp_nid);
+                        goto failed;
+                }
+                flipped = 1;
+                __swab16s (&msg->ibm_version);
+                LASSERT (sizeof(msg->ibm_type) == 1);
+                LASSERT (sizeof(msg->ibm_credits) == 1);
+        }
+
+        if (msg->ibm_version != IBNAL_MSG_VERSION) {
+                CERROR ("Incompatible msg version %d (%d expected)\n",
+                        msg->ibm_version, IBNAL_MSG_VERSION);
+                goto failed;
+        }
+
+#if IBNAL_CKSUM
+        if (nob != msg->ibm_nob) {
+                CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob);
+                goto failed;
+        }
+
+        msg_cksum = le32_to_cpu(msg->ibm_cksum);
+        msg->ibm_cksum = 0;
+        computed_cksum = kibnal_cksum (msg, nob);
+        
+        if (msg_cksum != computed_cksum) {
+                CERROR ("Checksum failure %d (%d expected)\n",
+                        computed_cksum, msg_cksum);
+                /* goto failed; -- just log it for now */
+        }
+        CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob);
+#endif
+
+        /* Have I received credits that will let me send? */
+        credits = msg->ibm_credits;
+        if (credits != 0) {
+                spin_lock_irqsave(&conn->ibc_lock, flags);
+                conn->ibc_credits += credits;
+                spin_unlock_irqrestore(&conn->ibc_lock, flags);
+                
+                kibnal_check_sends(conn);
+        }
+
+        switch (msg->ibm_type) {
+        case IBNAL_MSG_NOOP:
+                kibnal_post_rx (rx, 1);
+                return;
+
+        case IBNAL_MSG_IMMEDIATE:
+                if (nob < base_nob + sizeof (kib_immediate_msg_t)) {
+                        CERROR ("Short IMMEDIATE from "LPX64": %d\n",
+                                conn->ibc_peer->ibp_nid, nob);
+                        goto failed;
+                }
+                break;
+                
+        case IBNAL_MSG_PUT_RDMA:
+        case IBNAL_MSG_GET_RDMA:
+                if (nob < base_nob + sizeof (kib_rdma_msg_t)) {
+                        CERROR ("Short RDMA msg from "LPX64": %d\n",
+                                conn->ibc_peer->ibp_nid, nob);
+                        goto failed;
+                }
+                if (flipped)
+                        __swab32s(&msg->ibm_u.rdma.ibrm_num_descs);
+
+                CDEBUG(D_NET, "%d RDMA: cookie "LPX64":\n",
+                       msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie);
+
+                if ((msg->ibm_u.rdma.ibrm_num_descs > PTL_MD_MAX_IOV) ||
+                    (kib_rdma_msg_len(msg->ibm_u.rdma.ibrm_num_descs) > 
+                     min(nob, IBNAL_MSG_SIZE))) {
+                        CERROR ("num_descs %d too large\n", 
+                                msg->ibm_u.rdma.ibrm_num_descs);
+                        goto failed;
+                }
+
+                for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) {
+                        kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i];
+
+                        if (flipped) {
+                                __swab32s(&desc->rd_key);
+                                __swab32s(&desc->rd_nob);
+                                __swab64s(&desc->rd_addr);
+                        }
+
+                        CDEBUG(D_NET, "  key %x, " "addr "LPX64", nob %u\n",
+                               desc->rd_key, desc->rd_addr, desc->rd_nob);
+                }
+                break;
+                        
+        case IBNAL_MSG_PUT_DONE:
+        case IBNAL_MSG_GET_DONE:
+                if (nob < base_nob + sizeof (kib_completion_msg_t)) {
+                        CERROR ("Short COMPLETION msg from "LPX64": %d\n",
+                                conn->ibc_peer->ibp_nid, nob);
+                        goto failed;
+                }
+                if (flipped)
+                        __swab32s(&msg->ibm_u.completion.ibcm_status);
+                
+                CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
+                       msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
+                       msg->ibm_u.completion.ibcm_status);
+
+                kibnal_complete_passive_rdma (conn, 
+                                              msg->ibm_u.completion.ibcm_cookie,
+                                              msg->ibm_u.completion.ibcm_status);
+                kibnal_post_rx (rx, 1);
+                return;
+                        
+        default:
+                CERROR ("Can't parse type from "LPX64": %d\n",
+                        conn->ibc_peer->ibp_nid, msg->ibm_type);
+                goto failed;
+        }
+
+        /* schedule for kibnal_rx() in thread context */
+        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+        
+        list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
+        wake_up (&kibnal_data.kib_sched_waitq);
+        
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+        return;
+        
+ failed:
+        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+        kibnal_close_conn(conn, -ECONNABORTED);
+
+        /* Don't re-post rx & drop its ref on conn */
+        kibnal_put_conn(conn);
+}
+
+void
+kibnal_rx (kib_rx_t *rx)
+{
+        kib_msg_t   *msg = rx->rx_msg;
+
+        /* Clear flag so I can detect if I've sent an RDMA completion */
+        rx->rx_rdma = 0;
+
+        switch (msg->ibm_type) {
+        case IBNAL_MSG_GET_RDMA:
+                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
+                /* If the incoming get was matched, I'll have initiated the
+                 * RDMA and the completion message... */
+                if (rx->rx_rdma)
+                        break;
+
+                /* Otherwise, I'll send a failed completion now to prevent
+                 * the peer's GET blocking for the full timeout. */
+                CERROR ("Completing unmatched RDMA GET from "LPX64"\n",
+                        rx->rx_conn->ibc_peer->ibp_nid);
+                kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO,
+                                          rx, NULL, 0, NULL, NULL, 0, 0);
+                break;
+                
+        case IBNAL_MSG_PUT_RDMA:
+                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
+                if (rx->rx_rdma)
+                        break;
+                /* This is most unusual, since even if lib_parse() didn't
+                 * match anything, it should have asked us to read (and
+                 * discard) the payload.  The portals header must be
+                 * inconsistent with this message type, so it's the
+                 * sender's fault for sending garbage and she can time
+                 * herself out... */
+                CERROR ("Uncompleted RMDA PUT from "LPX64"\n",
+                        rx->rx_conn->ibc_peer->ibp_nid);
+                break;
+
+        case IBNAL_MSG_IMMEDIATE:
+                lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
+                LASSERT (!rx->rx_rdma);
+                break;
+                
+        default:
+                LBUG();
+                break;
+        }
+
+        kibnal_post_rx (rx, 1);
+}
+
+static struct page *
+kibnal_kvaddr_to_page (unsigned long vaddr)
+{
+        struct page *page;
+
+        if (vaddr >= VMALLOC_START &&
+            vaddr < VMALLOC_END)
+                page = vmalloc_to_page ((void *)vaddr);
+#ifdef CONFIG_HIGHMEM
+        else if (vaddr >= PKMAP_BASE &&
+                 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
+                page = vmalloc_to_page ((void *)vaddr);
+        /* in 2.4 ^ just walks the page tables */
+#endif
+        else
+                page = virt_to_page (vaddr);
+
+        if (!VALID_PAGE (page))
+                page = NULL;
+
+        return page;
+}
+
+static void
+kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset,
+                 unsigned long len, int active)
+{
+        kib_rdma_msg_t *ibrm = &tx->tx_msg->ibm_u.rdma;
+        kib_rdma_desc_t *desc;
+
+        LASSERTF(ibrm->ibrm_num_descs < PTL_MD_MAX_IOV, "%u\n", 
+                 ibrm->ibrm_num_descs);
+
+        desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs];
+        if (active)
+                desc->rd_key = kibnal_data.kib_md.md_lkey;
+        else
+                desc->rd_key = kibnal_data.kib_md.md_rkey;
+        desc->rd_nob = len; /*PAGE_SIZE - kiov->kiov_offset; */
+        desc->rd_addr = kibnal_page2phys(page) + page_offset +
+                        kibnal_data.kib_md.md_addr;
+
+        ibrm->ibrm_num_descs++;
+}
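+
+/* When kibnal_whole_mem() is true, all of memory is presumed covered
+ * by the single registration in kibnal_data.kib_md (set up elsewhere),
+ * so "mapping" just means filling rdma descs with the page's physical
+ * address offset by md_addr plus the global lkey/rkey, as above; no
+ * per-tx iibt_register_memory() call is needed. */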
+
+static int
+kibnal_map_rdma_iov(kib_tx_t *tx, unsigned long vaddr, int nob, int active)
+{
+        struct page *page;
+        int page_offset, len;
+
+        while (nob > 0) {
+                page = kibnal_kvaddr_to_page(vaddr);
+                if (page == NULL)
+                        return -EFAULT;
+
+                page_offset = vaddr & (PAGE_SIZE - 1);
+                len = min(nob, (int)PAGE_SIZE - page_offset);
+                
+                kibnal_fill_ibrm(tx, page, page_offset, len, active);
+                nob -= len;
+                vaddr += len;
+        }
+        return 0;
+}
+
+static int
+kibnal_map_iov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
+                 int niov, struct iovec *iov, int offset, int nob, int active)
+{
+        void   *vaddr;
+        FSTATUS frc;
+
+        LASSERT (nob > 0);
+        LASSERT (niov > 0);
+        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+
+        while (offset >= iov->iov_len) {
+                offset -= iov->iov_len;
+                niov--;
+                iov++;
+                LASSERT (niov > 0);
+        }
+
+        if (nob > iov->iov_len - offset) {
+                CERROR ("Can't map multiple vaddr fragments\n");
+                return (-EMSGSIZE);
+        }
+
+        /* our large contiguous iov could be backed by multiple physical
+         * pages. */
+        if (kibnal_whole_mem()) {
+                int rc;
+                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
+                rc = kibnal_map_rdma_iov(tx, (unsigned long)iov->iov_base + 
+                                         offset, nob, active);
+                if (rc != 0) {
+                        CERROR ("Can't map iov: %d\n", rc);
+                        return rc;
+                }
+                return 0;
+        }
+
+        vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
+        tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
+
+        frc = iibt_register_memory(kibnal_data.kib_hca, vaddr, nob,
+                                   kibnal_data.kib_pd, access,
+                                   &tx->tx_md.md_handle, &tx->tx_md.md_lkey,
+                                   &tx->tx_md.md_rkey);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't map vaddr %p: %d\n", vaddr, frc);
+                return -EINVAL;
+        }
+
+        tx->tx_mapped = KIB_TX_MAPPED;
+        return (0);
+}
+
+static int
+kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
+                  int nkiov, ptl_kiov_t *kiov,
+                  int offset, int nob, int active)
+{
+        __u64                      *phys = NULL;
+        int                         page_offset;
+        int                         nphys;
+        int                         resid;
+        int                         phys_size = 0;
+        FSTATUS                     frc;
+        int                         i, rc = 0;
+
+        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+
+        LASSERT (nob > 0);
+        LASSERT (nkiov > 0);
+        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+
+        while (offset >= kiov->kiov_len) {
+                offset -= kiov->kiov_len;
+                nkiov--;
+                kiov++;
+                LASSERT (nkiov > 0);
+        }
+
+        page_offset = kiov->kiov_offset + offset;
+        nphys = 1;
+
+        if (!kibnal_whole_mem()) {
+                phys_size = nkiov * sizeof (*phys);
+                PORTAL_ALLOC(phys, phys_size);
+                if (phys == NULL) {
+                        CERROR ("Can't allocate tmp phys\n");
+                        return (-ENOMEM);
+                }
+
+                phys[0] = kibnal_page2phys(kiov->kiov_page);
+        } else {
+                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
+                kibnal_fill_ibrm(tx, kiov->kiov_page, kiov->kiov_offset, 
+                                 kiov->kiov_len, active);
+        }
+
+        resid = nob - (kiov->kiov_len - offset);
+
+        while (resid > 0) {
+                kiov++;
+                nkiov--;
+                LASSERT (nkiov > 0);
+
+                if (kiov->kiov_offset != 0 ||
+                    ((resid > PAGE_SIZE) && 
+                     kiov->kiov_len < PAGE_SIZE)) {
+                        /* Can't have gaps */
+                        CERROR ("Can't make payload contiguous in I/O VM: "
+                                "page %d, offset %d, len %d\n", nphys,
+                                kiov->kiov_offset, kiov->kiov_len);
+
+                        for (i = -nphys; i < nkiov; i++)
+                                CERROR("kiov[%d] %p +%d for %d\n",
+                                       i, kiov[i].kiov_page,
+                                       kiov[i].kiov_offset,
+                                       kiov[i].kiov_len);
+                        
+                        rc = -EINVAL;
+                        goto out;
+                }
+
+                if (nphys == PTL_MD_MAX_IOV) {
+                        CERROR ("payload too big (%d)\n", nphys);
+                        rc = -EMSGSIZE;
+                        goto out;
+                }
+
+                if (!kibnal_whole_mem()) {
+                        LASSERT (nphys * sizeof (*phys) < phys_size);
+                        phys[nphys] = kibnal_page2phys(kiov->kiov_page);
+                } else {
+                        if (kib_rdma_msg_len(nphys) > IBNAL_MSG_SIZE) {
+                                CERROR ("payload too big (%d)\n", nphys);
+                                rc = -EMSGSIZE;
+                                goto out;
+                        }
+                        kibnal_fill_ibrm(tx, kiov->kiov_page, 
+                                         kiov->kiov_offset, kiov->kiov_len,
+                                         active);
+                }
+
+                nphys++;
+                resid -= PAGE_SIZE;
+        }
+
+        if (kibnal_whole_mem())
+                goto out;
+
+#if 0
+        CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset);
+        for (i = 0; i < nphys; i++)
+                CWARN ("   [%d] "LPX64"\n", i, phys[i]);
+#endif
+
+#if IBNAL_FMR
+#error "iibnal hasn't learned about FMR yet"
+        rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
+                                       phys, nphys,
+                                       &tx->tx_md.md_addr,
+                                       page_offset,
+                                       &tx->tx_md.md_handle.fmr,
+                                       &tx->tx_md.md_lkey,
+                                       &tx->tx_md.md_rkey);
+#else
+        frc = iibt_register_physical_memory(kibnal_data.kib_hca,
+                                            IBNAL_RDMA_BASE,
+                                            phys, nphys,
+                                            0,          /* offset */
+                                            kibnal_data.kib_pd,
+                                            access,
+                                            &tx->tx_md.md_handle,
+                                            &tx->tx_md.md_addr,
+                                            &tx->tx_md.md_lkey,
+                                            &tx->tx_md.md_rkey);
+#endif
+        if (frc == FSUCCESS) {
+                CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n",
+                       nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey);
+#if IBNAL_FMR
+                tx->tx_mapped = KIB_TX_MAPPED_FMR;
+#else
+                tx->tx_mapped = KIB_TX_MAPPED;
+#endif
+        } else {
+                CERROR ("Can't map phys: %d\n", rc);
+                rc = -EFAULT;
+        }
+
+ out:
+        if (phys != NULL)
+                PORTAL_FREE(phys, phys_size);
+        return (rc);
+}
+
+static kib_conn_t *
+kibnal_find_conn_locked (kib_peer_t *peer)
+{
+        struct list_head *tmp;
+
+        /* just return the first connection */
+        list_for_each (tmp, &peer->ibp_conns) {
+                return (list_entry(tmp, kib_conn_t, ibc_list));
+        }
+
+        return (NULL);
+}
+
+void
+kibnal_check_sends (kib_conn_t *conn)
+{
+        unsigned long   flags;
+        kib_tx_t       *tx;
+        int             rc;
+        int             i;
+        int             done;
+        int             nwork;
+        ENTRY;
+
+        spin_lock_irqsave (&conn->ibc_lock, flags);
+
+        LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
+
+        if (list_empty(&conn->ibc_tx_queue) &&
+            conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
+                spin_unlock_irqrestore(&conn->ibc_lock, flags);
+                
+                tx = kibnal_get_idle_tx(0);     /* don't block */
+                if (tx != NULL)
+                        kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
+
+                spin_lock_irqsave(&conn->ibc_lock, flags);
+                
+                if (tx != NULL) {
+                        atomic_inc(&conn->ibc_refcount);
+                        kibnal_queue_tx_locked(tx, conn);
+                }
+        }
+
+        while (!list_empty (&conn->ibc_tx_queue)) {
+                tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
+
+                /* We rely on this for QP sizing */
+                LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= IBNAL_TX_MAX_SG);
+
+                LASSERT (conn->ibc_outstanding_credits >= 0);
+                LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
+                LASSERT (conn->ibc_credits >= 0);
+                LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
+
+                /* Not on ibc_rdma_queue */
+                LASSERT (!tx->tx_passive_rdma_wait);
+
+                if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
+                        GOTO(out, 0);
+
+                if (conn->ibc_credits == 0)     /* no credits */
+                        GOTO(out, 1);
+                
+                if (conn->ibc_credits == 1 &&   /* last credit reserved for */
+                    conn->ibc_outstanding_credits == 0) /* giving back credits */
+                        GOTO(out, 2);
+
+                list_del (&tx->tx_list);
+
+                if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
+                    (!list_empty(&conn->ibc_tx_queue) ||
+                     conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
+                        /* redundant NOOP */
+                        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+                        kibnal_tx_done(tx);
+                        spin_lock_irqsave(&conn->ibc_lock, flags);
+                        continue;
+                }
+
+                tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits;
+                conn->ibc_outstanding_credits = 0;
+
+                conn->ibc_nsends_posted++;
+                conn->ibc_credits--;
+
+                /* we only get a tx completion for the final rdma op */ 
+                tx->tx_sending = min(tx->tx_nsp, 2);
+                tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
+                list_add (&tx->tx_list, &conn->ibc_active_txs);
+#if IBNAL_CKSUM
+                tx->tx_msg->ibm_cksum = 0;
+                tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob);
+                CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob);
+#endif
+                spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+                /* NB the gap between removing tx from the queue and sending it
+                 * allows message re-ordering to occur */
+
+                LASSERT (tx->tx_nsp > 0);
+
+                rc = -ECONNABORTED;
+                nwork = 0;
+                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+                        tx->tx_status = 0;
+                        /* Driver only accepts 1 item at a time */
+                        for (i = 0; i < tx->tx_nsp; i++) {
+                                hexdump("tx", tx->tx_msg, sizeof(kib_msg_t));
+                                rc = iibt_postsend(conn->ibc_qp, 
+                                                   &tx->tx_wrq[i]);
+                                if (rc != 0)
+                                        break;
+                                if (wrq_signals_completion(&tx->tx_wrq[i]))
+                                        nwork++;
+                                CDEBUG(D_NET, "posted tx wrq %p\n", 
+                                       &tx->tx_wrq[i]);
+                        }
+                }
+
+                spin_lock_irqsave (&conn->ibc_lock, flags);
+                if (rc != 0) {
+                        /* NB credits are transferred in the actual
+                         * message, which can only be the last work item */
+                        conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
+                        conn->ibc_credits++;
+                        conn->ibc_nsends_posted--;
+
+                        tx->tx_status = rc;
+                        tx->tx_passive_rdma_wait = 0;
+                        tx->tx_sending -= tx->tx_nsp - nwork;
+
+                        done = (tx->tx_sending == 0);
+                        if (done)
+                                list_del (&tx->tx_list);
+                        
+                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        
+                        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
+                                CERROR ("Error %d posting transmit to "LPX64"\n", 
+                                        rc, conn->ibc_peer->ibp_nid);
+                        else
+                                CDEBUG (D_NET, "Error %d posting transmit to "
+                                        LPX64"\n", rc, conn->ibc_peer->ibp_nid);
+
+                        kibnal_close_conn (conn, rc);
+
+                        if (done)
+                                kibnal_tx_done (tx);
+                        return;
+                }
+                
+        }
+
+        EXIT;
+out:
+        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+}
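+
+/* Credit accounting above, in brief: each conn starts with
+ * IBNAL_MSG_QUEUE_SIZE credits, nominally one per rx buffer the peer
+ * has posted.  Sending any message costs a credit, and credits owed
+ * back travel in ibm_credits.  The last credit is only ever spent on
+ * a message that also returns credits; otherwise both sides could
+ * reach zero and deadlock -- that is what the "last credit reserved"
+ * test guards against. */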
+
+static void
+kibnal_tx_callback (IB_WORK_COMPLETION *wc)
+{
+        kib_tx_t     *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->WorkReqId);
+        kib_conn_t   *conn;
+        unsigned long flags;
+        int           idle;
+
+        conn = tx->tx_conn;
+        LASSERT (conn != NULL);
+        LASSERT (tx->tx_sending != 0);
+
+        spin_lock_irqsave(&conn->ibc_lock, flags);
+
+        CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx,
+               tx->tx_sending, tx->tx_nsp, wc->Status);
+
+        /* I could be racing with rdma completion.  Whoever makes 'tx' idle
+         * gets to free it, which also drops its ref on 'conn'.  If it's
+         * not me, then I take an extra ref on conn so it can't disappear
+         * under me. */
+
+        tx->tx_sending--;
+        idle = (tx->tx_sending == 0) &&         /* This is the final callback */
+               (!tx->tx_passive_rdma_wait);     /* Not waiting for RDMA completion */
+        if (idle)
+                list_del(&tx->tx_list);
+
+        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+               atomic_read (&conn->ibc_refcount));
+        atomic_inc (&conn->ibc_refcount);
+
+        if (tx->tx_sending == 0)
+                conn->ibc_nsends_posted--;
+
+        if (wc->Status != WRStatusSuccess &&
+            tx->tx_status == 0)
+                tx->tx_status = -ECONNABORTED;
+                
+        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+        if (idle)
+                kibnal_tx_done (tx);
+
+        if (wc->Status != WRStatusSuccess) {
+                CERROR ("Tx completion to "LPX64" failed: %d\n", 
+                        conn->ibc_peer->ibp_nid, wc->Status);
+                kibnal_close_conn (conn, -ENETDOWN);
+        } else {
+                /* can I shovel some more sends out the door? */
+                kibnal_check_sends(conn);
+        }
+
+        kibnal_put_conn (conn);
+}
+
+void 
+kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev)
+{
+        /* XXX flesh out.  this seems largely for async errors */
+        CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode);
+}
+
+void
+kibnal_ca_callback (void *ca_arg, void *cq_arg)
+{
+        IB_HANDLE cq = *(IB_HANDLE *)cq_arg;
+        IB_HANDLE ca = *(IB_HANDLE *)ca_arg;
+        IB_WORK_COMPLETION wc;
+        int armed = 0;
+
+        CDEBUG(D_NET, "ca %p cq %p\n", ca, cq);
+
+        for(;;) {
+                while (iibt_cq_poll(cq, &wc) == FSUCCESS) {
+                        if (kibnal_wreqid_is_rx(wc.WorkReqId))
+                                kibnal_rx_callback(&wc);
+                        else
+                                kibnal_tx_callback(&wc);
+                }
+                if (armed)
+                        return;
+                if (iibt_cq_rearm(cq, CQEventSelNextWC) != FSUCCESS) {
+                        CERROR("rearm failed?\n");
+                        return;
+                }
+                armed = 1;
+        }
+}
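+
+/* A sketch of the work-request-id tagging the dispatch above relies
+ * on (the real helpers live in iibnal.h): the rx/tx pointer is packed
+ * into the 64-bit WorkReqId with the low bit marking receives, along
+ * the lines of
+ *
+ *         static inline __u64
+ *         kibnal_ptr2wreqid (void *ptr, int isrx)
+ *         {
+ *                 unsigned long lptr = (unsigned long)ptr;
+ *
+ *                 LASSERT ((lptr & 1) == 0);
+ *                 return (__u64)(lptr | (isrx ? 1 : 0));
+ *         }
+ *
+ * with kibnal_wreqid2ptr() masking the bit off again and
+ * kibnal_wreqid_is_rx() testing it. */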
+
+void
+kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
+{
+        IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nsp];
+        IB_WORK_REQ         *wrq = &tx->tx_wrq[tx->tx_nsp];
+        int                       fence;
+        int                       nob = offsetof (kib_msg_t, ibm_u) + body_nob;
+
+        LASSERT (tx->tx_nsp >= 0 && 
+                 tx->tx_nsp < sizeof(tx->tx_wrq)/sizeof(tx->tx_wrq[0]));
+        LASSERT (nob <= IBNAL_MSG_SIZE);
+        
+        tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC;
+        tx->tx_msg->ibm_version = IBNAL_MSG_VERSION;
+        tx->tx_msg->ibm_type = type;
+#if IBNAL_CKSUM
+        tx->tx_msg->ibm_nob = nob;
+#endif
+        /* Fence the message if it's bundled with an RDMA read */
+        fence = (tx->tx_nsp > 0) &&
+                (type == IBNAL_MSG_PUT_DONE);
+
+        *gl = (IB_LOCAL_DATASEGMENT) {
+                .Address = tx->tx_vaddr,
+                .Length  = IBNAL_MSG_SIZE,
+                .Lkey    = kibnal_lkey(kibnal_data.kib_tx_pages),
+        };
+
+        wrq->WorkReqId      = kibnal_ptr2wreqid(tx, 0);
+        wrq->Operation      = WROpSend;
+        wrq->DSList         = gl;
+        wrq->DSListDepth    = 1;
+        wrq->MessageLen     = nob;
+        wrq->Req.SendRC.ImmediateData  = 0;
+        wrq->Req.SendRC.Options.s.SolicitedEvent         = 1;
+        wrq->Req.SendRC.Options.s.SignaledCompletion     = 1;
+        wrq->Req.SendRC.Options.s.ImmediateData          = 0;
+        wrq->Req.SendRC.Options.s.Fence                  = fence;
+
+        tx->tx_nsp++;
+}
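+
+/* The Fence bit set above matters when a PUT_DONE completion shares a
+ * tx with RDMA read work requests queued ahead of it: fencing makes
+ * the send wait for those reads to finish, so the peer never sees
+ * "done" before its data has actually been pulled. */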
+
+static void
+kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+{
+        unsigned long         flags;
+
+        spin_lock_irqsave(&conn->ibc_lock, flags);
+
+        kibnal_queue_tx_locked (tx, conn);
+        
+        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+        
+        kibnal_check_sends(conn);
+}
+
+static void
+kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
+{
+        unsigned long    flags;
+        kib_peer_t      *peer;
+        kib_conn_t      *conn;
+        rwlock_t        *g_lock = &kibnal_data.kib_global_lock;
+
+        /* If I get here, I've committed to send, so I complete the tx with
+         * failure on any problems */
+        
+        LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
+        LASSERT (tx->tx_nsp > 0);               /* work items have been set up */
+
+        read_lock (g_lock);
+        
+        peer = kibnal_find_peer_locked (nid);
+        if (peer == NULL) {
+                read_unlock (g_lock);
+                tx->tx_status = -EHOSTUNREACH;
+                kibnal_tx_done (tx);
+                return;
+        }
+
+        conn = kibnal_find_conn_locked (peer);
+        if (conn != NULL) {
+                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                       atomic_read (&conn->ibc_refcount));
+                atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
+                read_unlock (g_lock);
+                
+                kibnal_queue_tx (tx, conn);
+                return;
+        }
+        
+        /* Making one or more connections; I'll need a write lock... */
+        read_unlock (g_lock);
+        write_lock_irqsave (g_lock, flags);
+
+        peer = kibnal_find_peer_locked (nid);
+        if (peer == NULL) {
+                write_unlock_irqrestore (g_lock, flags);
+                tx->tx_status = -EHOSTUNREACH;
+                kibnal_tx_done (tx);
+                return;
+        }
+
+        conn = kibnal_find_conn_locked (peer);
+        if (conn != NULL) {
+                /* Connection exists; queue message on it */
+                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                       atomic_read (&conn->ibc_refcount));
+                atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
+                write_unlock_irqrestore (g_lock, flags);
+                
+                kibnal_queue_tx (tx, conn);
+                return;
+        }
+
+        if (peer->ibp_connecting == 0) {
+                if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
+                        write_unlock_irqrestore (g_lock, flags);
+                        tx->tx_status = -EHOSTUNREACH;
+                        kibnal_tx_done (tx);
+                        return;
+                }
+        
+                peer->ibp_connecting = 1;
+                kib_peer_addref(peer); /* extra ref for connd */
+        
+                spin_lock (&kibnal_data.kib_connd_lock);
+        
+                list_add_tail (&peer->ibp_connd_list,
+                               &kibnal_data.kib_connd_peers);
+                wake_up (&kibnal_data.kib_connd_waitq);
+        
+                spin_unlock (&kibnal_data.kib_connd_lock);
+        }
+        
+        /* A connection is being established; queue the message... */
+        list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
+
+        write_unlock_irqrestore (g_lock, flags);
+}
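+
+/* kibnal_launch_tx() is the usual optimistic locking dance: look the
+ * conn up under the read lock first (the common case), and only when
+ * that fails retake kib_global_lock as a writer and re-check, since
+ * another thread may have created the conn, or the peer may have
+ * gone, while no lock was held. */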
+
+static ptl_err_t
+kibnal_start_passive_rdma (int type, ptl_nid_t nid,
+                            lib_msg_t *libmsg, ptl_hdr_t *hdr)
+{
+        int         nob = libmsg->md->length;
+        kib_tx_t   *tx;
+        kib_msg_t  *ibmsg;
+        int         rc;
+        IB_ACCESS_CONTROL         access = {0,};
+        
+        LASSERT (type == IBNAL_MSG_PUT_RDMA || type == IBNAL_MSG_GET_RDMA);
+        LASSERT (nob > 0);
+        LASSERT (!in_interrupt());              /* Mapping could block */
+
+        access.s.MWBindable = 1;
+        access.s.LocalWrite = 1;
+        access.s.RdmaRead = 1;
+        access.s.RdmaWrite = 1;
+
+        tx = kibnal_get_idle_tx (1);           /* May block; caller is an app thread */
+        LASSERT (tx != NULL);
+
+        if ((libmsg->md->options & PTL_MD_KIOV) == 0) 
+                rc = kibnal_map_iov (tx, access,
+                                     libmsg->md->md_niov,
+                                     libmsg->md->md_iov.iov,
+                                     0, nob, 0);
+        else
+                rc = kibnal_map_kiov (tx, access,
+                                      libmsg->md->md_niov, 
+                                      libmsg->md->md_iov.kiov,
+                                      0, nob, 0);
+
+        if (rc != 0) {
+                CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc);
+                goto failed;
+        }
+        
+        if (type == IBNAL_MSG_GET_RDMA) {
+                /* reply gets finalized when tx completes */
+                tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, 
+                                                        nid, libmsg);
+                if (tx->tx_libmsg[1] == NULL) {
+                        CERROR ("Can't create reply for GET -> "LPX64"\n",
+                                nid);
+                        rc = -ENOMEM;
+                        goto failed;
+                }
+        }
+        
+        tx->tx_passive_rdma = 1;
+
+        ibmsg = tx->tx_msg;
+
+        ibmsg->ibm_u.rdma.ibrm_hdr = *hdr;
+        ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
+        /* map_kiov already filled the rdma descs for the whole_mem case */
+        if (!kibnal_whole_mem()) {
+                ibmsg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_rkey;
+                ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
+                ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
+                ibmsg->ibm_u.rdma.ibrm_num_descs = 1;
+        }
+
+        kibnal_init_tx_msg (tx, type, 
+                            kib_rdma_msg_len(ibmsg->ibm_u.rdma.ibrm_num_descs));
+
+        CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
+               LPX64", nob %d\n",
+               tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey,
+               tx->tx_md.md_addr, nob);
+        
+        /* libmsg gets finalized when tx completes. */
+        tx->tx_libmsg[0] = libmsg;
+
+        kibnal_launch_tx(tx, nid);
+        return (PTL_OK);
+
+ failed:
+        tx->tx_status = rc;
+        kibnal_tx_done (tx);
+        return (PTL_FAIL);
+}
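+
+/* Passive vs active RDMA, as used above and below: the passive side
+ * maps its buffer and advertises it in a PUT/GET_RDMA message (rkey,
+ * address, length and a cookie); the active side then drives the
+ * actual RDMA read/write via kibnal_start_active_rdma() and finishes
+ * with a PUT/GET_DONE completion carrying the same cookie, which
+ * kibnal_complete_passive_rdma() matches against ibc_active_txs. */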
+
+void
+kibnal_start_active_rdma (int type, int status,
+                           kib_rx_t *rx, lib_msg_t *libmsg, 
+                           unsigned int niov,
+                           struct iovec *iov, ptl_kiov_t *kiov,
+                           size_t offset, size_t nob)
+{
+        kib_msg_t    *rxmsg = rx->rx_msg;
+        kib_msg_t    *txmsg;
+        kib_tx_t     *tx;
+        IB_ACCESS_CONTROL access = {0,};
+        IB_WR_OP      rdma_op;
+        int           rc;
+        __u32         i;
+
+        CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n",
+               type, status, niov, offset, nob);
+
+        /* Called by scheduler */
+        LASSERT (!in_interrupt ());
+
+        /* Either all pages or all vaddrs */
+        LASSERT (!(kiov != NULL && iov != NULL));
+
+        /* No data if we're completing with failure */
+        LASSERT (status == 0 || nob == 0);
+
+        LASSERT (type == IBNAL_MSG_GET_DONE ||
+                 type == IBNAL_MSG_PUT_DONE);
+
+        /* Flag I'm completing the RDMA.  Even if I fail to send the
+         * completion message, I will have tried my best so further
+         * attempts shouldn't be tried. */
+        LASSERT (!rx->rx_rdma);
+        rx->rx_rdma = 1;
+
+        if (type == IBNAL_MSG_GET_DONE) {
+                rdma_op  = WROpRdmaWrite;
+                LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);
+        } else {
+                access.s.LocalWrite = 1;
+                rdma_op  = WROpRdmaRead;
+                LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
+        }
+
+        tx = kibnal_get_idle_tx (0);           /* Mustn't block */
+        if (tx == NULL) {
+                CERROR ("tx descs exhausted on RDMA from "LPX64
+                        " completing locally with failure\n",
+                        rx->rx_conn->ibc_peer->ibp_nid);
+                lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE);
+                return;
+        }
+        LASSERT (tx->tx_nsp == 0);
+                        
+        if (nob == 0) 
+                GOTO(init_tx, 0);
+
+        /* We actually need to transfer some data (the transfer
+         * size could get truncated to zero when the incoming
+         * message is matched) */
+        if (kiov != NULL)
+                rc = kibnal_map_kiov (tx, access, niov, kiov, offset, nob, 1);
+        else
+                rc = kibnal_map_iov (tx, access, niov, iov, offset, nob, 1);
+        
+        if (rc != 0) {
+                CERROR ("Can't map RDMA -> "LPX64": %d\n", 
+                        rx->rx_conn->ibc_peer->ibp_nid, rc);
+                /* We'll skip the RDMA and complete with failure. */
+                status = rc;
+                nob = 0;
+                GOTO(init_tx, rc);
+        } 
+
+        if (!kibnal_whole_mem()) {
+                tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_lkey;
+                tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
+                tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
+                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1;
+        }
+
+        /* XXX ugh.  different page-sized hosts. */ 
+        if (tx->tx_msg->ibm_u.rdma.ibrm_num_descs !=
+            rxmsg->ibm_u.rdma.ibrm_num_descs) {
+                CERROR("tx descs (%u) != rx descs (%u)\n", 
+                       tx->tx_msg->ibm_u.rdma.ibrm_num_descs,
+                       rxmsg->ibm_u.rdma.ibrm_num_descs);
+                /* We'll skip the RDMA and complete with failure (rc is
+                 * still 0 here; the mapping itself succeeded). */
+                status = -EINVAL;
+                nob = 0;
+                GOTO(init_tx, rc);
+        }
+
+        /* map_kiov filled in the rdma descs which describe our side of the
+         * rdma transfer. */
+        /* ibrm_num_descs was verified in rx_callback */
+        for(i = 0; i < rxmsg->ibm_u.rdma.ibrm_num_descs; i++) {
+                kib_rdma_desc_t *ldesc, *rdesc; /* local, remote */
+                IB_LOCAL_DATASEGMENT *ds = &tx->tx_gl[i];
+                IB_WORK_REQ  *wrq = &tx->tx_wrq[i];
+
+                ldesc = &tx->tx_msg->ibm_u.rdma.ibrm_desc[i];
+                rdesc = &rxmsg->ibm_u.rdma.ibrm_desc[i];
+
+                ds->Address = ldesc->rd_addr;
+                ds->Length  = ldesc->rd_nob;
+                ds->Lkey    = ldesc->rd_key;
+
+                memset(wrq, 0, sizeof(*wrq));
+                wrq->WorkReqId      = kibnal_ptr2wreqid(tx, 0);
+                wrq->Operation      = rdma_op;
+                wrq->DSList         = ds;
+                wrq->DSListDepth    = 1;
+                wrq->MessageLen     = ds->Length;
+                wrq->Req.SendRC.ImmediateData  = 0;
+                wrq->Req.SendRC.Options.s.SolicitedEvent         = 0;
+                wrq->Req.SendRC.Options.s.SignaledCompletion     = 0;
+                wrq->Req.SendRC.Options.s.ImmediateData          = 0;
+                wrq->Req.SendRC.Options.s.Fence                  = 0;
+                wrq->Req.SendRC.RemoteDS.Address = rdesc->rd_addr;
+                wrq->Req.SendRC.RemoteDS.Rkey = rdesc->rd_key;
+
+                /* only the last rdma post triggers tx completion */
+                if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1)
+                        wrq->Req.SendRC.Options.s.SignaledCompletion = 1;
+
+                tx->tx_nsp++;
+        }
+
+init_tx:
+        txmsg = tx->tx_msg;
+
+        txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
+        txmsg->ibm_u.completion.ibcm_status = status;
+        
+        kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
+
+        if (status == 0 && nob != 0) {
+                LASSERT (tx->tx_nsp > 1);
+                /* RDMA: libmsg gets finalized when the tx completes.  This
+                 * is after the completion message has been sent, which in
+                 * turn is after the RDMA has finished. */
+                tx->tx_libmsg[0] = libmsg;
+        } else {
+                LASSERT (tx->tx_nsp == 1);
+                /* No RDMA: local completion happens now! */
+                CDEBUG(D_WARNING,"No data: immediate completion\n");
+                lib_finalize (&kibnal_lib, NULL, libmsg,
+                              status == 0 ? PTL_OK : PTL_FAIL);
+        }
+
+        /* +1 ref for this tx... */
+        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+               rx->rx_conn, rx->rx_conn->ibc_state, 
+               rx->rx_conn->ibc_peer->ibp_nid,
+               atomic_read (&rx->rx_conn->ibc_refcount));
+        atomic_inc (&rx->rx_conn->ibc_refcount);
+        /* ...and queue it up */
+        kibnal_queue_tx(tx, rx->rx_conn);
+}
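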
+
+static ptl_err_t
+kibnal_sendmsg(lib_nal_t    *nal, 
+                void         *private,
+                lib_msg_t    *libmsg,
+                ptl_hdr_t    *hdr, 
+                int           type, 
+                ptl_nid_t     nid, 
+                ptl_pid_t     pid,
+                unsigned int  payload_niov, 
+                struct iovec *payload_iov, 
+                ptl_kiov_t   *payload_kiov,
+                size_t        payload_offset,
+                size_t        payload_nob)
+{
+        kib_msg_t  *ibmsg;
+        kib_tx_t   *tx;
+        int         nob;
+
+        /* NB 'private' is different depending on what we're sending.... */
+
+        CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64
+               " pid %d\n", payload_nob, payload_niov, nid , pid);
+
+        LASSERT (payload_nob == 0 || payload_niov > 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+        /* Thread context if we're sending payload */
+        LASSERT (!in_interrupt() || payload_niov == 0);
+        /* payload is either all vaddrs or all pages */
+        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+
+        switch (type) {
+        default:
+                LBUG();
+                return (PTL_FAIL);
+                
+        case PTL_MSG_REPLY: {
+                /* reply's 'private' is the incoming receive */
+                kib_rx_t *rx = private;
+
+                /* RDMA reply expected? */
+                if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
+                        kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
+                                                 rx, libmsg, payload_niov, 
+                                                 payload_iov, payload_kiov,
+                                                 payload_offset, payload_nob);
+                        return (PTL_OK);
+                }
+                
+                /* Incoming message consistent with immediate reply? */
+                if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
+                        CERROR ("REPLY to "LPX64" bad msg type %d!!!\n",
+                                nid, rx->rx_msg->ibm_type);
+                        return (PTL_FAIL);
+                }
+
+                /* Will it fit in a message? */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+                if (nob >= IBNAL_MSG_SIZE) {
+                        CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n",
+                               nid, (int)payload_nob);
+                        return (PTL_FAIL);
+                }
+                break;
+        }
+
+        case PTL_MSG_GET:
+                /* might the REPLY message be big enough to need RDMA? */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
+                if (nob > IBNAL_MSG_SIZE)
+                        return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, 
+                                                          nid, libmsg, hdr));
+                break;
+
+        case PTL_MSG_ACK:
+                LASSERT (payload_nob == 0);
+                break;
+
+        case PTL_MSG_PUT:
+                /* Is the payload big enough to need RDMA? */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+                if (nob > IBNAL_MSG_SIZE)
+                        return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
+                                                          nid, libmsg, hdr));
+                
+                break;
+        }
+
+        tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
+                                  type == PTL_MSG_REPLY ||
+                                  in_interrupt()));
+        if (tx == NULL) {
+                CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", 
+                        type, nid, in_interrupt() ? " (intr)" : "");
+                return (PTL_NO_SPACE);
+        }
+
+        ibmsg = tx->tx_msg;
+        ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
+
+        if (payload_nob > 0) {
+                if (payload_kiov != NULL)
+                        lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
+                                          payload_niov, payload_kiov,
+                                          payload_offset, payload_nob);
+                else
+                        lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
+                                         payload_niov, payload_iov,
+                                         payload_offset, payload_nob);
+        }
+
+        kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
+                            offsetof(kib_immediate_msg_t, 
+                                     ibim_payload[payload_nob]));
+
+        /* libmsg gets finalized when tx completes */
+        tx->tx_libmsg[0] = libmsg;
+
+        kibnal_launch_tx(tx, nid);
+        return (PTL_OK);
+}
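+
+/* The send path above comes down to a size test: anything that fits
+ * in the preallocated IBNAL_MSG_SIZE buffer is copied and sent as
+ * IBNAL_MSG_IMMEDIATE; a larger PUT (or a GET whose reply would be
+ * larger) negotiates passive RDMA instead, i.e. it goes RDMA once
+ * offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[nob]) exceeds
+ * IBNAL_MSG_SIZE. */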
+
+static ptl_err_t
+kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
+               ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+               unsigned int payload_niov, struct iovec *payload_iov,
+               size_t payload_offset, size_t payload_len)
+{
+        return (kibnal_sendmsg(nal, private, cookie,
+                               hdr, type, nid, pid,
+                               payload_niov, payload_iov, NULL,
+                               payload_offset, payload_len));
+}
+
+static ptl_err_t
+kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
+                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                     unsigned int payload_niov, ptl_kiov_t *payload_kiov, 
+                     size_t payload_offset, size_t payload_len)
+{
+        return (kibnal_sendmsg(nal, private, cookie,
+                               hdr, type, nid, pid,
+                               payload_niov, NULL, payload_kiov,
+                               payload_offset, payload_len));
+}
+
+static ptl_err_t
+kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
+                 unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
+                 size_t offset, size_t mlen, size_t rlen)
+{
+        kib_rx_t    *rx = private;
+        kib_msg_t   *rxmsg = rx->rx_msg;
+        int          msg_nob;
+        
+        LASSERT (mlen <= rlen);
+        LASSERT (!in_interrupt ());
+        /* Either all pages or all vaddrs */
+        LASSERT (!(kiov != NULL && iov != NULL));
+
+        switch (rxmsg->ibm_type) {
+        default:
+                LBUG();
+                return (PTL_FAIL);
+                
+        case IBNAL_MSG_IMMEDIATE:
+                msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+                if (msg_nob > IBNAL_MSG_SIZE) {
+                        CERROR ("Immediate message from "LPX64" too big: %d\n",
+                                rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
+                        return (PTL_FAIL);
+                }
+
+                if (kiov != NULL)
+                        lib_copy_buf2kiov(niov, kiov, offset,
+                                          rxmsg->ibm_u.immediate.ibim_payload,
+                                          mlen);
+                else
+                        lib_copy_buf2iov(niov, iov, offset,
+                                         rxmsg->ibm_u.immediate.ibim_payload,
+                                         mlen);
+
+                lib_finalize (nal, NULL, libmsg, PTL_OK);
+                return (PTL_OK);
+
+        case IBNAL_MSG_GET_RDMA:
+                /* We get called here just to discard any junk after the
+                 * GET hdr. */
+                LASSERT (libmsg == NULL);
+                lib_finalize (nal, NULL, libmsg, PTL_OK);
+                return (PTL_OK);
+
+        case IBNAL_MSG_PUT_RDMA:
+                kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0,
+                                          rx, libmsg, 
+                                          niov, iov, kiov, offset, mlen);
+                return (PTL_OK);
+        }
+}
+
+static ptl_err_t
+kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
+              unsigned int niov, struct iovec *iov, 
+              size_t offset, size_t mlen, size_t rlen)
+{
+        return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
+                                offset, mlen, rlen));
+}
+
+static ptl_err_t
+kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
+                     unsigned int niov, ptl_kiov_t *kiov, 
+                     size_t offset, size_t mlen, size_t rlen)
+{
+        return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
+                                offset, mlen, rlen));
+}
+
+/*****************************************************************************
+ * the rest of this file concerns connection management.  active connections
+ * start with connect_peer, passive connections start with listen_callback.
+ * active disconnects start with conn_close, cm_callback starts passive
+ * disconnects and contains the guts of how the disconnect state machine
+ * progresses. 
+ *****************************************************************************/
+
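+/* Keep kib_nthreads up to date so shutdown can wait for all the
+ * daemons to exit */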
+int
+kibnal_thread_start (int (*fn)(void *arg), void *arg)
+{
+        long    pid = kernel_thread (fn, arg, 0);
+
+        if (pid < 0)
+                return ((int)pid);
+
+        atomic_inc (&kibnal_data.kib_nthreads);
+        return (0);
+}
+
+static void
+kibnal_thread_fini (void)
+{
+        atomic_dec (&kibnal_data.kib_nthreads);
+}
+
+/* this can be called by anyone at any time to close a connection.  if
+ * the connection is still established it heads to the connd to start
+ * the disconnection in a safe context.  It has no effect if called
+ * on a connection that is already disconnecting */
+void
+kibnal_close_conn_locked (kib_conn_t *conn, int error)
+{
+        /* This just does the immediate housekeeping, and schedules the
+         * connection for the connd to finish off.
+         * Caller holds kib_global_lock exclusively in irq context */
+        kib_peer_t   *peer = conn->ibc_peer;
+
+        KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_CONNECTING,
+                                    IBNAL_CONN_DISCONNECTED);
+
+        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
+                return; /* already disconnecting */
+
+        CDEBUG (error == 0 ? D_NET : D_ERROR,
+                "closing conn to "LPX64": error %d\n", peer->ibp_nid, error);
+
+        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+                /* kib_connd_conns takes ibc_list's ref */
+                list_del (&conn->ibc_list);
+        } else {
+                /* new ref for kib_connd_conns */
+                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                       atomic_read (&conn->ibc_refcount));
+                atomic_inc (&conn->ibc_refcount);
+        }
+        
+        if (list_empty (&peer->ibp_conns) &&
+            peer->ibp_persistence == 0) {
+                /* Non-persistent peer with no more conns... */
+                kibnal_unlink_peer_locked (peer);
+        }
+
+        conn->ibc_state = IBNAL_CONN_SEND_DREQ;
+
+        spin_lock (&kibnal_data.kib_connd_lock);
+
+        list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+        wake_up (&kibnal_data.kib_connd_waitq);
+                
+        spin_unlock (&kibnal_data.kib_connd_lock);
+}
+
+void
+kibnal_close_conn (kib_conn_t *conn, int error)
+{
+        unsigned long     flags;
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        kibnal_close_conn_locked (conn, error);
+        
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+}
+
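+/* A connection attempt to 'peer' failed: back off the reconnect
+ * interval and complete any transmits still blocked on the peer with
+ * -EHOSTUNREACH, unless another attempt is in flight */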
+static void
+kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
+{
+        LIST_HEAD        (zombies);
+        kib_tx_t         *tx;
+        unsigned long     flags;
+
+        LASSERT (rc != 0);
+        LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        LASSERT (peer->ibp_connecting != 0);
+        peer->ibp_connecting--;
+
+        if (peer->ibp_connecting != 0) {
+                /* another connection attempt under way (loopback?)... */
+                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+                return;
+        }
+
+        if (list_empty(&peer->ibp_conns)) {
+                /* Say when active connection can be re-attempted */
+                peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
+                /* Increase reconnection interval */
+                peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
+                                                    IBNAL_MAX_RECONNECT_INTERVAL);
+        
+                /* Take peer's blocked transmits; I'll complete
+                 * them with error */
+                while (!list_empty (&peer->ibp_tx_queue)) {
+                        tx = list_entry (peer->ibp_tx_queue.next,
+                                         kib_tx_t, tx_list);
+                        
+                        list_del (&tx->tx_list);
+                        list_add_tail (&tx->tx_list, &zombies);
+                }
+                
+                if (kibnal_peer_active(peer) &&
+                    (peer->ibp_persistence == 0)) {
+                        /* failed connection attempt on non-persistent peer */
+                        kibnal_unlink_peer_locked (peer);
+                }
+        } else {
+                /* Can't have blocked transmits if there are connections */
+                LASSERT (list_empty(&peer->ibp_tx_queue));
+        }
+        
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+        if (!list_empty (&zombies))
+                CERROR ("Deleting messages for "LPX64": connection failed\n",
+                        peer->ibp_nid);
+
+        while (!list_empty (&zombies)) {
+                tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+                list_del (&tx->tx_list);
+                /* complete now */
+                tx->tx_status = -EHOSTUNREACH;
+                kibnal_tx_done (tx);
+        }
+}
+
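+/* Connection attempt completion for both the active and passive paths.
+ * On success the conn becomes ESTABLISHED, the peer's blocked sends are
+ * queued on it and all its receive buffers are posted; on failure it
+ * heads for DISCONNECTED and the peer's blocked transmits complete with
+ * an error */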
+static void
+kibnal_connreq_done (kib_conn_t *conn, int active, int status)
+{
+        int               state = conn->ibc_state;
+        kib_peer_t       *peer = conn->ibc_peer;
+        kib_tx_t         *tx;
+        unsigned long     flags;
+        int               i;
+
+        /* passive connection has no connreq & vice versa */
+        LASSERTF(!active == !(conn->ibc_connreq != NULL),
+                 "%d %p\n", active, conn->ibc_connreq);
+        if (active) {
+                PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
+                conn->ibc_connreq = NULL;
+        }
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        LASSERT (peer->ibp_connecting != 0);
+        
+        if (status == 0) {                         
+                /* connection established... */
+                KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_CONNECTING);
+                conn->ibc_state = IBNAL_CONN_ESTABLISHED;
+
+                if (!kibnal_peer_active(peer)) {
+                        /* ...but peer deleted meantime */
+                        status = -ECONNABORTED;
+                }
+        } else {
+                KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_INIT_QP,
+                                            IBNAL_CONN_CONNECTING);
+        }
+
+        if (status == 0) {
+                /* Everything worked! */
+
+                peer->ibp_connecting--;
+
+                /* +1 ref for ibc_list; caller(== CM)'s ref remains until
+                 * the IB_CM_IDLE callback */
+                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                       atomic_read (&conn->ibc_refcount));
+                atomic_inc (&conn->ibc_refcount);
+                list_add (&conn->ibc_list, &peer->ibp_conns);
+                
+                /* reset reconnect interval for next attempt */
+                peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+
+                /* post blocked sends to the new connection */
+                spin_lock (&conn->ibc_lock);
+                
+                while (!list_empty (&peer->ibp_tx_queue)) {
+                        tx = list_entry (peer->ibp_tx_queue.next, 
+                                         kib_tx_t, tx_list);
+                        
+                        list_del (&tx->tx_list);
+
+                        /* +1 ref for each tx */
+                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                               atomic_read (&conn->ibc_refcount));
+                        atomic_inc (&conn->ibc_refcount);
+                        kibnal_queue_tx_locked (tx, conn);
+                }
+                
+                spin_unlock (&conn->ibc_lock);
+
+                /* Nuke any dangling conns from a different peer instance... */
+                kibnal_close_stale_conns_locked (conn->ibc_peer,
+                                                 conn->ibc_incarnation);
+
+                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+                /* queue up all the receives */
+                for (i = 0; i < IBNAL_RX_MSGS; i++) {
+                        /* +1 ref for rx desc */
+                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                               atomic_read (&conn->ibc_refcount));
+                        atomic_inc (&conn->ibc_refcount);
+
+                        CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n",
+                               i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg,
+                               conn->ibc_rxs[i].rx_vaddr);
+
+                        kibnal_post_rx (&conn->ibc_rxs[i], 0);
+                }
+
+                kibnal_check_sends (conn);
+                return;
+        }
+
+        /* connection failed */
+        if (state == IBNAL_CONN_CONNECTING) {
+                /* schedule for connd to close */
+                kibnal_close_conn_locked (conn, status);
+        } else {
+                /* Don't have a CM comm_id; just wait for refs to drain */
+                conn->ibc_state = IBNAL_CONN_DISCONNECTED;
+        } 
+
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+        kibnal_peer_connect_failed (conn->ibc_peer, active, status);
+
+        /* If we didn't establish the connection we don't have to pass
+         * through the disconnect protocol before dropping the CM ref */
+        if (state < IBNAL_CONN_CONNECTING) 
+                kibnal_put_conn (conn);
+}
+
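+/* Passive connection setup: sanity check the requested queue depth,
+ * then find or create the peer for 'nid' and bind a new conn to it in
+ * the CONNECTING state */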
+static int
+kibnal_accept (kib_conn_t **connp, IB_HANDLE *cep,
+                ptl_nid_t nid, __u64 incarnation, int queue_depth)
+{
+        kib_conn_t    *conn = kibnal_create_conn();
+        kib_peer_t    *peer;
+        kib_peer_t    *peer2;
+        unsigned long  flags;
+
+        if (conn == NULL)
+                return (-ENOMEM);
+
+        if (queue_depth != IBNAL_MSG_QUEUE_SIZE) {
+                CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n",
+                       nid, queue_depth, IBNAL_MSG_QUEUE_SIZE);
+                atomic_dec (&conn->ibc_refcount);
+                kibnal_destroy_conn(conn);
+                return (-EPROTO);
+        }
+        
+        /* assume 'nid' is a new peer */
+        peer = kibnal_create_peer (nid);
+        if (peer == NULL) {
+                CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                       atomic_read (&conn->ibc_refcount));
+                atomic_dec (&conn->ibc_refcount);
+                kibnal_destroy_conn(conn);
+                return (-ENOMEM);
+        }
+        
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        peer2 = kibnal_find_peer_locked(nid);
+        if (peer2 == NULL) {
+                /* peer table takes my ref on peer */
+                list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid));
+        } else {
+                kib_peer_decref (peer);
+                peer = peer2;
+        }
+
+        kib_peer_addref(peer); /* +1 ref for conn */
+        peer->ibp_connecting++;
+
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+        conn->ibc_peer = peer;
+        conn->ibc_state = IBNAL_CONN_CONNECTING;
+        /* conn->ibc_cep is set when cm_accept is called */
+        conn->ibc_incarnation = incarnation;
+        conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+
+        *connp = conn;
+        return (0);
+}
+
+static void kibnal_set_qp_state(IB_HANDLE *qp, IB_QP_STATE state)
+{
+        IB_QP_ATTRIBUTES_MODIFY modify_attr = {0,};
+        FSTATUS frc;
+
+        modify_attr.RequestState = state;
+
+        frc = iibt_qp_modify(qp, &modify_attr, NULL);
+        if (frc != FSUCCESS)
+                CERROR("couldn't set qp state to %d, error %d\n", state, frc);
+}
+
+static void kibnal_flush_pending(kib_conn_t *conn)
+{
+        LIST_HEAD        (zombies); 
+        struct list_head *tmp;
+        struct list_head *nxt;
+        kib_tx_t         *tx;
+        unsigned long     flags;
+        int               done;
+
+        /* NB we wait until the connection has closed before completing
+         * outstanding passive RDMAs so we can be sure the network can't 
+         * touch the mapped memory any more. */
+        KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_DISCONNECTED);
+
+        /* set the QP to the error state so that we get flush callbacks
+         * on our posted receives which can then drop their conn refs */
+        kibnal_set_qp_state(conn->ibc_qp, QPStateError);
+
+        spin_lock_irqsave (&conn->ibc_lock, flags);
+
+        /* grab passive RDMAs not waiting for the tx callback */
+        list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
+                tx = list_entry (tmp, kib_tx_t, tx_list);
+
+                LASSERT (tx->tx_passive_rdma ||
+                         !tx->tx_passive_rdma_wait);
+
+                LASSERT (tx->tx_passive_rdma_wait ||
+                         tx->tx_sending != 0);
+
+                /* still waiting for tx callback? */
+                if (!tx->tx_passive_rdma_wait)
+                        continue;
+
+                tx->tx_status = -ECONNABORTED;
+                tx->tx_passive_rdma_wait = 0;
+                done = (tx->tx_sending == 0);
+
+                if (!done)
+                        continue;
+
+                list_del (&tx->tx_list);
+                list_add (&tx->tx_list, &zombies);
+        }
+
+        /* grab all blocked transmits */
+        list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
+                tx = list_entry (tmp, kib_tx_t, tx_list);
+                
+                list_del (&tx->tx_list);
+                list_add (&tx->tx_list, &zombies);
+        }
+        
+        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+        while (!list_empty(&zombies)) {
+                tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+                list_del(&tx->tx_list);
+                kibnal_tx_done (tx);
+        }
+}
+
+static void
+kibnal_reject (IB_HANDLE cep, uint16_t reason)
+{
+        CM_REJECT_INFO *rej;
+
+        PORTAL_ALLOC(rej, sizeof(*rej));
+        if (rej == NULL) /* PORTAL_ALLOC() will CERROR on failure */
+                return;  
+
+        rej->Reason = reason;
+        iibt_cm_reject(cep, rej);
+        PORTAL_FREE(rej, sizeof(*rej));
+}
+
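+/* Take the QP to RTS in two modify steps: first RTR (receive PSN,
+ * destination QP and address vector from the path record), then RTS
+ * (send PSN, ack timeout and retry counts) */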
+static FSTATUS
+kibnal_qp_rts(IB_HANDLE qp_handle, __u32 qpn, __u8 resp_res, 
+              IB_PATH_RECORD *path, __u8 init_depth, __u32 send_psn)
+{
+        IB_QP_ATTRIBUTES_MODIFY modify_attr;
+        FSTATUS frc;
+        ENTRY;
+
+        modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+                .RequestState           = QPStateReadyToRecv,
+                .RecvPSN                = IBNAL_STARTING_PSN,
+                .DestQPNumber           = qpn,
+                .ResponderResources     = resp_res,
+                .MinRnrTimer            = UsecToRnrNakTimer(2000), /* 20 ms */
+                .Attrs                  = (IB_QP_ATTR_RECVPSN |
+                                           IB_QP_ATTR_DESTQPNUMBER | 
+                                           IB_QP_ATTR_RESPONDERRESOURCES | 
+                                           IB_QP_ATTR_DESTAV | 
+                                           IB_QP_ATTR_PATHMTU | 
+                                           IB_QP_ATTR_MINRNRTIMER),
+        };
+        GetAVFromPath(0, path, &modify_attr.PathMTU, NULL, 
+                      &modify_attr.DestAV);
+
+        frc = iibt_qp_modify(qp_handle, &modify_attr, NULL);
+        if (frc != FSUCCESS) 
+                RETURN(frc);
+
+        modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+                .RequestState           = QPStateReadyToSend,
+                .FlowControl            = TRUE,
+                .InitiatorDepth         = init_depth,
+                .SendPSN                = send_psn,
+                .LocalAckTimeout        = path->PktLifeTime + 2, /* 2 or 1? */
+                .RetryCount             = IBNAL_RETRY,
+                .RnrRetryCount          = IBNAL_RNR_RETRY,
+                .Attrs                  = (IB_QP_ATTR_FLOWCONTROL | 
+                                           IB_QP_ATTR_INITIATORDEPTH | 
+                                           IB_QP_ATTR_SENDPSN | 
+                                           IB_QP_ATTR_LOCALACKTIMEOUT | 
+                                           IB_QP_ATTR_RETRYCOUNT | 
+                                           IB_QP_ATTR_RNRRETRYCOUNT),
+        };
+
+        frc = iibt_qp_modify(qp_handle, &modify_attr, NULL);
+        RETURN(frc);
+}
+
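+/* Active connection, final step: the CM delivered the peer's reply.
+ * Validate the wire connreq it carries, take the QP to RTS and accept
+ * to complete the handshake */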
+static void
+kibnal_connect_reply (IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+        IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
+        kib_conn_t *conn = arg;
+        kib_wire_connreq_t *wcr;
+        CM_REPLY_INFO *rep = &info->Info.Reply;
+        uint16_t reason;
+        FSTATUS frc;
+
+        wcr = (kib_wire_connreq_t *)info->Info.Reply.PrivateData;
+
+        if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
+                CERROR ("Can't connect "LPX64": bad magic %08x\n",
+                        conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic));
+                GOTO(reject, reason = RC_USER_REJ);
+        }
+        
+        if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
+                CERROR ("Can't connect "LPX64": bad version %d\n",
+                        conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_magic));
+                GOTO(reject, reason = RC_USER_REJ);
+        }
+                        
+        if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
+                CERROR ("Can't connect "LPX64": bad queue depth %d\n",
+                        conn->ibc_peer->ibp_nid, 
+                        le16_to_cpu(wcr->wcr_queue_depth));
+                GOTO(reject, reason = RC_USER_REJ);
+        }
+                        
+        if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) {
+                CERROR ("Unexpected NID "LPX64" from "LPX64"\n",
+                        le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid);
+                GOTO(reject, reason = RC_USER_REJ);
+        }
+
+        CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n",
+               conn, conn->ibc_peer->ibp_nid);
+
+        conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation);
+        conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+
+        frc = kibnal_qp_rts(conn->ibc_qp, rep->QPN, 
+                            min_t(__u8, rep->ArbInitiatorDepth,
+                                  ca_attr->MaxQPResponderResources),
+                            &conn->ibc_connreq->cr_path, 
+                            min_t(__u8, rep->ArbResponderResources,
+                                  ca_attr->MaxQPInitiatorDepth),
+                            rep->StartingPSN);
+        if (frc != FSUCCESS) {
+                CERROR("Connection %p -> "LPX64" QP RTS/RTR failed: %d\n",
+                       conn, conn->ibc_peer->ibp_nid, frc);
+                GOTO(reject, reason = RC_NO_QP);
+        }
+
+        /* the callback arguments are ignored for an active accept */
+        conn->ibc_connreq->cr_discarded.Status = FSUCCESS;
+        frc = iibt_cm_accept(cep, &conn->ibc_connreq->cr_discarded, 
+                             NULL, NULL, NULL, NULL);
+        if (frc != FCM_CONNECT_ESTABLISHED) {
+                CERROR("Connection %p -> "LPX64" CMAccept failed: %d\n",
+                       conn, conn->ibc_peer->ibp_nid, frc);
+                kibnal_connreq_done (conn, 1, -ECONNABORTED);
+                /* XXX don't call reject after accept fails? */
+                return;
+        }
+
+        CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n",
+               conn, conn->ibc_peer->ibp_nid);
+
+        kibnal_connreq_done (conn, 1, 0);
+        return;
+
+reject:
+        kibnal_reject(cep, reason);
+        kibnal_connreq_done (conn, 1, -EPROTO);
+}
+
+/* ib_cm.h has a wealth of information on the CM procedures */
+static void
+kibnal_cm_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+        kib_conn_t       *conn = arg;
+
+        CDEBUG(D_NET, "status 0x%x\n", info->Status);
+
+        /* Established Connection Notifier */
+        switch (info->Status) {
+        default:
+                CERROR("unknown status %d on Connection %p -> "LPX64"\n",
+                       info->Status, conn, conn->ibc_peer->ibp_nid);
+                LBUG();
+                break;
+
+        case FCM_CONNECT_REPLY:
+                kibnal_connect_reply(cep, info, arg);
+                break;
+
+        case FCM_DISCONNECT_REQUEST:
+                /* XXX lock around these state management bits? */
+                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
+                        kibnal_close_conn (conn, 0);
+                conn->ibc_state = IBNAL_CONN_DREP;
+                iibt_cm_disconnect(conn->ibc_cep, NULL, NULL);
+                break;
+
+        /* these both guarantee that no more cm callbacks will occur */
+        case FCM_DISCONNECTED: /* aka FCM_DISCONNECT_TIMEOUT */
+        case FCM_DISCONNECT_REPLY:
+                CDEBUG(D_NET, "Connection %p -> "LPX64" disconnect done.\n",
+                       conn, conn->ibc_peer->ibp_nid);
+
+                conn->ibc_state = IBNAL_CONN_DISCONNECTED;
+                kibnal_flush_pending(conn);
+                kibnal_put_conn(conn);        /* Lose CM's ref */
+                break;
+        }
+
+        return;
+}
+
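+/* Request the CM's timewait callback for this CEP; NB the async accept
+ * flag stays disabled below */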
+static int
+kibnal_set_cm_flags(IB_HANDLE cep)
+{
+        FSTATUS frc;
+        uint32 value = 1;
+
+        frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK,
+                                 (char *)&value, sizeof(value), 0);
+        if (frc != FSUCCESS) {
+                CERROR("error setting timeout callback: %d\n", frc);
+                return -1;
+        }
+
+#if 0
+        frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, (char *)&value,
+                                 sizeof(value), 0);
+        if (frc != FSUCCESS) {
+                CERROR("error setting async accept: %d\n", frc);
+                return -1;
+        }
+#endif
+
+        return 0;
+}
+
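+/* Passive connection, step 1: a connection request arrived on the
+ * listening CEP.  Validate the peer's wire connreq, set up the conn and
+ * its QP, then accept with our own wire connreq as the reply's private
+ * data */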
+void
+kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+        IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
+        IB_QP_ATTRIBUTES_QUERY *query;
+        CM_REQUEST_INFO    *req;
+        CM_CONN_INFO       *rep = NULL, *rcv = NULL;
+        kib_wire_connreq_t *wcr;
+        kib_conn_t         *conn = NULL;
+        uint16_t            reason = 0;
+        FSTATUS             frc;
+        int                 rc = 0;
+        
+        LASSERT(cep);
+        LASSERT(info);
+        LASSERT(arg == NULL); /* no conn yet for passive */
+
+        CDEBUG(D_NET, "status 0x%x\n", info->Status);
+
+        req = &info->Info.Request;
+        wcr = (kib_wire_connreq_t *)req->PrivateData;
+
+        CDEBUG(D_NET, "%d from "LPX64"\n", info->Status, 
+               le64_to_cpu(wcr->wcr_nid));
+        
+        if (info->Status == FCM_CONNECT_CANCEL)
+                return;
+        
+        LASSERT (info->Status == FCM_CONNECT_REQUEST);
+        
+        if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
+                CERROR ("Can't accept: bad magic %08x\n",
+                        le32_to_cpu(wcr->wcr_magic));
+                GOTO(out, reason = RC_USER_REJ);
+        }
+
+        if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
+                CERROR ("Can't accept: bad version %d\n",
+                        le16_to_cpu(wcr->wcr_magic));
+                GOTO(out, reason = RC_USER_REJ);
+        }
+
+        rc = kibnal_accept(&conn, cep,
+                           le64_to_cpu(wcr->wcr_nid),
+                           le64_to_cpu(wcr->wcr_incarnation),
+                           le16_to_cpu(wcr->wcr_queue_depth));
+        if (rc != 0) {
+                CERROR ("Can't accept "LPX64": %d\n",
+                        le64_to_cpu(wcr->wcr_nid), rc);
+                GOTO(out, reason = RC_NO_RESOURCES);
+        }
+
+        frc = kibnal_qp_rts(conn->ibc_qp, req->CEPInfo.QPN,
+                            min_t(__u8, req->CEPInfo.OfferedInitiatorDepth, 
+                                  ca_attr->MaxQPResponderResources),
+                            &req->PathInfo.Path,
+                            min_t(__u8, req->CEPInfo.OfferedResponderResources, 
+                                  ca_attr->MaxQPInitiatorDepth),
+                            req->CEPInfo.StartingPSN);
+
+        if (frc != FSUCCESS) {
+                CERROR ("Can't mark QP RTS/RTR  "LPX64": %d\n",
+                        le64_to_cpu(wcr->wcr_nid), frc);
+                GOTO(out, reason = RC_NO_QP);
+        }
+
+        frc = iibt_qp_query(conn->ibc_qp, &conn->ibc_qp_attrs, NULL);
+        if (frc != FSUCCESS) {
+                CERROR ("Couldn't query qp attributes "LPX64": %d\n",
+                        le64_to_cpu(wcr->wcr_nid), frc);
+                GOTO(out, reason = RC_NO_QP);
+        }
+        query = &conn->ibc_qp_attrs;
+
+        PORTAL_ALLOC(rep, sizeof(*rep));
+        PORTAL_ALLOC(rcv, sizeof(*rcv));
+        if (rep == NULL || rcv == NULL) {
+                CERROR ("can't reply and receive buffers\n");
+                GOTO(out, reason = RC_INSUFFICIENT_RESP_RES);
+        }
+
+        /* don't try to deref this into the incoming wcr :) */
+        wcr = (kib_wire_connreq_t *)rep->Info.Reply.PrivateData;
+
+        rep->Info.Reply = (CM_REPLY_INFO) {
+                .QPN = query->QPNumber,
+                .QKey = query->Qkey,
+                .StartingPSN = query->RecvPSN,
+                .EndToEndFlowControl = query->FlowControl,
+                /* XXX Hmm. */
+                .ArbInitiatorDepth = query->InitiatorDepth,
+                .ArbResponderResources = query->ResponderResources,
+                .TargetAckDelay = 0,
+                .FailoverAccepted = 0,
+                .RnRRetryCount = req->CEPInfo.RnrRetryCount,
+        };
+                
+        *wcr = (kib_wire_connreq_t) {
+                .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
+                .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
+                .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
+                .wcr_nid         = cpu_to_le64(kibnal_data.kib_nid),
+                .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
+        };
+
+        frc = iibt_cm_accept(cep, rep, rcv, kibnal_cm_callback, conn, 
+                             &conn->ibc_cep);
+
+        PORTAL_FREE(rep, sizeof(*rep));
+        PORTAL_FREE(rcv, sizeof(*rcv));
+
+        if (frc != FCM_CONNECT_ESTABLISHED) {
+                /* XXX it seems we don't call reject after this point? */
+                CERROR("iibt_cm_accept() failed: %d, aborting\n", frc);
+                rc = -ECONNABORTED;
+                goto out;
+        }
+
+        if (kibnal_set_cm_flags(conn->ibc_cep)) {
+                rc = -ECONNABORTED;
+                goto out;
+        }
+
+        CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n",
+               conn, conn->ibc_peer->ibp_nid);
+
+out:
+        if (reason) {
+                kibnal_reject(cep, reason);
+                rc = -ECONNABORTED;
+        }
+        if (conn != NULL) 
+                kibnal_connreq_done(conn, 0, rc);
+
+        return;
+}
+
+static void
+dump_path_records(PATH_RESULTS *results)
+{
+        IB_PATH_RECORD *path;
+        int i;
+
+        for(i = 0; i < results->NumPathRecords; i++) {
+                path = &results->PathRecords[i];
+                CDEBUG(D_NET, "%d: sgid "LPX64":"LPX64" dgid "
+                       LPX64":"LPX64" pkey %x\n",
+                       i,
+                       path->SGID.Type.Global.SubnetPrefix,
+                       path->SGID.Type.Global.InterfaceID,
+                       path->DGID.Type.Global.SubnetPrefix,
+                       path->DGID.Type.Global.InterfaceID,
+                       path->P_Key);
+        }
+}
+
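+/* Active connection, step 3: the path record query completed.  Build
+ * the CM request around the first path returned and connect; the CM
+ * replies via kibnal_cm_callback */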
+static void
+kibnal_pathreq_callback (void *arg, QUERY *query, 
+                         QUERY_RESULT_VALUES *query_res)
+{
+        IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
+        kib_conn_t *conn = arg;
+        PATH_RESULTS *path;
+        FSTATUS frc;
+        
+        if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) {
+                CERROR ("status %d data size %d\n", query_res->Status,
+                        query_res->ResultDataSize);
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        path = (PATH_RESULTS *)query_res->QueryResult;
+
+        if (path->NumPathRecords < 1) {
+                CERROR ("expected path records: %d\n", path->NumPathRecords);
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        dump_path_records(path);
+
+        /* just using the first.  this is probably a horrible idea. */
+        conn->ibc_connreq->cr_path = path->PathRecords[0];
+
+        conn->ibc_cep = iibt_cm_create_cep(CM_RC_TYPE);
+        if (conn->ibc_cep == NULL) {
+                CERROR ("Can't create CEP\n");
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        if (kibnal_set_cm_flags(conn->ibc_cep)) {
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) {
+                .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
+                .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
+                .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
+                .wcr_nid         = cpu_to_le64(kibnal_data.kib_nid),
+                .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
+        };
+
+        conn->ibc_connreq->cr_cmreq = (CM_REQUEST_INFO) {
+                .SID = conn->ibc_connreq->cr_service.RID.ServiceID,
+                .CEPInfo = (CM_CEP_INFO) { 
+                        .CaGUID = kibnal_data.kib_hca_guids[0],
+                        .EndToEndFlowControl = FALSE,
+                        .PortGUID = conn->ibc_connreq->cr_path.SGID.Type.Global.InterfaceID,
+                        .RetryCount = IBNAL_RETRY,
+                        .RnrRetryCount = IBNAL_RNR_RETRY,
+                        .AckTimeout = IBNAL_ACK_TIMEOUT,
+                        .StartingPSN = IBNAL_STARTING_PSN,
+                        .QPN = conn->ibc_qp_attrs.QPNumber,
+                        .QKey = conn->ibc_qp_attrs.Qkey,
+                        .OfferedResponderResources = ca_attr->MaxQPResponderResources,
+                        .OfferedInitiatorDepth = ca_attr->MaxQPInitiatorDepth,
+                },
+                .PathInfo = (CM_CEP_PATHINFO) {
+                        .bSubnetLocal = TRUE,
+                        .Path = conn->ibc_connreq->cr_path,
+                },
+        };
+
+#if 0
+        /* XXX set timeout just like SDP!!!*/
+        conn->ibc_connreq->cr_path.packet_life = 13;
+#endif
+        /* Flag I'm getting involved with the CM... */
+        conn->ibc_state = IBNAL_CONN_CONNECTING;
+
+        CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n",
+               conn->ibc_connreq->cr_service.RID.ServiceID, 
+               *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
+
+        memset(conn->ibc_connreq->cr_cmreq.PrivateData, 0, 
+               CM_REQUEST_INFO_USER_LEN);
+        memcpy(conn->ibc_connreq->cr_cmreq.PrivateData, 
+               &conn->ibc_connreq->cr_wcr, sizeof(conn->ibc_connreq->cr_wcr));
+
+        /* kibnal_cm_callback gets my conn ref */
+        frc = iibt_cm_connect(conn->ibc_cep, &conn->ibc_connreq->cr_cmreq,
+                              kibnal_cm_callback, conn);
+        if (frc != FPENDING && frc != FSUCCESS) {
+                CERROR ("Connect: %d\n", frc);
+                /* Back out state change as connect failed */
+                conn->ibc_state = IBNAL_CONN_INIT_QP;
+                kibnal_connreq_done (conn, 1, -EINVAL);
+        }
+}
+
+static void
+dump_service_records(SERVICE_RECORD_RESULTS *results)
+{
+        IB_SERVICE_RECORD *svc;
+        int i;
+
+        for(i = 0; i < results->NumServiceRecords; i++) {
+                svc = &results->ServiceRecords[i];
+                CDEBUG(D_NET, "%d: sid "LPX64" gid "LPX64":"LPX64" pkey %x\n",
+                       i,
+                       svc->RID.ServiceID,
+                       svc->RID.ServiceGID.Type.Global.SubnetPrefix,
+                       svc->RID.ServiceGID.Type.Global.InterfaceID,
+                       svc->RID.ServiceP_Key);
+        }
+}
+
+
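+/* Active connection, step 2: the service record query completed.
+ * Record the peer's service entry and chain into a path record query
+ * for its GID */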
+static void
+kibnal_service_get_callback (void *arg, QUERY *query, 
+                             QUERY_RESULT_VALUES *query_res)
+{
+        kib_conn_t *conn = arg;
+        SERVICE_RECORD_RESULTS *svc;
+        COMMAND_CONTROL_PARAMETERS sd_params;
+        QUERY   path_query;
+        FSTATUS frc;
+        
+        if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) {
+                CERROR ("status %d data size %d\n", query_res->Status,
+                        query_res->ResultDataSize);
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        svc = (SERVICE_RECORD_RESULTS *)query_res->QueryResult;
+
+        if (svc->NumServiceRecords < 1) {
+                CERROR ("%d service records\n", svc->NumServiceRecords);
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        dump_service_records(svc);
+
+        conn->ibc_connreq->cr_service = svc->ServiceRecords[0];
+
+        CDEBUG(D_NET, "Got status %d, service id "LPX64", on "LPX64"\n",
+               query_res->Status , conn->ibc_connreq->cr_service.RID.ServiceID, 
+               *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
+
+        memset(&path_query, 0, sizeof(path_query));
+        path_query.InputType = InputTypePortGuidPair;
+        path_query.OutputType = OutputTypePathRecord;
+        path_query.InputValue.PortGuidPair.SourcePortGuid = kibnal_data.kib_port_guid;
+        path_query.InputValue.PortGuidPair.DestPortGuid  = conn->ibc_connreq->cr_service.RID.ServiceGID.Type.Global.InterfaceID;
+
+        memset(&sd_params, 0, sizeof(sd_params));
+        sd_params.RetryCount = IBNAL_RETRY;
+        sd_params.Timeout = 10 * 1000;   /* wait 10 seconds */
+
+        /* kibnal_service_get_callback gets my conn ref */
+
+        frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
+                                                    kibnal_data.kib_port_guid,
+                                                    &path_query, 
+                                                    kibnal_pathreq_callback,
+                                                    &sd_params, conn);
+        if (frc == FPENDING)
+                return;
+
+        CERROR ("Path record request failed: %d\n", frc);
+        kibnal_connreq_done (conn, 1, -EINVAL);
+}
+
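+/* Active connection, step 1: resolve the peer's NID to a service
+ * record with an SD fabric query; the rest of the connect happens in
+ * the callback chain */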
+static void
+kibnal_connect_peer (kib_peer_t *peer)
+{
+        COMMAND_CONTROL_PARAMETERS sd_params;
+        QUERY   query;
+        FSTATUS frc;
+        kib_conn_t  *conn = kibnal_create_conn();
+
+        LASSERT (peer->ibp_connecting != 0);
+
+        if (conn == NULL) {
+                CERROR ("Can't allocate conn\n");
+                kibnal_peer_connect_failed (peer, 1, -ENOMEM);
+                return;
+        }
+
+        conn->ibc_peer = peer;
+        kib_peer_addref(peer);
+
+        PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
+        if (conn->ibc_connreq == NULL) {
+                CERROR ("Can't allocate connreq\n");
+                kibnal_connreq_done (conn, 1, -ENOMEM);
+                return;
+        }
+
+        memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq));
+
+        kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);
+
+        memset(&query, 0, sizeof(query));
+        query.InputType = InputTypeServiceRecord;
+        query.OutputType = OutputTypeServiceRecord;
+        query.InputValue.ServiceRecordValue.ServiceRecord = conn->ibc_connreq->cr_service;
+        query.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
+
+        memset(&sd_params, 0, sizeof(sd_params));
+        sd_params.RetryCount = IBNAL_RETRY;
+        sd_params.Timeout = 10 * 1000;   /* wait 10 seconds */
+
+        /* kibnal_service_get_callback gets my conn ref */
+        frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
+                                                    kibnal_data.kib_port_guid,
+                                                    &query,
+                                                    kibnal_service_get_callback,
+                                                    &sd_params, conn);
+        if (frc == FPENDING)
+                return;
+
+        CERROR ("iibt_sd_query_port_fabric_information(): %d\n", frc);
+        kibnal_connreq_done (conn, 1, -EINVAL); /* frc is an FSTATUS, not an errno */
+}
+
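+/* A conn has timed out if any queued or active tx is past its
+ * deadline */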
+static int
+kibnal_conn_timed_out (kib_conn_t *conn)
+{
+        kib_tx_t          *tx;
+        struct list_head  *ttmp;
+        unsigned long      flags;
+
+        spin_lock_irqsave (&conn->ibc_lock, flags);
+
+        list_for_each (ttmp, &conn->ibc_tx_queue) {
+                tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+                LASSERT (!tx->tx_passive_rdma_wait);
+                LASSERT (tx->tx_sending == 0);
+
+                if (time_after_eq (jiffies, tx->tx_deadline)) {
+                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        return 1;
+                }
+        }
+
+        list_for_each (ttmp, &conn->ibc_active_txs) {
+                tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+                LASSERT (tx->tx_passive_rdma ||
+                         !tx->tx_passive_rdma_wait);
+
+                LASSERT (tx->tx_passive_rdma_wait ||
+                         tx->tx_sending != 0);
+
+                if (time_after_eq (jiffies, tx->tx_deadline)) {
+                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        return 1;
+                }
+        }
+
+        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+        return 0;
+}
+
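+/* Sweep one peer hash bucket for timed-out connections, dropping the
+ * global lock (and restarting the walk) whenever one must be closed */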
+static void
+kibnal_check_conns (int idx)
+{
+        struct list_head  *peers = &kibnal_data.kib_peers[idx];
+        struct list_head  *ptmp;
+        kib_peer_t        *peer;
+        kib_conn_t        *conn;
+        struct list_head  *ctmp;
+
+ again:
+        /* NB. We expect to have a look at all the peers and not find any
+         * rdmas to time out, so we just use a shared lock while we
+         * take a look... */
+        read_lock (&kibnal_data.kib_global_lock);
+
+        list_for_each (ptmp, peers) {
+                peer = list_entry (ptmp, kib_peer_t, ibp_list);
+
+                list_for_each (ctmp, &peer->ibp_conns) {
+                        conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+                        KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_ESTABLISHED);
+
+                        /* In case we have enough credits to return via a
+                         * NOOP, but there were no non-blocking tx descs
+                         * free to do it last time... */
+                        kibnal_check_sends(conn);
+
+                        if (!kibnal_conn_timed_out(conn))
+                                continue;
+                        
+                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                               conn, conn->ibc_state, peer->ibp_nid,
+                               atomic_read (&conn->ibc_refcount));
+
+                        atomic_inc (&conn->ibc_refcount);
+                        read_unlock (&kibnal_data.kib_global_lock);
+
+                        CERROR("Timed out RDMA with "LPX64"\n",
+                               peer->ibp_nid);
+
+                        kibnal_close_conn (conn, -ETIMEDOUT);
+                        kibnal_put_conn (conn);
+
+                        /* start again now I've dropped the lock */
+                        goto again;
+                }
+        }
+
+        read_unlock (&kibnal_data.kib_global_lock);
+}
+
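+/* connd's half of the disconnect state machine: free a conn whose refs
+ * have drained, send the DREQ for a conn the close path handed over,
+ * or stand aside if a CM callback got there first */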
+static void
+kib_connd_handle_state(kib_conn_t *conn)
+{
+        FSTATUS frc;
+
+        switch (conn->ibc_state) {
+                /* all refs have gone, free and be done with it */ 
+                case IBNAL_CONN_DISCONNECTED:
+                        kibnal_destroy_conn (conn);
+                        return; /* avoid put_conn */
+
+                case IBNAL_CONN_SEND_DREQ:
+                        frc = iibt_cm_disconnect(conn->ibc_cep, NULL, NULL);
+                        if (frc != FSUCCESS) /* XXX do real things */
+                                CERROR("disconnect failed: %d\n", frc);
+                        conn->ibc_state = IBNAL_CONN_DREQ;
+                        break;
+
+                /* a callback got to the conn before we did */ 
+                case IBNAL_CONN_DREP:
+                        break;
+                                
+                default:
+                        CERROR ("Bad conn %p state: %d\n", conn, 
+                                conn->ibc_state);
+                        LBUG();
+                        break;
+        }
+
+        /* drop ref from close_conn */
+        kibnal_put_conn(conn);
+}
+
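+/* The connd thread finishes off closing connections, initiates queued
+ * peer connects and periodically scans a chunk of the peer table for
+ * RDMA timeouts */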
+int
+kibnal_connd (void *arg)
+{
+        wait_queue_t       wait;
+        unsigned long      flags;
+        kib_conn_t        *conn;
+        kib_peer_t        *peer;
+        int                timeout;
+        int                i;
+        int                peer_index = 0;
+        unsigned long      deadline = jiffies;
+        
+        kportal_daemonize ("kibnal_connd");
+        kportal_blockallsigs ();
+
+        init_waitqueue_entry (&wait, current);
+
+        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+
+        for (;;) {
+                if (!list_empty (&kibnal_data.kib_connd_conns)) {
+                        conn = list_entry (kibnal_data.kib_connd_conns.next,
+                                           kib_conn_t, ibc_list);
+                        list_del (&conn->ibc_list);
+                        
+                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+                        kib_connd_handle_state(conn);
+
+                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+                        continue;
+                }
+
+                if (!list_empty (&kibnal_data.kib_connd_peers)) {
+                        peer = list_entry (kibnal_data.kib_connd_peers.next,
+                                           kib_peer_t, ibp_connd_list);
+                        
+                        list_del_init (&peer->ibp_connd_list);
+                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+                        kibnal_connect_peer (peer);
+                        kib_peer_decref (peer);
+
+                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+                }
+
+                /* shut down and nobody left to reap... */
+                if (kibnal_data.kib_shutdown &&
+                    atomic_read(&kibnal_data.kib_nconns) == 0)
+                        break;
+
+                spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+                /* careful with the jiffy wrap... */
+                while ((timeout = (int)(deadline - jiffies)) <= 0) {
+                        const int n = 4;
+                        const int p = 1;
+                        int       chunk = kibnal_data.kib_peer_hash_size;
+                        
+                        /* Time to check for RDMA timeouts on a few more
+                         * peers: I do checks every 'p' seconds on a
+                         * proportion of the peer table and I need to check
+                         * every connection 'n' times within a timeout
+                         * interval, to ensure I detect a timeout on any
+                         * connection within (n+1)/n times the timeout
+                         * interval. */
+
+                        if (kibnal_tunables.kib_io_timeout > n * p)
+                                chunk = (chunk * n * p) / 
+                                        kibnal_tunables.kib_io_timeout;
+                        if (chunk == 0)
+                                chunk = 1;
+
+                        for (i = 0; i < chunk; i++) {
+                                kibnal_check_conns (peer_index);
+                                peer_index = (peer_index + 1) % 
+                                             kibnal_data.kib_peer_hash_size;
+                        }
+
+                        deadline += p * HZ;
+                }
+
+                kibnal_data.kib_connd_waketime = jiffies + timeout;
+
+                set_current_state (TASK_INTERRUPTIBLE);
+                add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
+
+                if (!kibnal_data.kib_shutdown &&
+                    list_empty (&kibnal_data.kib_connd_conns) &&
+                    list_empty (&kibnal_data.kib_connd_peers))
+                        schedule_timeout (timeout);
+
+                set_current_state (TASK_RUNNING);
+                remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
+
+                spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+        }
+
+        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+        kibnal_thread_fini ();
+        return (0);
+}
+
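+/* Scheduler threads drain kib_sched_txq and kib_sched_rxq, completing
+ * transmits and handling received messages in thread context */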
+int
+kibnal_scheduler(void *arg)
+{
+        long            id = (long)arg;
+        char            name[16];
+        kib_rx_t       *rx;
+        kib_tx_t       *tx;
+        unsigned long   flags;
+        int             rc;
+        int             counter = 0;
+        int             did_something;
+
+        snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
+        kportal_daemonize(name);
+        kportal_blockallsigs();
+
+        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+
+        for (;;) {
+                did_something = 0;
+
+                while (!list_empty(&kibnal_data.kib_sched_txq)) {
+                        tx = list_entry(kibnal_data.kib_sched_txq.next,
+                                        kib_tx_t, tx_list);
+                        list_del(&tx->tx_list);
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+                                               flags);
+                        kibnal_tx_done(tx);
+
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+                                          flags);
+                }
+
+                if (!list_empty(&kibnal_data.kib_sched_rxq)) {
+                        rx = list_entry(kibnal_data.kib_sched_rxq.next,
+                                        kib_rx_t, rx_list);
+                        list_del(&rx->rx_list);
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+                                               flags);
+
+                        kibnal_rx(rx);
+
+                        did_something = 1;
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+                                          flags);
+                }
+
+                /* shut down and no receives to complete... */
+                if (kibnal_data.kib_shutdown &&
+                    atomic_read(&kibnal_data.kib_nconns) == 0)
+                        break;
+
+                /* nothing to do or hogging CPU */
+                if (!did_something || counter++ == IBNAL_RESCHED) {
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+                                               flags);
+                        counter = 0;
+
+                        if (!did_something) {
+                                rc = wait_event_interruptible(
+                                        kibnal_data.kib_sched_waitq,
+                                        !list_empty(&kibnal_data.kib_sched_txq) || 
+                                        !list_empty(&kibnal_data.kib_sched_rxq) || 
+                                        (kibnal_data.kib_shutdown &&
+                                         atomic_read (&kibnal_data.kib_nconns) == 0));
+                        } else {
+                                our_cond_resched();
+                        }
+
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+                                          flags);
+                }
+        }
+
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+
+        kibnal_thread_fini();
+        return (0);
+}
+
+
+lib_nal_t kibnal_lib = {
+        libnal_data:        &kibnal_data,      /* NAL private data */
+        libnal_send:         kibnal_send,
+        libnal_send_pages:   kibnal_send_pages,
+        libnal_recv:         kibnal_recv,
+        libnal_recv_pages:   kibnal_recv_pages,
+        libnal_dist:         kibnal_dist
+};
diff --git a/lustre/portals/knals/openibnal/.cvsignore b/lustre/portals/knals/openibnal/.cvsignore
new file mode 100644 (file)
index 0000000..5ed596b
--- /dev/null
@@ -0,0 +1,10 @@
+.deps
+Makefile
+.*.cmd
+autoMakefile.in
+autoMakefile
+*.ko
+*.mod.c
+.*.flags
+.tmp_versions
+.depend
index 6f66143..652eb34 100644 (file)
 
 #include "openibnal.h"
 
-nal_t                   koibnal_api;
-ptl_handle_ni_t         koibnal_ni;
-koib_data_t             koibnal_data;
-koib_tunables_t         koibnal_tunables;
+nal_t                   kibnal_api;
+ptl_handle_ni_t         kibnal_ni;
+kib_data_t              kibnal_data;
+kib_tunables_t          kibnal_tunables;
 
 #ifdef CONFIG_SYSCTL
-#define OPENIBNAL_SYSCTL        202
+#define IBNAL_SYSCTL             202
 
-#define OPENIBNAL_SYSCTL_TIMEOUT     1
-#define OPENIBNAL_SYSCTL_ZERO_COPY   2
+#define IBNAL_SYSCTL_TIMEOUT     1
 
-static ctl_table koibnal_ctl_table[] = {
-        {OPENIBNAL_SYSCTL_TIMEOUT, "timeout", 
-         &koibnal_tunables.koib_io_timeout, sizeof (int),
+static ctl_table kibnal_ctl_table[] = {
+        {IBNAL_SYSCTL_TIMEOUT, "timeout", 
+         &kibnal_tunables.kib_io_timeout, sizeof (int),
          0644, NULL, &proc_dointvec},
         { 0 }
 };
 
-static ctl_table koibnal_top_ctl_table[] = {
-        {OPENIBNAL_SYSCTL, "openibnal", NULL, 0, 0555, koibnal_ctl_table},
+static ctl_table kibnal_top_ctl_table[] = {
+        {IBNAL_SYSCTL, "openibnal", NULL, 0, 0555, kibnal_ctl_table},
         { 0 }
 };
 #endif
@@ -66,167 +65,183 @@ print_service(struct ib_common_attrib_service *service, char *tag, int rc)
               "service id: "LPX64"\n"
               "name      : %s\n"
               "NID       : "LPX64"\n", tag, rc,
-              service->service_id, name, service->service_data64[0]);
+              service->service_id, name, 
+              *kibnal_service_nid_field(service));
 }
 
 void
-koibnal_service_setunset_done (tTS_IB_CLIENT_QUERY_TID tid, int status,
+kibnal_service_setunset_done (tTS_IB_CLIENT_QUERY_TID tid, int status,
                                struct ib_common_attrib_service *service, void *arg)
 {
         *(int *)arg = status;
-        up (&koibnal_data.koib_nid_signal);
+        up (&kibnal_data.kib_nid_signal);
 }
 
+#if IBNAL_CHECK_ADVERT
+void
+kibnal_check_advert (void)
+{
+        struct ib_common_attrib_service *svc;
+        __u64   tid;
+        int     rc;
+        int     rc2;
+
+        PORTAL_ALLOC(svc, sizeof(*svc));
+        if (svc == NULL)
+                return;
+
+        memset (svc, 0, sizeof (*svc));
+        kibnal_set_service_keys(svc, kibnal_data.kib_nid);
+
+        rc = ib_service_get (kibnal_data.kib_device, 
+                             kibnal_data.kib_port,
+                             svc,
+                             KIBNAL_SERVICE_KEY_MASK,
+                             kibnal_tunables.kib_io_timeout * HZ,
+                             kibnal_service_setunset_done, &rc2, 
+                             &tid);
+
+        if (rc != 0) {
+                CERROR ("Immediate error %d checking SM service\n", rc);
+        } else {
+                down (&kibnal_data.kib_nid_signal);
+                rc = rc2;
+
+                if (rc != 0)
+                        CERROR ("Error %d checking SM service\n", rc);
+        }
+
+        PORTAL_FREE(svc, sizeof(*svc));
+}
+#endif
+
 int
-koibnal_advertise (void)
+kibnal_advertise (void)
 {
+        struct ib_common_attrib_service *svc;
         __u64   tid;
         int     rc;
         int     rc2;
 
-        LASSERT (koibnal_data.koib_nid != PTL_NID_ANY);
+        LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+
+        PORTAL_ALLOC(svc, sizeof(*svc));
+        if (svc == NULL)
+                return (-ENOMEM);
 
-        memset (&koibnal_data.koib_service, 0, 
-                sizeof (koibnal_data.koib_service));
+        memset (svc, 0, sizeof (*svc));
         
-        koibnal_data.koib_service.service_id
-                = koibnal_data.koib_cm_service_id;
+        svc->service_id = kibnal_data.kib_service_id;
 
-        rc = ib_cached_gid_get(koibnal_data.koib_device,
-                               koibnal_data.koib_port,
+        rc = ib_cached_gid_get(kibnal_data.kib_device,
+                               kibnal_data.kib_port,
                                0,
-                               koibnal_data.koib_service.service_gid);
+                               svc->service_gid);
         if (rc != 0) {
                 CERROR ("Can't get port %d GID: %d\n",
-                        koibnal_data.koib_port, rc);
-                return (rc);
+                        kibnal_data.kib_port, rc);
+                goto out;
         }
         
-        rc = ib_cached_pkey_get(koibnal_data.koib_device,
-                                koibnal_data.koib_port,
+        rc = ib_cached_pkey_get(kibnal_data.kib_device,
+                                kibnal_data.kib_port,
                                 0,
-                                &koibnal_data.koib_service.service_pkey);
+                                &svc->service_pkey);
         if (rc != 0) {
                 CERROR ("Can't get port %d PKEY: %d\n",
-                        koibnal_data.koib_port, rc);
-                return (rc);
+                        kibnal_data.kib_port, rc);
+                goto out;
         }
         
-        koibnal_data.koib_service.service_lease = 0xffffffff;
+        svc->service_lease = 0xffffffff;
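+        /* indefinite lease: the advert persists until explicitly deleted */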
 
-        koibnal_set_service_keys(&koibnal_data.koib_service, koibnal_data.koib_nid);
+        kibnal_set_service_keys(svc, kibnal_data.kib_nid);
 
         CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n", 
-               koibnal_data.koib_service.service_id,
-               koibnal_data.koib_service.service_name, 
-               *koibnal_service_nid_field(&koibnal_data.koib_service));
+               svc->service_id, 
+               svc->service_name, *kibnal_service_nid_field(svc));
 
-        rc = ib_service_set (koibnal_data.koib_device,
-                             koibnal_data.koib_port,
-                             &koibnal_data.koib_service,
+        rc = ib_service_set (kibnal_data.kib_device,
+                             kibnal_data.kib_port,
+                             svc,
                              IB_SA_SERVICE_COMP_MASK_ID |
                              IB_SA_SERVICE_COMP_MASK_GID |
                              IB_SA_SERVICE_COMP_MASK_PKEY |
                              IB_SA_SERVICE_COMP_MASK_LEASE |
-                             KOIBNAL_SERVICE_KEY_MASK,
-                             koibnal_tunables.koib_io_timeout * HZ,
-                             koibnal_service_setunset_done, &rc2, &tid);
+                             KIBNAL_SERVICE_KEY_MASK,
+                             kibnal_tunables.kib_io_timeout * HZ,
+                             kibnal_service_setunset_done, &rc2, &tid);
 
-        if (rc == 0) {
-                down (&koibnal_data.koib_nid_signal);
-                rc = rc2;
+        if (rc != 0) {
+                CERROR ("Immediate error %d advertising NID "LPX64"\n",
+                        rc, kibnal_data.kib_nid);
+                goto out;
         }
-        
-        if (rc != 0)
-                CERROR ("Error %d advertising SM service\n", rc);
 
+        down (&kibnal_data.kib_nid_signal);
+
+        rc = rc2;
+        if (rc != 0)
+                CERROR ("Error %d advertising NID "LPX64"\n", 
+                        rc, kibnal_data.kib_nid);
+ out:
+        PORTAL_FREE(svc, sizeof(*svc));
         return (rc);
 }
 
-int
-koibnal_unadvertise (int expect_success)
+void
+kibnal_unadvertise (int expect_success)
 {
+        struct ib_common_attrib_service *svc;
         __u64   tid;
         int     rc;
         int     rc2;
 
-        LASSERT (koibnal_data.koib_nid != PTL_NID_ANY);
+        LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
 
-        memset (&koibnal_data.koib_service, 0,
-                sizeof (koibnal_data.koib_service));
+        PORTAL_ALLOC(svc, sizeof(*svc));
+        if (svc == NULL)
+                return;
 
-        koibnal_set_service_keys(&koibnal_data.koib_service, koibnal_data.koib_nid);
+        memset (svc, 0, sizeof(*svc));
+
+        kibnal_set_service_keys(svc, kibnal_data.kib_nid);
 
         CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n",
-               koibnal_data.koib_service.service_name,
-               *koibnal_service_nid_field(&koibnal_data.koib_service));
-
-        rc = ib_service_delete (koibnal_data.koib_device,
-                                koibnal_data.koib_port,
-                                &koibnal_data.koib_service,
-                                KOIBNAL_SERVICE_KEY_MASK,
-                                koibnal_tunables.koib_io_timeout * HZ,
-                                koibnal_service_setunset_done, &rc2, &tid);
+               svc->service_name, *kibnal_service_nid_field(svc));
+
+        rc = ib_service_delete (kibnal_data.kib_device,
+                                kibnal_data.kib_port,
+                                svc,
+                                KIBNAL_SERVICE_KEY_MASK,
+                                kibnal_tunables.kib_io_timeout * HZ,
+                                kibnal_service_setunset_done, &rc2, &tid);
         if (rc != 0) {
                 CERROR ("Immediate error %d unadvertising NID "LPX64"\n",
-                        rc, koibnal_data.koib_nid);
-                return (rc);
+                        rc, kibnal_data.kib_nid);
+                goto out;
         }
 
-        down (&koibnal_data.koib_nid_signal);
+        down (&kibnal_data.kib_nid_signal);
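+        /* the deletion "succeeded" if its outcome matches the caller's
+         * expectation: removed when expect_success, or already absent when
+         * clearing out a possible stale advert */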
         
         if ((rc2 == 0) == !!expect_success)
-                return (0);
+                goto out;                       /* outcome matched expectation */
 
         if (expect_success)
                 CERROR("Error %d unadvertising NID "LPX64"\n",
-                        rc, koibnal_data.koib_nid);
+                       rc2, kibnal_data.kib_nid);
         else
                 CWARN("Removed conflicting NID "LPX64"\n",
-                      koibnal_data.koib_nid);
-
-        return (rc);
-}
-
-int
-koibnal_check_advert (void)
-{
-        __u64   tid;
-        int     rc;
-        int     rc2;
-
-        static struct ib_common_attrib_service srv;
-
-        memset (&srv, 0, sizeof (srv));
-
-        koibnal_set_service_keys(&srv, koibnal_data.koib_nid);
-
-        rc = ib_service_get (koibnal_data.koib_device, 
-                             koibnal_data.koib_port,
-                             &srv,
-                             KOIBNAL_SERVICE_KEY_MASK,
-                             koibnal_tunables.koib_io_timeout * HZ,
-                             koibnal_service_setunset_done, &rc2, 
-                             &tid);
-
-        if (rc != 0) {
-                CERROR ("Immediate error %d checking SM service\n", rc);
-        } else {
-                down (&koibnal_data.koib_nid_signal);
-                rc = rc2;
-
-                if (rc != 0)
-                        CERROR ("Error %d checking SM service\n", rc);
-        }
-
-        return (rc);
+                      kibnal_data.kib_nid);
+ out:
+        PORTAL_FREE(svc, sizeof(*svc));
 }
 
 int
-koibnal_set_mynid(ptl_nid_t nid)
+kibnal_set_mynid(ptl_nid_t nid)
 {
         struct timeval tv;
-        lib_ni_t      *ni = &koibnal_lib.libnal_ni;
+        lib_ni_t      *ni = &kibnal_lib.libnal_ni;
         int            rc;
 
         CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
@@ -234,75 +249,76 @@ koibnal_set_mynid(ptl_nid_t nid)
 
         do_gettimeofday(&tv);
 
-        down (&koibnal_data.koib_nid_mutex);
+        down (&kibnal_data.kib_nid_mutex);
 
-        if (nid == koibnal_data.koib_nid) {
+        if (nid == kibnal_data.kib_nid) {
                 /* no change of NID */
-                up (&koibnal_data.koib_nid_mutex);
+                up (&kibnal_data.kib_nid_mutex);
                 return (0);
         }
 
         CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
-               koibnal_data.koib_nid, nid);
+               kibnal_data.kib_nid, nid);
         
-        if (koibnal_data.koib_nid != PTL_NID_ANY) {
+        if (kibnal_data.kib_nid != PTL_NID_ANY) {
 
-                koibnal_unadvertise (1);
+                kibnal_unadvertise (1);
 
-                rc = ib_cm_listen_stop (koibnal_data.koib_listen_handle);
+                rc = ib_cm_listen_stop (kibnal_data.kib_listen_handle);
                 if (rc != 0)
                         CERROR ("Error %d stopping listener\n", rc);
         }
         
-        koibnal_data.koib_nid = ni->ni_pid.nid = nid;
-        koibnal_data.koib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+        kibnal_data.kib_nid = ni->ni_pid.nid = nid;
+        kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
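+        /* the incarnation (usecs since epoch) stamps this NID instance;
+         * peers use it to spot and close connections left over from a
+         * previous instance (see kibnal_close_stale_conns_locked) */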
         
         /* Delete all existing peers and their connections after new
          * NID/incarnation set to ensure no old connections in our brave
          * new world. */
-        koibnal_del_peer (PTL_NID_ANY, 0);
-
-        rc = 0;
-        if (koibnal_data.koib_nid != PTL_NID_ANY) {
-                /* New NID installed */
+        kibnal_del_peer (PTL_NID_ANY, 0);
 
-                /* remove any previous advert (crashed node etc) */
-                koibnal_unadvertise(0);
+        if (kibnal_data.kib_nid == PTL_NID_ANY) {
+                /* No new NID to install */
+                up (&kibnal_data.kib_nid_mutex);
+                return (0);
+        }
+        
+        /* remove any previous advert (crashed node etc) */
+        kibnal_unadvertise(0);
 
-                /* Assign new service number */
-                koibnal_data.koib_cm_service_id = ib_cm_service_assign();
-                CDEBUG(D_NET, "service_id "LPX64"\n", koibnal_data.koib_cm_service_id);
+        /* Assign new service number */
+        kibnal_data.kib_service_id = ib_cm_service_assign();
+        CDEBUG(D_NET, "service_id "LPX64"\n", kibnal_data.kib_service_id);
         
-                rc = ib_cm_listen(koibnal_data.koib_cm_service_id,
-                                  TS_IB_CM_SERVICE_EXACT_MASK,
-                                  koibnal_passive_conn_callback, NULL,
-                                  &koibnal_data.koib_listen_handle);
-                if (rc != 0) {
-                        CERROR ("ib_cm_listen error: %d\n", rc);
-                        goto out;
+        rc = ib_cm_listen(kibnal_data.kib_service_id,
+                          TS_IB_CM_SERVICE_EXACT_MASK,
+                          kibnal_passive_conn_callback, NULL,
+                          &kibnal_data.kib_listen_handle);
+        if (rc == 0) {
+                rc = kibnal_advertise();
+                if (rc == 0) {
+#if IBNAL_CHECK_ADVERT
+                        kibnal_check_advert();
+#endif
+                        up (&kibnal_data.kib_nid_mutex);
+                        return (0);
                 }
 
-                rc = koibnal_advertise();
-
-                koibnal_check_advert();
-        }
-        
- out:
-        if (rc != 0) {
-                koibnal_data.koib_nid = PTL_NID_ANY;
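+                /* advertising failed: tear the new listener back down */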
+                ib_cm_listen_stop(kibnal_data.kib_listen_handle);
                 /* remove any peers that sprung up while I failed to
                  * advertise myself */
-                koibnal_del_peer (PTL_NID_ANY, 0);
+                kibnal_del_peer (PTL_NID_ANY, 0);
         }
-
-        up (&koibnal_data.koib_nid_mutex);
-        return (0);
+        
+        kibnal_data.kib_nid = PTL_NID_ANY;
+        up (&kibnal_data.kib_nid_mutex);
+        return (rc);
 }
 
-koib_peer_t *
-koibnal_create_peer (ptl_nid_t nid)
+kib_peer_t *
+kibnal_create_peer (ptl_nid_t nid)
 {
-        koib_peer_t *peer;
+        kib_peer_t *peer;
 
         LASSERT (nid != PTL_NID_ANY);
 
@@ -320,20 +336,20 @@ koibnal_create_peer (ptl_nid_t nid)
         INIT_LIST_HEAD (&peer->ibp_tx_queue);
 
         peer->ibp_reconnect_time = jiffies;
-        peer->ibp_reconnect_interval = OPENIBNAL_MIN_RECONNECT_INTERVAL;
+        peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
 
-        atomic_inc (&koibnal_data.koib_npeers);
+        atomic_inc (&kibnal_data.kib_npeers);
         return (peer);
 }
 
 void
-koibnal_destroy_peer (koib_peer_t *peer)
+kibnal_destroy_peer (kib_peer_t *peer)
 {
         CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ibp_nid, peer);
 
         LASSERT (atomic_read (&peer->ibp_refcount) == 0);
         LASSERT (peer->ibp_persistence == 0);
-        LASSERT (!koibnal_peer_active(peer));
+        LASSERT (!kibnal_peer_active(peer));
         LASSERT (peer->ibp_connecting == 0);
         LASSERT (list_empty (&peer->ibp_conns));
         LASSERT (list_empty (&peer->ibp_tx_queue));
@@ -344,11 +360,11 @@ koibnal_destroy_peer (koib_peer_t *peer)
          * they are destroyed, so we can be assured that _all_ state to do
          * with this peer has been cleaned up when its refcount drops to
          * zero. */
-        atomic_dec (&koibnal_data.koib_npeers);
+        atomic_dec (&kibnal_data.kib_npeers);
 }
 
 void
-koibnal_put_peer (koib_peer_t *peer)
+kibnal_put_peer (kib_peer_t *peer)
 {
         CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n",
                 peer, peer->ibp_nid,
@@ -358,19 +374,19 @@ koibnal_put_peer (koib_peer_t *peer)
         if (!atomic_dec_and_test (&peer->ibp_refcount))
                 return;
 
-        koibnal_destroy_peer (peer);
+        kibnal_destroy_peer (peer);
 }
 
-koib_peer_t *
-koibnal_find_peer_locked (ptl_nid_t nid)
+kib_peer_t *
+kibnal_find_peer_locked (ptl_nid_t nid)
 {
-        struct list_head *peer_list = koibnal_nid2peerlist (nid);
+        struct list_head *peer_list = kibnal_nid2peerlist (nid);
         struct list_head *tmp;
-        koib_peer_t      *peer;
+        kib_peer_t       *peer;
 
         list_for_each (tmp, peer_list) {
 
-                peer = list_entry (tmp, koib_peer_t, ibp_list);
+                peer = list_entry (tmp, kib_peer_t, ibp_list);
 
                 LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
                          peer->ibp_connecting != 0 || /* creating conns */
@@ -386,46 +402,46 @@ koibnal_find_peer_locked (ptl_nid_t nid)
         return (NULL);
 }
 
-koib_peer_t *
-koibnal_get_peer (ptl_nid_t nid)
+kib_peer_t *
+kibnal_get_peer (ptl_nid_t nid)
 {
-        koib_peer_t     *peer;
+        kib_peer_t     *peer;
 
-        read_lock (&koibnal_data.koib_global_lock);
-        peer = koibnal_find_peer_locked (nid);
+        read_lock (&kibnal_data.kib_global_lock);
+        peer = kibnal_find_peer_locked (nid);
         if (peer != NULL)                       /* +1 ref for caller? */
                 atomic_inc (&peer->ibp_refcount);
-        read_unlock (&koibnal_data.koib_global_lock);
+        read_unlock (&kibnal_data.kib_global_lock);
 
         return (peer);
 }
 
 void
-koibnal_unlink_peer_locked (koib_peer_t *peer)
+kibnal_unlink_peer_locked (kib_peer_t *peer)
 {
         LASSERT (peer->ibp_persistence == 0);
         LASSERT (list_empty(&peer->ibp_conns));
 
-        LASSERT (koibnal_peer_active(peer));
+        LASSERT (kibnal_peer_active(peer));
         list_del_init (&peer->ibp_list);
         /* lose peerlist's ref */
-        koibnal_put_peer (peer);
+        kibnal_put_peer (peer);
 }
 
 int
-koibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
+kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
 {
-        koib_peer_t       *peer;
+        kib_peer_t        *peer;
         struct list_head  *ptmp;
         int                i;
 
-        read_lock (&koibnal_data.koib_global_lock);
+        read_lock (&kibnal_data.kib_global_lock);
 
-        for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) {
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
 
-                list_for_each (ptmp, &koibnal_data.koib_peers[i]) {
+                list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
                         
-                        peer = list_entry (ptmp, koib_peer_t, ibp_list);
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
                                  peer->ibp_connecting != 0 ||
                                  !list_empty (&peer->ibp_conns));
@@ -436,53 +452,53 @@ koibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
                         *nidp = peer->ibp_nid;
                         *persistencep = peer->ibp_persistence;
                         
-                        read_unlock (&koibnal_data.koib_global_lock);
+                        read_unlock (&kibnal_data.kib_global_lock);
                         return (0);
                 }
         }
 
-        read_unlock (&koibnal_data.koib_global_lock);
+        read_unlock (&kibnal_data.kib_global_lock);
         return (-ENOENT);
 }
 
 int
-koibnal_add_persistent_peer (ptl_nid_t nid)
+kibnal_add_persistent_peer (ptl_nid_t nid)
 {
         unsigned long      flags;
-        koib_peer_t       *peer;
-        koib_peer_t       *peer2;
+        kib_peer_t        *peer;
+        kib_peer_t        *peer2;
         
         if (nid == PTL_NID_ANY)
                 return (-EINVAL);
 
-        peer = koibnal_create_peer (nid);
+        peer = kibnal_create_peer (nid);
         if (peer == NULL)
                 return (-ENOMEM);
 
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
-        peer2 = koibnal_find_peer_locked (nid);
+        peer2 = kibnal_find_peer_locked (nid);
         if (peer2 != NULL) {
-                koibnal_put_peer (peer);
+                kibnal_put_peer (peer);
                 peer = peer2;
         } else {
                 /* peer table takes existing ref on peer */
                 list_add_tail (&peer->ibp_list,
-                               koibnal_nid2peerlist (nid));
+                               kibnal_nid2peerlist (nid));
         }
 
         peer->ibp_persistence++;
         
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
         return (0);
 }
 
 void
-koibnal_del_peer_locked (koib_peer_t *peer, int single_share)
+kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
 {
         struct list_head *ctmp;
         struct list_head *cnxt;
-        koib_conn_t      *conn;
+        kib_conn_t       *conn;
 
         if (!single_share)
                 peer->ibp_persistence = 0;
@@ -493,38 +509,38 @@ koibnal_del_peer_locked (koib_peer_t *peer, int single_share)
                 return;
 
         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
-                conn = list_entry(ctmp, koib_conn_t, ibc_list);
+                conn = list_entry(ctmp, kib_conn_t, ibc_list);
 
-                koibnal_close_conn_locked (conn, 0);
+                kibnal_close_conn_locked (conn, 0);
         }
 
         /* NB peer unlinks itself when last conn is closed */
 }
 
 int
-koibnal_del_peer (ptl_nid_t nid, int single_share)
+kibnal_del_peer (ptl_nid_t nid, int single_share)
 {
         unsigned long      flags;
         struct list_head  *ptmp;
         struct list_head  *pnxt;
-        koib_peer_t      *peer;
+        kib_peer_t        *peer;
         int                lo;
         int                hi;
         int                i;
         int                rc = -ENOENT;
 
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
         if (nid != PTL_NID_ANY)
-                lo = hi = koibnal_nid2peerlist(nid) - koibnal_data.koib_peers;
+                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
         else {
                 lo = 0;
-                hi = koibnal_data.koib_peer_hash_size - 1;
+                hi = kibnal_data.kib_peer_hash_size - 1;
         }
 
         for (i = lo; i <= hi; i++) {
-                list_for_each_safe (ptmp, pnxt, &koibnal_data.koib_peers[i]) {
-                        peer = list_entry (ptmp, koib_peer_t, ibp_list);
+                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
                                  peer->ibp_connecting != 0 ||
                                  !list_empty (&peer->ibp_conns));
@@ -532,7 +548,7 @@ koibnal_del_peer (ptl_nid_t nid, int single_share)
                         if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
                                 continue;
 
-                        koibnal_del_peer_locked (peer, single_share);
+                        kibnal_del_peer_locked (peer, single_share);
                         rc = 0;         /* matched something */
 
                         if (single_share)
@@ -540,26 +556,26 @@ koibnal_del_peer (ptl_nid_t nid, int single_share)
                 }
         }
  out:
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
         return (rc);
 }
 
-koib_conn_t *
-koibnal_get_conn_by_idx (int index)
+kib_conn_t *
+kibnal_get_conn_by_idx (int index)
 {
-        koib_peer_t       *peer;
+        kib_peer_t        *peer;
         struct list_head  *ptmp;
-        koib_conn_t       *conn;
+        kib_conn_t        *conn;
         struct list_head  *ctmp;
         int                i;
 
-        read_lock (&koibnal_data.koib_global_lock);
+        read_lock (&kibnal_data.kib_global_lock);
 
-        for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) {
-                list_for_each (ptmp, &koibnal_data.koib_peers[i]) {
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+                list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
 
-                        peer = list_entry (ptmp, koib_peer_t, ibp_list);
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence > 0 ||
                                  peer->ibp_connecting != 0 ||
                                  !list_empty (&peer->ibp_conns));
@@ -568,25 +584,25 @@ koibnal_get_conn_by_idx (int index)
                                 if (index-- > 0)
                                         continue;
 
-                                conn = list_entry (ctmp, koib_conn_t, ibc_list);
+                                conn = list_entry (ctmp, kib_conn_t, ibc_list);
                                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                                        conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
                                        atomic_read (&conn->ibc_refcount));
                                 atomic_inc (&conn->ibc_refcount);
-                                read_unlock (&koibnal_data.koib_global_lock);
+                                read_unlock (&kibnal_data.kib_global_lock);
                                 return (conn);
                         }
                 }
         }
 
-        read_unlock (&koibnal_data.koib_global_lock);
+        read_unlock (&kibnal_data.kib_global_lock);
         return (NULL);
 }
 
-koib_conn_t *
-koibnal_create_conn (void)
+kib_conn_t *
+kibnal_create_conn (void)
 {
-        koib_conn_t *conn;
+        kib_conn_t  *conn;
         int          i;
         __u64        vaddr = 0;
         __u64        vaddr_base;
@@ -608,57 +624,57 @@ koibnal_create_conn (void)
         memset (conn, 0, sizeof (*conn));
 
         INIT_LIST_HEAD (&conn->ibc_tx_queue);
-        INIT_LIST_HEAD (&conn->ibc_rdma_queue);
+        INIT_LIST_HEAD (&conn->ibc_active_txs);
         spin_lock_init (&conn->ibc_lock);
         
-        atomic_inc (&koibnal_data.koib_nconns);
+        atomic_inc (&kibnal_data.kib_nconns);
         /* well not really, but I call destroy() on failure, which decrements */
 
-        PORTAL_ALLOC (conn->ibc_rxs, OPENIBNAL_RX_MSGS * sizeof (koib_rx_t));
+        PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
         if (conn->ibc_rxs == NULL)
                 goto failed;
-        memset (conn->ibc_rxs, 0, OPENIBNAL_RX_MSGS * sizeof(koib_rx_t));
+        memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
 
-        rc = koibnal_alloc_pages(&conn->ibc_rx_pages,
-                                 OPENIBNAL_RX_MSG_PAGES,
-                                 IB_ACCESS_LOCAL_WRITE);
+        rc = kibnal_alloc_pages(&conn->ibc_rx_pages,
+                                IBNAL_RX_MSG_PAGES,
+                                IB_ACCESS_LOCAL_WRITE);
         if (rc != 0)
                 goto failed;
 
-        vaddr_base = vaddr = conn->ibc_rx_pages->oibp_vaddr;
+        vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
 
-        for (i = ipage = page_offset = 0; i < OPENIBNAL_RX_MSGS; i++) {
-                struct page *page = conn->ibc_rx_pages->oibp_pages[ipage];
-                koib_rx_t   *rx = &conn->ibc_rxs[i];
+        for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
+                struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
+                kib_rx_t    *rx = &conn->ibc_rxs[i];
 
                 rx->rx_conn = conn;
                 rx->rx_vaddr = vaddr;
-                rx->rx_msg = (koib_msg_t *)(((char *)page_address(page)) + page_offset);
+                rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
                 
-                vaddr += OPENIBNAL_MSG_SIZE;
-                LASSERT (vaddr <= vaddr_base + OPENIBNAL_RX_MSG_BYTES);
+                vaddr += IBNAL_MSG_SIZE;
+                LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
                 
-                page_offset += OPENIBNAL_MSG_SIZE;
+                page_offset += IBNAL_MSG_SIZE;
                 LASSERT (page_offset <= PAGE_SIZE);
 
                 if (page_offset == PAGE_SIZE) {
                         page_offset = 0;
                         ipage++;
-                        LASSERT (ipage <= OPENIBNAL_RX_MSG_PAGES);
+                        LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
                 }
         }
 
         params.qp_create = (struct ib_qp_create_param) {
                 .limit = {
                         /* Sends have an optional RDMA */
-                        .max_outstanding_send_request    = 2 * OPENIBNAL_MSG_QUEUE_SIZE,
-                        .max_outstanding_receive_request = OPENIBNAL_MSG_QUEUE_SIZE,
+                        .max_outstanding_send_request    = 2 * IBNAL_MSG_QUEUE_SIZE,
+                        .max_outstanding_receive_request = IBNAL_MSG_QUEUE_SIZE,
                         .max_send_gather_element         = 1,
                         .max_receive_scatter_element     = 1,
                 },
-                .pd              = koibnal_data.koib_pd,
-                .send_queue      = koibnal_data.koib_tx_cq,
-                .receive_queue   = koibnal_data.koib_rx_cq,
+                .pd              = kibnal_data.kib_pd,
+                .send_queue      = kibnal_data.kib_cq,
+                .receive_queue   = kibnal_data.kib_cq,
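+                /* send and receive completions now share a single CQ */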
                 .send_policy     = IB_WQ_SIGNAL_SELECTABLE,
                 .receive_policy  = IB_WQ_SIGNAL_SELECTABLE,
                 .rd_domain       = 0,
@@ -673,11 +689,11 @@ koibnal_create_conn (void)
         }
         
         /* Mark QP created */
-        conn->ibc_state = OPENIBNAL_CONN_INIT_QP;
+        conn->ibc_state = IBNAL_CONN_INIT_QP;
 
         params.qp_attr = (struct ib_qp_attribute) {
                 .state             = IB_QP_STATE_INIT,
-                .port              = koibnal_data.koib_port,
+                .port              = kibnal_data.kib_port,
                 .enable_rdma_read  = 1,
                 .enable_rdma_write = 1,
                 .valid_fields      = (IB_QP_ATTRIBUTE_STATE |
@@ -696,12 +712,12 @@ koibnal_create_conn (void)
         return (conn);
         
  failed:
-        koibnal_destroy_conn (conn);
+        kibnal_destroy_conn (conn);
         return (NULL);
 }
 
 void
-koibnal_destroy_conn (koib_conn_t *conn)
+kibnal_destroy_conn (kib_conn_t *conn)
 {
         int    rc;
         
@@ -709,21 +725,21 @@ koibnal_destroy_conn (koib_conn_t *conn)
 
         LASSERT (atomic_read (&conn->ibc_refcount) == 0);
         LASSERT (list_empty(&conn->ibc_tx_queue));
-        LASSERT (list_empty(&conn->ibc_rdma_queue));
+        LASSERT (list_empty(&conn->ibc_active_txs));
         LASSERT (conn->ibc_nsends_posted == 0);
         LASSERT (conn->ibc_connreq == NULL);
 
         switch (conn->ibc_state) {
-        case OPENIBNAL_CONN_ZOMBIE:
+        case IBNAL_CONN_ZOMBIE:
                 /* called after connection sequence initiated */
 
-        case OPENIBNAL_CONN_INIT_QP:
+        case IBNAL_CONN_INIT_QP:
                 rc = ib_qp_destroy(conn->ibc_qp);
                 if (rc != 0)
                         CERROR("Can't destroy QP: %d\n", rc);
                 /* fall through */
                 
-        case OPENIBNAL_CONN_INIT_NOTHING:
+        case IBNAL_CONN_INIT_NOTHING:
                 break;
 
         default:
@@ -731,30 +747,30 @@ koibnal_destroy_conn (koib_conn_t *conn)
         }
 
         if (conn->ibc_rx_pages != NULL) 
-                koibnal_free_pages(conn->ibc_rx_pages);
+                kibnal_free_pages(conn->ibc_rx_pages);
         
         if (conn->ibc_rxs != NULL)
                 PORTAL_FREE(conn->ibc_rxs, 
-                            OPENIBNAL_RX_MSGS * sizeof(koib_rx_t));
+                            IBNAL_RX_MSGS * sizeof(kib_rx_t));
 
         if (conn->ibc_peer != NULL)
-                koibnal_put_peer(conn->ibc_peer);
+                kibnal_put_peer(conn->ibc_peer);
 
         PORTAL_FREE(conn, sizeof (*conn));
 
-        atomic_dec(&koibnal_data.koib_nconns);
+        atomic_dec(&kibnal_data.kib_nconns);
         
-        if (atomic_read (&koibnal_data.koib_nconns) == 0 &&
-            koibnal_data.koib_shutdown) {
+        if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
+            kibnal_data.kib_shutdown) {
                 /* I just nuked the last connection on shutdown; wake up
                  * everyone so they can exit. */
-                wake_up_all(&koibnal_data.koib_sched_waitq);
-                wake_up_all(&koibnal_data.koib_connd_waitq);
+                wake_up_all(&kibnal_data.kib_sched_waitq);
+                wake_up_all(&kibnal_data.kib_connd_waitq);
         }
 }
 
 void
-koibnal_put_conn (koib_conn_t *conn)
+kibnal_put_conn (kib_conn_t *conn)
 {
         unsigned long flags;
 
@@ -767,44 +783,44 @@ koibnal_put_conn (koib_conn_t *conn)
                 return;
 
         /* last ref only goes on zombies */
-        LASSERT (conn->ibc_state == OPENIBNAL_CONN_ZOMBIE);
+        LASSERT (conn->ibc_state == IBNAL_CONN_ZOMBIE);
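+        /* pass the zombie to connd for destruction; final teardown can
+         * block and we may be in interrupt context here */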
 
-        spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
 
-        list_add (&conn->ibc_list, &koibnal_data.koib_connd_conns);
-        wake_up (&koibnal_data.koib_connd_waitq);
+        list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+        wake_up (&kibnal_data.kib_connd_waitq);
 
-        spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 }
 
 int
-koibnal_close_peer_conns_locked (koib_peer_t *peer, int why)
+kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
 {
-        koib_conn_t        *conn;
+        kib_conn_t         *conn;
         struct list_head   *ctmp;
         struct list_head   *cnxt;
         int                 count = 0;
 
         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
-                conn = list_entry (ctmp, koib_conn_t, ibc_list);
+                conn = list_entry (ctmp, kib_conn_t, ibc_list);
 
                 count++;
-                koibnal_close_conn_locked (conn, why);
+                kibnal_close_conn_locked (conn, why);
         }
 
         return (count);
 }
 
 int
-koibnal_close_stale_conns_locked (koib_peer_t *peer, __u64 incarnation)
+kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
 {
-        koib_conn_t        *conn;
+        kib_conn_t         *conn;
         struct list_head   *ctmp;
         struct list_head   *cnxt;
         int                 count = 0;
 
         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
-                conn = list_entry (ctmp, koib_conn_t, ibc_list);
+                conn = list_entry (ctmp, kib_conn_t, ibc_list);
 
                 if (conn->ibc_incarnation == incarnation)
                         continue;
@@ -813,17 +829,17 @@ koibnal_close_stale_conns_locked (koib_peer_t *peer, __u64 incarnation)
                        peer->ibp_nid, conn->ibc_incarnation, incarnation);
                 
                 count++;
-                koibnal_close_conn_locked (conn, -ESTALE);
+                kibnal_close_conn_locked (conn, -ESTALE);
         }
 
         return (count);
 }
 
 int
-koibnal_close_matching_conns (ptl_nid_t nid)
+kibnal_close_matching_conns (ptl_nid_t nid)
 {
         unsigned long       flags;
-        koib_peer_t        *peer;
+        kib_peer_t         *peer;
         struct list_head   *ptmp;
         struct list_head   *pnxt;
         int                 lo;
@@ -831,19 +847,19 @@ koibnal_close_matching_conns (ptl_nid_t nid)
         int                 i;
         int                 count = 0;
 
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
         if (nid != PTL_NID_ANY)
-                lo = hi = koibnal_nid2peerlist(nid) - koibnal_data.koib_peers;
+                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
         else {
                 lo = 0;
-                hi = koibnal_data.koib_peer_hash_size - 1;
+                hi = kibnal_data.kib_peer_hash_size - 1;
         }
 
         for (i = lo; i <= hi; i++) {
-                list_for_each_safe (ptmp, pnxt, &koibnal_data.koib_peers[i]) {
+                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
 
-                        peer = list_entry (ptmp, koib_peer_t, ibp_list);
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
                                  peer->ibp_connecting != 0 ||
                                  !list_empty (&peer->ibp_conns));
@@ -851,11 +867,11 @@ koibnal_close_matching_conns (ptl_nid_t nid)
                         if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
                                 continue;
 
-                        count += koibnal_close_peer_conns_locked (peer, 0);
+                        count += kibnal_close_peer_conns_locked (peer, 0);
                 }
         }
 
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
         /* wildcards always succeed */
         if (nid == PTL_NID_ANY)
@@ -865,7 +881,7 @@ koibnal_close_matching_conns (ptl_nid_t nid)
 }
 
 int
-koibnal_cmd(struct portals_cfg *pcfg, void * private)
+kibnal_cmd(struct portals_cfg *pcfg, void * private)
 {
         int rc = -EINVAL;
 
@@ -876,8 +892,8 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private)
                 ptl_nid_t   nid = 0;
                 int         share_count = 0;
 
-                rc = koibnal_get_peer_info(pcfg->pcfg_count,
-                                           &nid, &share_count);
+                rc = kibnal_get_peer_info(pcfg->pcfg_count,
+                                          &nid, &share_count);
                 pcfg->pcfg_nid   = nid;
                 pcfg->pcfg_size  = 0;
                 pcfg->pcfg_id    = 0;
@@ -887,17 +903,17 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private)
                 break;
         }
         case NAL_CMD_ADD_PEER: {
-                rc = koibnal_add_persistent_peer (pcfg->pcfg_nid);
+                rc = kibnal_add_persistent_peer (pcfg->pcfg_nid);
                 break;
         }
         case NAL_CMD_DEL_PEER: {
-                rc = koibnal_del_peer (pcfg->pcfg_nid, 
-                                       /* flags == single_share */
-                                       pcfg->pcfg_flags != 0);
+                rc = kibnal_del_peer (pcfg->pcfg_nid,
+                                      /* flags == single_share */
+                                      pcfg->pcfg_flags != 0);
                 break;
         }
         case NAL_CMD_GET_CONN: {
-                koib_conn_t *conn = koibnal_get_conn_by_idx (pcfg->pcfg_count);
+                kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
 
                 if (conn == NULL)
                         rc = -ENOENT;
@@ -907,19 +923,19 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private)
                         pcfg->pcfg_id    = 0;
                         pcfg->pcfg_misc  = 0;
                         pcfg->pcfg_flags = 0;
-                        koibnal_put_conn (conn);
+                        kibnal_put_conn (conn);
                 }
                 break;
         }
         case NAL_CMD_CLOSE_CONNECTION: {
-                rc = koibnal_close_matching_conns (pcfg->pcfg_nid);
+                rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
                 break;
         }
         case NAL_CMD_REGISTER_MYNID: {
                 if (pcfg->pcfg_nid == PTL_NID_ANY)
                         rc = -EINVAL;
                 else
-                        rc = koibnal_set_mynid (pcfg->pcfg_nid);
+                        rc = kibnal_set_mynid (pcfg->pcfg_nid);
                 break;
         }
         }
@@ -928,47 +944,47 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private)
 }
 
 void
-koibnal_free_pages (koib_pages_t *p)
+kibnal_free_pages (kib_pages_t *p)
 {
-        int     npages = p->oibp_npages;
+        int     npages = p->ibp_npages;
         int     rc;
         int     i;
         
-        if (p->oibp_mapped) {
-                rc = ib_memory_deregister(p->oibp_handle);
+        if (p->ibp_mapped) {
+                rc = ib_memory_deregister(p->ibp_handle);
                 if (rc != 0)
                         CERROR ("Deregister error: %d\n", rc);
         }
         
         for (i = 0; i < npages; i++)
-                if (p->oibp_pages[i] != NULL)
-                        __free_page(p->oibp_pages[i]);
+                if (p->ibp_pages[i] != NULL)
+                        __free_page(p->ibp_pages[i]);
         
-        PORTAL_FREE (p, offsetof(koib_pages_t, oibp_pages[npages]));
+        PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
 }
 
 int
-koibnal_alloc_pages (koib_pages_t **pp, int npages, int access)
+kibnal_alloc_pages (kib_pages_t **pp, int npages, int access)
 {
-        koib_pages_t               *p;
+        kib_pages_t                *p;
         struct ib_physical_buffer  *phys_pages;
         int                         i;
         int                         rc;
 
-        PORTAL_ALLOC(p, offsetof(koib_pages_t, oibp_pages[npages]));
+        PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
         if (p == NULL) {
                 CERROR ("Can't allocate buffer %d\n", npages);
                 return (-ENOMEM);
         }
 
-        memset (p, 0, offsetof(koib_pages_t, oibp_pages[npages]));
-        p->oibp_npages = npages;
+        memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
+        p->ibp_npages = npages;
         
         for (i = 0; i < npages; i++) {
-                p->oibp_pages[i] = alloc_page (GFP_KERNEL);
-                if (p->oibp_pages[i] == NULL) {
+                p->ibp_pages[i] = alloc_page (GFP_KERNEL);
+                if (p->ibp_pages[i] == NULL) {
                         CERROR ("Can't allocate page %d of %d\n", i, npages);
-                        koibnal_free_pages(p);
+                        kibnal_free_pages(p);
                         return (-ENOMEM);
                 }
         }
@@ -976,96 +992,96 @@ koibnal_alloc_pages (koib_pages_t **pp, int npages, int access)
         PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
         if (phys_pages == NULL) {
                 CERROR ("Can't allocate physarray for %d pages\n", npages);
-                koibnal_free_pages(p);
+                kibnal_free_pages(p);
                 return (-ENOMEM);
         }
 
         for (i = 0; i < npages; i++) {
                 phys_pages[i].size = PAGE_SIZE;
                 phys_pages[i].address =
-                        koibnal_page2phys(p->oibp_pages[i]);
+                        kibnal_page2phys(p->ibp_pages[i]);
         }
 
-        p->oibp_vaddr = 0;
-        rc = ib_memory_register_physical(koibnal_data.koib_pd,
+        p->ibp_vaddr = 0;
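+        /* the registration fills ibp_vaddr in with the region's I/O
+         * virtual address */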
+        rc = ib_memory_register_physical(kibnal_data.kib_pd,
                                          phys_pages, npages,
-                                         &p->oibp_vaddr,
+                                         &p->ibp_vaddr,
                                          npages * PAGE_SIZE, 0,
                                          access,
-                                         &p->oibp_handle,
-                                         &p->oibp_lkey,
-                                         &p->oibp_rkey);
+                                         &p->ibp_handle,
+                                         &p->ibp_lkey,
+                                         &p->ibp_rkey);
         
         PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
         
         if (rc != 0) {
                 CERROR ("Error %d mapping %d pages\n", rc, npages);
-                koibnal_free_pages(p);
+                kibnal_free_pages(p);
                 return (rc);
         }
         
-        p->oibp_mapped = 1;
+        p->ibp_mapped = 1;
         *pp = p;
         return (0);
 }
 
 int
-koibnal_setup_tx_descs (void)
+kibnal_setup_tx_descs (void)
 {
         int           ipage = 0;
         int           page_offset = 0;
         __u64         vaddr;
         __u64         vaddr_base;
         struct page  *page;
-        koib_tx_t    *tx;
+        kib_tx_t     *tx;
         int           i;
         int           rc;
 
         /* pre-mapped messages are not bigger than 1 page */
-        LASSERT (OPENIBNAL_MSG_SIZE <= PAGE_SIZE);
+        LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
 
         /* No fancy arithmetic when we do the buffer calculations */
-        LASSERT (PAGE_SIZE % OPENIBNAL_MSG_SIZE == 0);
+        LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
 
-        rc = koibnal_alloc_pages(&koibnal_data.koib_tx_pages,
-                                 OPENIBNAL_TX_MSG_PAGES, 
-                                 0);            /* local read access only */
+        rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
+                                IBNAL_TX_MSG_PAGES, 
+                                0);            /* local read access only */
         if (rc != 0)
                 return (rc);
 
-        vaddr = vaddr_base = koibnal_data.koib_tx_pages->oibp_vaddr;
+        vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
 
-        for (i = 0; i < OPENIBNAL_TX_MSGS; i++) {
-                page = koibnal_data.koib_tx_pages->oibp_pages[ipage];
-                tx = &koibnal_data.koib_tx_descs[i];
+        for (i = 0; i < IBNAL_TX_MSGS; i++) {
+                page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
+                tx = &kibnal_data.kib_tx_descs[i];
 
                 memset (tx, 0, sizeof(*tx));    /* zero flags etc */
                 
-                tx->tx_msg = (koib_msg_t *)(((char *)page_address(page)) + page_offset);
+                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
                 tx->tx_vaddr = vaddr;
-                tx->tx_isnblk = (i >= OPENIBNAL_NTX);
-                tx->tx_mapped = KOIB_TX_UNMAPPED;
+                tx->tx_isnblk = (i >= IBNAL_NTX);
+                tx->tx_mapped = KIB_TX_UNMAPPED;
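+                /* txs at index >= IBNAL_NTX form the reserve pool for
+                 * callers that cannot block */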
 
                 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", 
                        i, tx, tx->tx_msg, tx->tx_vaddr);
 
                 if (tx->tx_isnblk)
                         list_add (&tx->tx_list, 
-                                  &koibnal_data.koib_idle_nblk_txs);
+                                  &kibnal_data.kib_idle_nblk_txs);
                 else
                         list_add (&tx->tx_list, 
-                                  &koibnal_data.koib_idle_txs);
+                                  &kibnal_data.kib_idle_txs);
 
-                vaddr += OPENIBNAL_MSG_SIZE;
-                LASSERT (vaddr <= vaddr_base + OPENIBNAL_TX_MSG_BYTES);
+                vaddr += IBNAL_MSG_SIZE;
+                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
 
-                page_offset += OPENIBNAL_MSG_SIZE;
+                page_offset += IBNAL_MSG_SIZE;
                 LASSERT (page_offset <= PAGE_SIZE);
 
                 if (page_offset == PAGE_SIZE) {
                         page_offset = 0;
                         ipage++;
-                        LASSERT (ipage <= OPENIBNAL_TX_MSG_PAGES);
+                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
                 }
         }
         
@@ -1073,7 +1089,7 @@ koibnal_setup_tx_descs (void)
 }
 
 void
-koibnal_api_shutdown (nal_t *nal)
+kibnal_api_shutdown (nal_t *nal)
 {
         int   i;
         int   rc;
@@ -1087,119 +1103,113 @@ koibnal_api_shutdown (nal_t *nal)
         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
                atomic_read (&portal_kmemory));
 
-        LASSERT(nal == &koibnal_api);
+        LASSERT(nal == &kibnal_api);
 
-        switch (koibnal_data.koib_init) {
+        switch (kibnal_data.kib_init) {
         default:
-                CERROR ("Unexpected state %d\n", koibnal_data.koib_init);
+                CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
                 LBUG();
 
-        case OPENIBNAL_INIT_ALL:
+        case IBNAL_INIT_ALL:
                 /* stop calls to nal_cmd */
                 libcfs_nal_cmd_unregister(OPENIBNAL);
                 /* No new peers */
 
                 /* resetting my NID to unadvertises me, removes my
                  * listener and nukes all current peers */
-                koibnal_set_mynid (PTL_NID_ANY);
+                kibnal_set_mynid (PTL_NID_ANY);
 
                 /* Wait for all peer state to clean up */
                 i = 2;
-                while (atomic_read (&koibnal_data.koib_npeers) != 0) {
+                while (atomic_read (&kibnal_data.kib_npeers) != 0) {
                         i++;
                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                                "waiting for %d peers to close down\n",
-                               atomic_read (&koibnal_data.koib_npeers));
+                               atomic_read (&kibnal_data.kib_npeers));
                         set_current_state (TASK_INTERRUPTIBLE);
                         schedule_timeout (HZ);
                 }
                 /* fall through */
 
-        case OPENIBNAL_INIT_TX_CQ:
-                rc = ib_cq_destroy (koibnal_data.koib_tx_cq);
-                if (rc != 0)
-                        CERROR ("Destroy tx CQ error: %d\n", rc);
-                /* fall through */
-
-        case OPENIBNAL_INIT_RX_CQ:
-                rc = ib_cq_destroy (koibnal_data.koib_rx_cq);
+        case IBNAL_INIT_CQ:
+                rc = ib_cq_destroy (kibnal_data.kib_cq);
                 if (rc != 0)
-                        CERROR ("Destroy rx CQ error: %d\n", rc);
+                        CERROR ("Destroy CQ error: %d\n", rc);
                 /* fall through */
 
-        case OPENIBNAL_INIT_TXD:
-                koibnal_free_pages (koibnal_data.koib_tx_pages);
+        case IBNAL_INIT_TXD:
+                kibnal_free_pages (kibnal_data.kib_tx_pages);
                 /* fall through */
-#if OPENIBNAL_FMR
-        case OPENIBNAL_INIT_FMR:
-                rc = ib_fmr_pool_destroy (koibnal_data.koib_fmr_pool);
+#if IBNAL_FMR
+        case IBNAL_INIT_FMR:
+                rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
                 if (rc != 0)
                         CERROR ("Destroy FMR pool error: %d\n", rc);
                 /* fall through */
 #endif
-        case OPENIBNAL_INIT_PD:
-                rc = ib_pd_destroy(koibnal_data.koib_pd);
+        case IBNAL_INIT_PD:
+                rc = ib_pd_destroy(kibnal_data.kib_pd);
                 if (rc != 0)
                         CERROR ("Destroy PD error: %d\n", rc);
                 /* fall through */
 
-        case OPENIBNAL_INIT_LIB:
-                lib_fini(&koibnal_lib);
+        case IBNAL_INIT_LIB:
+                lib_fini(&kibnal_lib);
                 /* fall through */
 
-        case OPENIBNAL_INIT_DATA:
+        case IBNAL_INIT_DATA:
                 /* Module refcount only gets to zero when all peers
                  * have been closed so all lists must be empty */
-                LASSERT (atomic_read (&koibnal_data.koib_npeers) == 0);
-                LASSERT (koibnal_data.koib_peers != NULL);
-                for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) {
-                        LASSERT (list_empty (&koibnal_data.koib_peers[i]));
+                LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
+                LASSERT (kibnal_data.kib_peers != NULL);
+                for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+                        LASSERT (list_empty (&kibnal_data.kib_peers[i]));
                 }
-                LASSERT (atomic_read (&koibnal_data.koib_nconns) == 0);
-                LASSERT (list_empty (&koibnal_data.koib_sched_rxq));
-                LASSERT (list_empty (&koibnal_data.koib_sched_txq));
-                LASSERT (list_empty (&koibnal_data.koib_connd_conns));
-                LASSERT (list_empty (&koibnal_data.koib_connd_peers));
+                LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
+                LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
+                LASSERT (list_empty (&kibnal_data.kib_sched_txq));
+                LASSERT (list_empty (&kibnal_data.kib_connd_conns));
+                LASSERT (list_empty (&kibnal_data.kib_connd_peers));
 
                 /* flag threads to terminate; wake and wait for them to die */
-                koibnal_data.koib_shutdown = 1;
-                wake_up_all (&koibnal_data.koib_sched_waitq);
-                wake_up_all (&koibnal_data.koib_connd_waitq);
+                kibnal_data.kib_shutdown = 1;
+                wake_up_all (&kibnal_data.kib_sched_waitq);
+                wake_up_all (&kibnal_data.kib_connd_waitq);
 
                 i = 2;
-                while (atomic_read (&koibnal_data.koib_nthreads) != 0) {
+                while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
                         i++;
                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                                "Waiting for %d threads to terminate\n",
-                               atomic_read (&koibnal_data.koib_nthreads));
+                               atomic_read (&kibnal_data.kib_nthreads));
                         set_current_state (TASK_INTERRUPTIBLE);
                         schedule_timeout (HZ);
                 }
                 /* fall through */
                 
-        case OPENIBNAL_INIT_NOTHING:
+        case IBNAL_INIT_NOTHING:
                 break;
         }
 
-        if (koibnal_data.koib_tx_descs != NULL)
-                PORTAL_FREE (koibnal_data.koib_tx_descs,
-                             OPENIBNAL_TX_MSGS * sizeof(koib_tx_t));
+        if (kibnal_data.kib_tx_descs != NULL)
+                PORTAL_FREE (kibnal_data.kib_tx_descs,
+                             IBNAL_TX_MSGS * sizeof(kib_tx_t));
 
-        if (koibnal_data.koib_peers != NULL)
-                PORTAL_FREE (koibnal_data.koib_peers,
+        if (kibnal_data.kib_peers != NULL)
+                PORTAL_FREE (kibnal_data.kib_peers,
                              sizeof (struct list_head) * 
-                             koibnal_data.koib_peer_hash_size);
+                             kibnal_data.kib_peer_hash_size);
 
         CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
                atomic_read (&portal_kmemory));
         printk(KERN_INFO "Lustre: OpenIB NAL unloaded (final mem %d)\n",
                atomic_read(&portal_kmemory));
 
-        koibnal_data.koib_init = OPENIBNAL_INIT_NOTHING;
+        kibnal_data.kib_init = IBNAL_INIT_NOTHING;
 }
 
 int
-koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
+kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                      ptl_ni_limits_t *requested_limits,
                      ptl_ni_limits_t *actual_limits)
 {
@@ -1208,65 +1218,66 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         int               rc;
         int               i;
 
-        LASSERT (nal == &koibnal_api);
+        LASSERT (nal == &kibnal_api);
 
         if (nal->nal_refct != 0) {
                 if (actual_limits != NULL)
-                        *actual_limits = koibnal_lib.libnal_ni.ni_actual_limits;
+                        *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
                 /* This module got the first ref */
                 PORTAL_MODULE_USE;
                 return (PTL_OK);
         }
 
-        LASSERT (koibnal_data.koib_init == OPENIBNAL_INIT_NOTHING);
+        LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
 
-        memset (&koibnal_data, 0, sizeof (koibnal_data)); /* zero pointers, flags etc */
+        memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */
 
-        init_MUTEX (&koibnal_data.koib_nid_mutex);
-        init_MUTEX_LOCKED (&koibnal_data.koib_nid_signal);
-        koibnal_data.koib_nid = PTL_NID_ANY;
+        init_MUTEX (&kibnal_data.kib_nid_mutex);
+        init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal);
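+        /* kib_nid_signal starts locked so down() blocks until the SM
+         * completion callback up()s it */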
+        kibnal_data.kib_nid = PTL_NID_ANY;
 
-        rwlock_init(&koibnal_data.koib_global_lock);
+        rwlock_init(&kibnal_data.kib_global_lock);
 
-        koibnal_data.koib_peer_hash_size = OPENIBNAL_PEER_HASH_SIZE;
-        PORTAL_ALLOC (koibnal_data.koib_peers,
-                      sizeof (struct list_head) * koibnal_data.koib_peer_hash_size);
-        if (koibnal_data.koib_peers == NULL) {
+        kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
+        PORTAL_ALLOC (kibnal_data.kib_peers,
+                      sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
+        if (kibnal_data.kib_peers == NULL) {
                 goto failed;
         }
-        for (i = 0; i < koibnal_data.koib_peer_hash_size; i++)
-                INIT_LIST_HEAD(&koibnal_data.koib_peers[i]);
-
-        spin_lock_init (&koibnal_data.koib_connd_lock);
-        INIT_LIST_HEAD (&koibnal_data.koib_connd_peers);
-        INIT_LIST_HEAD (&koibnal_data.koib_connd_conns);
-        init_waitqueue_head (&koibnal_data.koib_connd_waitq);
-
-        spin_lock_init (&koibnal_data.koib_sched_lock);
-        INIT_LIST_HEAD (&koibnal_data.koib_sched_txq);
-        INIT_LIST_HEAD (&koibnal_data.koib_sched_rxq);
-        init_waitqueue_head (&koibnal_data.koib_sched_waitq);
-
-        spin_lock_init (&koibnal_data.koib_tx_lock);
-        INIT_LIST_HEAD (&koibnal_data.koib_idle_txs);
-        INIT_LIST_HEAD (&koibnal_data.koib_idle_nblk_txs);
-        init_waitqueue_head(&koibnal_data.koib_idle_tx_waitq);
-
-        PORTAL_ALLOC (koibnal_data.koib_tx_descs,
-                      OPENIBNAL_TX_MSGS * sizeof(koib_tx_t));
-        if (koibnal_data.koib_tx_descs == NULL) {
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
+                INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
+
+        spin_lock_init (&kibnal_data.kib_connd_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
+        INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
+        init_waitqueue_head (&kibnal_data.kib_connd_waitq);
+
+        spin_lock_init (&kibnal_data.kib_sched_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
+        INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
+        init_waitqueue_head (&kibnal_data.kib_sched_waitq);
+
+        spin_lock_init (&kibnal_data.kib_tx_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
+        INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
+        init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
+
+        PORTAL_ALLOC (kibnal_data.kib_tx_descs,
+                      IBNAL_TX_MSGS * sizeof(kib_tx_t));
+        if (kibnal_data.kib_tx_descs == NULL) {
                 CERROR ("Can't allocate tx descs\n");
                 goto failed;
         }
 
         /* lists/ptrs/locks initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_DATA;
+        kibnal_data.kib_init = IBNAL_INIT_DATA;
         /*****************************************************/
 
+
         process_id.pid = requested_pid;
-        process_id.nid = koibnal_data.koib_nid;
+        process_id.nid = kibnal_data.kib_nid;
         
-        rc = lib_init(&koibnal_lib, nal, process_id,
+        rc = lib_init(&kibnal_lib, nal, process_id,
                       requested_limits, actual_limits);
         if (rc != PTL_OK) {
                 CERROR("lib_init failed: error %d\n", rc);
@@ -1274,11 +1285,11 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         }
 
         /* lib interface initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_LIB;
+        kibnal_data.kib_init = IBNAL_INIT_LIB;
         /*****************************************************/
 
-        for (i = 0; i < OPENIBNAL_N_SCHED; i++) {
-                rc = koibnal_thread_start (koibnal_scheduler, (void *)i);
+        for (i = 0; i < IBNAL_N_SCHED; i++) {
+                rc = kibnal_thread_start (kibnal_scheduler, (void *)i);
                 if (rc != 0) {
                         CERROR("Can't spawn openibnal scheduler[%d]: %d\n",
                                i, rc);
@@ -1286,56 +1297,56 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                 }
         }
 
-        rc = koibnal_thread_start (koibnal_connd, NULL);
+        rc = kibnal_thread_start (kibnal_connd, NULL);
         if (rc != 0) {
                 CERROR ("Can't spawn openibnal connd: %d\n", rc);
                 goto failed;
         }
 
-        koibnal_data.koib_device = ib_device_get_by_index(0);
-        if (koibnal_data.koib_device == NULL) {
+        kibnal_data.kib_device = ib_device_get_by_index(0);
+        if (kibnal_data.kib_device == NULL) {
                 CERROR ("Can't open ib device 0\n");
                 goto failed;
         }
         
-        rc = ib_device_properties_get(koibnal_data.koib_device,
-                                      &koibnal_data.koib_device_props);
+        rc = ib_device_properties_get(kibnal_data.kib_device,
+                                      &kibnal_data.kib_device_props);
         if (rc != 0) {
                 CERROR ("Can't get device props: %d\n", rc);
                 goto failed;
         }
 
         CDEBUG(D_NET, "Max Initiator: %d Max Responder %d\n", 
-               koibnal_data.koib_device_props.max_initiator_per_qp,
-               koibnal_data.koib_device_props.max_responder_per_qp);
+               kibnal_data.kib_device_props.max_initiator_per_qp,
+               kibnal_data.kib_device_props.max_responder_per_qp);
 
-        koibnal_data.koib_port = 0;
+        kibnal_data.kib_port = 0;
         for (i = 1; i <= 2; i++) {
-                rc = ib_port_properties_get(koibnal_data.koib_device, i,
-                                            &koibnal_data.koib_port_props);
+                rc = ib_port_properties_get(kibnal_data.kib_device, i,
+                                            &kibnal_data.kib_port_props);
                 if (rc == 0) {
-                        koibnal_data.koib_port = i;
+                        kibnal_data.kib_port = i;
                         break;
                 }
         }
-        if (koibnal_data.koib_port == 0) {
+        if (kibnal_data.kib_port == 0) {
                 CERROR ("Can't find a port\n");
                 goto failed;
         }
 
-        rc = ib_pd_create(koibnal_data.koib_device,
-                          NULL, &koibnal_data.koib_pd);
+        rc = ib_pd_create(kibnal_data.kib_device,
+                          NULL, &kibnal_data.kib_pd);
         if (rc != 0) {
                 CERROR ("Can't create PD: %d\n", rc);
                 goto failed;
         }
         
         /* flag PD initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_PD;
+        kibnal_data.kib_init = IBNAL_INIT_PD;
         /*****************************************************/
-#if OPENIBNAL_FMR
+#if IBNAL_FMR
         {
-                const int pool_size = OPENIBNAL_NTX + OPENIBNAL_NTX_NBLK;
+                const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
                 struct ib_fmr_pool_param params = {
                         .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
                         .access            = (IB_ACCESS_LOCAL_WRITE |
@@ -1347,8 +1358,8 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                         .flush_arg         = NULL,
                         .cache             = 1,
                 };
-                rc = ib_fmr_pool_create(koibnal_data.koib_pd, &params,
-                                        &koibnal_data.koib_fmr_pool);
+                rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params,
+                                        &kibnal_data.kib_fmr_pool);
                 if (rc != 0) {
                         CERROR ("Can't create FMR pool size %d: %d\n", 
                                 pool_size, rc);
@@ -1357,84 +1368,56 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         }
 
         /* flag FMR pool initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_FMR;
+        kibnal_data.kib_init = IBNAL_INIT_FMR;
 #endif
         /*****************************************************/
 
-        rc = koibnal_setup_tx_descs();
+        rc = kibnal_setup_tx_descs();
         if (rc != 0) {
                 CERROR ("Can't register tx descs: %d\n", rc);
                 goto failed;
         }
         
         /* flag TX descs initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_TXD;
+        kibnal_data.kib_init = IBNAL_INIT_TXD;
         /*****************************************************/
         
         {
                 struct ib_cq_callback callback = {
-                        .context        = OPENIBNAL_CALLBACK_CTXT,
+                        .context        = IBNAL_CALLBACK_CTXT,
                         .policy         = IB_CQ_PROVIDER_REARM,
                         .function       = {
-                                .entry  = koibnal_rx_callback,
+                                .entry  = kibnal_callback,
                         },
                         .arg            = NULL,
                 };
-                int  nentries = OPENIBNAL_RX_CQ_ENTRIES;
+                int  nentries = IBNAL_CQ_ENTRIES;
                 
-                rc = ib_cq_create (koibnal_data.koib_device, 
+                rc = ib_cq_create (kibnal_data.kib_device, 
                                    &nentries, &callback, NULL,
-                                   &koibnal_data.koib_rx_cq);
+                                   &kibnal_data.kib_cq);
                 if (rc != 0) {
-                        CERROR ("Can't create RX CQ: %d\n", rc);
+                        CERROR ("Can't create CQ: %d\n", rc);
                         goto failed;
                 }
 
                 /* I only want solicited events */
-                rc = ib_cq_request_notification(koibnal_data.koib_rx_cq, 1);
+                rc = ib_cq_request_notification(kibnal_data.kib_cq, 1);
                 LASSERT (rc == 0);
         }
         
-        /* flag RX CQ initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_RX_CQ;
-        /*****************************************************/
-
-        {
-                struct ib_cq_callback callback = {
-                        .context        = OPENIBNAL_CALLBACK_CTXT,
-                        .policy         = IB_CQ_PROVIDER_REARM,
-                        .function       = {
-                                .entry  = koibnal_tx_callback,
-                        },
-                        .arg            = NULL,
-                };
-                int  nentries = OPENIBNAL_TX_CQ_ENTRIES;
-                
-                rc = ib_cq_create (koibnal_data.koib_device, 
-                                   &nentries, &callback, NULL,
-                                   &koibnal_data.koib_tx_cq);
-                if (rc != 0) {
-                        CERROR ("Can't create RX CQ: %d\n", rc);
-                        goto failed;
-                }
-
-                /* I only want solicited events */
-                rc = ib_cq_request_notification(koibnal_data.koib_tx_cq, 1);
-                LASSERT (rc == 0);
-        }
-                                   
-        /* flag TX CQ initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_TX_CQ;
+        /* flag CQ initialised */
+        kibnal_data.kib_init = IBNAL_INIT_CQ;
         /*****************************************************/
         
-        rc = libcfs_nal_cmd_register(OPENIBNAL, &koibnal_cmd, NULL);
+        rc = libcfs_nal_cmd_register(OPENIBNAL, &kibnal_cmd, NULL);
         if (rc != 0) {
                 CERROR ("Can't initialise command interface (rc = %d)\n", rc);
                 goto failed;
         }
 
         /* flag everything initialised */
-        koibnal_data.koib_init = OPENIBNAL_INIT_ALL;
+        kibnal_data.kib_init = IBNAL_INIT_ALL;
         /*****************************************************/
 
         printk(KERN_INFO "Lustre: OpenIB NAL loaded "
@@ -1443,44 +1426,44 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         return (PTL_OK);
 
  failed:
-        koibnal_api_shutdown (&koibnal_api);    
+        kibnal_api_shutdown (&kibnal_api);    
         return (PTL_FAIL);
 }
 
 void __exit
-koibnal_module_fini (void)
+kibnal_module_fini (void)
 {
 #ifdef CONFIG_SYSCTL
-        if (koibnal_tunables.koib_sysctl != NULL)
-                unregister_sysctl_table (koibnal_tunables.koib_sysctl);
+        if (kibnal_tunables.kib_sysctl != NULL)
+                unregister_sysctl_table (kibnal_tunables.kib_sysctl);
 #endif
-        PtlNIFini(koibnal_ni);
+        PtlNIFini(kibnal_ni);
 
         ptl_unregister_nal(OPENIBNAL);
 }
 
 int __init
-koibnal_module_init (void)
+kibnal_module_init (void)
 {
         int    rc;
 
         /* the following must be sizeof(int) for proc_dointvec() */
-        LASSERT(sizeof (koibnal_tunables.koib_io_timeout) == sizeof (int));
+        LASSERT(sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int));
 
-        koibnal_api.nal_ni_init = koibnal_api_startup;
-        koibnal_api.nal_ni_fini = koibnal_api_shutdown;
+        kibnal_api.nal_ni_init = kibnal_api_startup;
+        kibnal_api.nal_ni_fini = kibnal_api_shutdown;
 
         /* Initialise dynamic tunables to defaults once only */
-        koibnal_tunables.koib_io_timeout = OPENIBNAL_IO_TIMEOUT;
+        kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
 
-        rc = ptl_register_nal(OPENIBNAL, &koibnal_api);
+        rc = ptl_register_nal(OPENIBNAL, &kibnal_api);
         if (rc != PTL_OK) {
-                CERROR("Can't register OPENIBNAL: %d\n", rc);
+                CERROR("Can't register IBNAL: %d\n", rc);
                 return (-ENOMEM);               /* or something... */
         }
 
         /* Pure gateways want the NAL started up at module load time... */
-        rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &koibnal_ni);
+        rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
         if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
                 ptl_unregister_nal(OPENIBNAL);
                 return (-ENODEV);
@@ -1488,8 +1471,8 @@ koibnal_module_init (void)
         
 #ifdef CONFIG_SYSCTL
         /* Press on regardless even if registering sysctl doesn't work */
-        koibnal_tunables.koib_sysctl = 
-                register_sysctl_table (koibnal_top_ctl_table, 0);
+        kibnal_tunables.kib_sysctl = 
+                register_sysctl_table (kibnal_top_ctl_table, 0);
 #endif
         return (0);
 }
@@ -1498,6 +1481,6 @@ MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
 MODULE_DESCRIPTION("Kernel OpenIB NAL v0.01");
 MODULE_LICENSE("GPL");
 
-module_init(koibnal_module_init);
-module_exit(koibnal_module_fini);
+module_init(kibnal_module_init);
+module_exit(kibnal_module_fini);
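
The startup path above ratchets kibnal_data.kib_init through the IBNAL_INIT_* levels as each resource comes up, so the single failed: label can call kibnal_api_shutdown() and have it tear down exactly what exists. A minimal standalone sketch of that staged-unwind pattern, assuming illustrative level names and printf stand-ins for the real teardown calls:

#include <stdio.h>

/* Illustrative init levels mirroring the IBNAL_INIT_* ladder; names and
 * teardown actions here are stand-ins, not the driver's API. */
enum kib_init_state {
        INIT_NOTHING,
        INIT_DATA,
        INIT_LIB,
        INIT_PD,
        INIT_ALL
};

static enum kib_init_state init_state = INIT_NOTHING;

/* Teardown switches on the high-water mark and falls through, so a
 * failure at any startup stage unwinds only what was actually set up. */
static void api_shutdown(void)
{
        switch (init_state) {
        case INIT_ALL:
                printf("unregister command interface\n");
                /* fall through */
        case INIT_PD:
                printf("destroy protection domain\n");
                /* fall through */
        case INIT_LIB:
                printf("shut down lib interface\n");
                /* fall through */
        case INIT_DATA:
                printf("free peer table and tx descriptors\n");
                /* fall through */
        case INIT_NOTHING:
                break;
        }
        init_state = INIT_NOTHING;
}

int main(void)
{
        init_state = INIT_DATA;
        init_state = INIT_LIB;  /* suppose ib_pd_create() failed here */
        api_shutdown();         /* unwinds LIB then DATA, nothing more */
        return 0;
}

Each case deliberately falls through to the one below it, which is why the startup code only advances kib_init after the corresponding resource is fully initialised.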
 
index 301d3ae..f0610f2 100644
@@ -48,7 +48,7 @@
 #include <linux/kmod.h>
 #include <linux/sysctl.h>
 
-#define DEBUG_SUBSYSTEM S_OPENIBNAL
+#define DEBUG_SUBSYSTEM S_IBNAL
 
 #include <linux/kp30.h>
 #include <portals/p30.h>
 #include <ts_ib_cm.h>
 #include <ts_ib_sa_client.h>
 
-#define OPENIBNAL_SERVICE_NAME   "openibnal"
+#define IBNAL_SERVICE_NAME   "openibnal"
 
 #if CONFIG_SMP
-# define OPENIBNAL_N_SCHED      num_online_cpus() /* # schedulers */
+# define IBNAL_N_SCHED      num_online_cpus()   /* # schedulers */
 #else
-# define OPENIBNAL_N_SCHED      1                 /* # schedulers */
+# define IBNAL_N_SCHED      1                   /* # schedulers */
 #endif
 
-#define OPENIBNAL_MIN_RECONNECT_INTERVAL HZ       /* first failed connection retry... */
-#define OPENIBNAL_MAX_RECONNECT_INTERVAL (60*HZ)  /* ...exponentially increasing to this */
+#define IBNAL_MIN_RECONNECT_INTERVAL HZ         /* first failed connection retry... */
+#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ)    /* ...exponentially increasing to this */
 
-#define OPENIBNAL_MSG_SIZE       (4<<10)          /* max size of queued messages (inc hdr) */
+#define IBNAL_MSG_SIZE       (4<<10)            /* max size of queued messages (inc hdr) */
 
-#define OPENIBNAL_MSG_QUEUE_SIZE   8              /* # messages in-flight */
-#define OPENIBNAL_CREDIT_HIGHWATER 6              /* when to eagerly return credits */
-#define OPENIBNAL_RETRY            7              /* # times to retry */
-#define OPENIBNAL_RNR_RETRY        7              /*  */
-#define OPENIBNAL_CM_RETRY         7              /* # times to retry connection */
-#define OPENIBNAL_FLOW_CONTROL     1
-#define OPENIBNAL_RESPONDER_RESOURCES 8
+#define IBNAL_MSG_QUEUE_SIZE   8                /* # messages/RDMAs in-flight */
+#define IBNAL_CREDIT_HIGHWATER 6                /* when to eagerly return credits */
+#define IBNAL_RETRY            7                /* # times to retry */
+#define IBNAL_RNR_RETRY        7                /*  */
+#define IBNAL_CM_RETRY         7                /* # times to retry connection */
+#define IBNAL_FLOW_CONTROL     1
+#define IBNAL_RESPONDER_RESOURCES 8
 
-#define OPENIBNAL_NTX             64              /* # tx descs */
-#define OPENIBNAL_NTX_NBLK        256             /* # reserved tx descs */
+#define IBNAL_NTX             64                /* # tx descs */
+#define IBNAL_NTX_NBLK        256               /* # reserved tx descs */
 
-#define OPENIBNAL_PEER_HASH_SIZE  101             /* # peer lists */
+#define IBNAL_PEER_HASH_SIZE  101               /* # peer lists */
 
-#define OPENIBNAL_RESCHED         100             /* # scheduler loops before reschedule */
+#define IBNAL_RESCHED         100               /* # scheduler loops before reschedule */
 
-#define OPENIBNAL_CONCURRENT_PEERS 1000           /* # nodes all talking at once to me */
+#define IBNAL_CONCURRENT_PEERS 1000             /* # nodes all talking at once to me */
 
 /* default vals for runtime tunables */
-#define OPENIBNAL_IO_TIMEOUT      50              /* default comms timeout (seconds) */
+#define IBNAL_IO_TIMEOUT      50                /* default comms timeout (seconds) */
 
 /************************/
 /* derived constants... */
 
 /* TX messages (shared by all connections) */
-#define OPENIBNAL_TX_MSGS       (OPENIBNAL_NTX + OPENIBNAL_NTX_NBLK)
-#define OPENIBNAL_TX_MSG_BYTES  (OPENIBNAL_TX_MSGS * OPENIBNAL_MSG_SIZE)
-#define OPENIBNAL_TX_MSG_PAGES  ((OPENIBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
-
-/* we may have up to 2 completions per transmit */
-#define OPENIBNAL_TX_CQ_ENTRIES  (2*OPENIBNAL_TX_MSGS)
+#define IBNAL_TX_MSGS       (IBNAL_NTX + IBNAL_NTX_NBLK)
+#define IBNAL_TX_MSG_BYTES  (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_TX_MSG_PAGES  ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
 
 /* RX messages (per connection) */
-#define OPENIBNAL_RX_MSGS       OPENIBNAL_MSG_QUEUE_SIZE
-#define OPENIBNAL_RX_MSG_BYTES  (OPENIBNAL_RX_MSGS * OPENIBNAL_MSG_SIZE)
-#define OPENIBNAL_RX_MSG_PAGES  ((OPENIBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+#define IBNAL_RX_MSGS       IBNAL_MSG_QUEUE_SIZE
+#define IBNAL_RX_MSG_BYTES  (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_RX_MSG_PAGES  ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
 
-/* 1 completion per receive, per connection */
-#define OPENIBNAL_RX_CQ_ENTRIES (OPENIBNAL_RX_MSGS * OPENIBNAL_CONCURRENT_PEERS)
+/* we may have up to 2 completions per transmit +
+   1 completion per receive, per connection */
+#define IBNAL_CQ_ENTRIES  ((2*IBNAL_TX_MSGS) +                          \
+                           (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS))
 
-#define OPENIBNAL_RDMA_BASE  0x0eeb0000
-#define OPENIBNAL_FMR        1
-#define OPENIBNAL_CKSUM      0
-//#define OPENIBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_PROCESS
-#define OPENIBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_INTERRUPT
+#define IBNAL_RDMA_BASE  0x0eeb0000
+#define IBNAL_FMR        1
+#define IBNAL_CKSUM      0
+//#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_PROCESS
+#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_INTERRUPT
 
 typedef struct 
 {
-        int               koib_io_timeout;      /* comms timeout (seconds) */
-        struct ctl_table_header *koib_sysctl;   /* sysctl interface */
-} koib_tunables_t;
+        int               kib_io_timeout;       /* comms timeout (seconds) */
+        struct ctl_table_header *kib_sysctl;    /* sysctl interface */
+} kib_tunables_t;
 
 typedef struct
 {
-        int               oibp_npages;          /* # pages */
-        int               oibp_mapped;          /* mapped? */
-        __u64             oibp_vaddr;           /* mapped region vaddr */
-        __u32             oibp_lkey;            /* mapped region lkey */
-        __u32             oibp_rkey;            /* mapped region rkey */
-        struct ib_mr     *oibp_handle;          /* mapped region handle */
-        struct page      *oibp_pages[0];
-} koib_pages_t;
+        int               ibp_npages;           /* # pages */
+        int               ibp_mapped;           /* mapped? */
+        __u64             ibp_vaddr;            /* mapped region vaddr */
+        __u32             ibp_lkey;             /* mapped region lkey */
+        __u32             ibp_rkey;             /* mapped region rkey */
+        struct ib_mr     *ibp_handle;           /* mapped region handle */
+        struct page      *ibp_pages[0];
+} kib_pages_t;
         
 typedef struct 
 {
-        int               koib_init;            /* initialisation state */
-        __u64             koib_incarnation;     /* which one am I */
-        int               koib_shutdown;        /* shut down? */
-        atomic_t          koib_nthreads;        /* # live threads */
-
-        __u64             koib_cm_service_id;   /* service number I listen on */
-        ptl_nid_t         koib_nid;             /* my NID */
-        struct semaphore  koib_nid_mutex;       /* serialise NID ops */
-        struct semaphore  koib_nid_signal;      /* signal completion */
-
-        rwlock_t          koib_global_lock;     /* stabilize peer/conn ops */
-
-        struct list_head *koib_peers;           /* hash table of all my known peers */
-        int               koib_peer_hash_size;  /* size of koib_peers */
-        atomic_t          koib_npeers;          /* # peers extant */
-        atomic_t          koib_nconns;          /* # connections extant */
-
-        struct list_head  koib_connd_conns;     /* connections to progress */
-        struct list_head  koib_connd_peers;     /* peers waiting for a connection */
-        wait_queue_head_t koib_connd_waitq;     /* connection daemons sleep here */
-        unsigned long     koib_connd_waketime;  /* when connd will wake */
-        spinlock_t        koib_connd_lock;      /* serialise */
-
-        wait_queue_head_t koib_sched_waitq;     /* schedulers sleep here */
-        struct list_head  koib_sched_txq;       /* tx requiring attention */
-        struct list_head  koib_sched_rxq;       /* rx requiring attention */
-        spinlock_t        koib_sched_lock;      /* serialise */
+        int               kib_init;             /* initialisation state */
+        __u64             kib_incarnation;      /* which one am I */
+        int               kib_shutdown;         /* shut down? */
+        atomic_t          kib_nthreads;         /* # live threads */
+
+        __u64             kib_service_id;       /* service number I listen on */
+        ptl_nid_t         kib_nid;              /* my NID */
+        struct semaphore  kib_nid_mutex;        /* serialise NID ops */
+        struct semaphore  kib_nid_signal;       /* signal completion */
+
+        rwlock_t          kib_global_lock;      /* stabilize peer/conn ops */
+
+        struct list_head *kib_peers;            /* hash table of all my known peers */
+        int               kib_peer_hash_size;   /* size of kib_peers */
+        atomic_t          kib_npeers;           /* # peers extant */
+        atomic_t          kib_nconns;           /* # connections extant */
+
+        struct list_head  kib_connd_conns;      /* connections to progress */
+        struct list_head  kib_connd_peers;      /* peers waiting for a connection */
+        wait_queue_head_t kib_connd_waitq;      /* connection daemons sleep here */
+        unsigned long     kib_connd_waketime;   /* when connd will wake */
+        spinlock_t        kib_connd_lock;       /* serialise */
+
+        wait_queue_head_t kib_sched_waitq;      /* schedulers sleep here */
+        struct list_head  kib_sched_txq;        /* tx requiring attention */
+        struct list_head  kib_sched_rxq;        /* rx requiring attention */
+        spinlock_t        kib_sched_lock;       /* serialise */
         
-        struct koib_tx   *koib_tx_descs;        /* all the tx descriptors */
-        koib_pages_t     *koib_tx_pages;        /* premapped tx msg pages */
-
-        struct list_head  koib_idle_txs;        /* idle tx descriptors */
-        struct list_head  koib_idle_nblk_txs;   /* idle reserved tx descriptors */
-        wait_queue_head_t koib_idle_tx_waitq;   /* block here for tx descriptor */
-        __u64             koib_next_tx_cookie;  /* RDMA completion cookie */
-        spinlock_t        koib_tx_lock;         /* serialise */
+        struct kib_tx    *kib_tx_descs;         /* all the tx descriptors */
+        kib_pages_t      *kib_tx_pages;         /* premapped tx msg pages */
+
+        struct list_head  kib_idle_txs;         /* idle tx descriptors */
+        struct list_head  kib_idle_nblk_txs;    /* idle reserved tx descriptors */
+        wait_queue_head_t kib_idle_tx_waitq;    /* block here for tx descriptor */
+        __u64             kib_next_tx_cookie;   /* RDMA completion cookie */
+        spinlock_t        kib_tx_lock;          /* serialise */
         
-        struct ib_device *koib_device;          /* "the" device */
-        struct ib_device_properties koib_device_props; /* its properties */
-        int               koib_port;            /* port on the device */
-        struct ib_port_properties koib_port_props; /* its properties */
-        struct ib_pd     *koib_pd;              /* protection domain */
-#if OPENIBNAL_FMR
-        struct ib_fmr_pool *koib_fmr_pool;      /* fast memory region pool */
+        struct ib_device *kib_device;           /* "the" device */
+        struct ib_device_properties kib_device_props; /* its properties */
+        int               kib_port;             /* port on the device */
+        struct ib_port_properties kib_port_props; /* its properties */
+        struct ib_pd     *kib_pd;               /* protection domain */
+#if IBNAL_FMR
+        struct ib_fmr_pool *kib_fmr_pool;       /* fast memory region pool */
 #endif
-        struct ib_cq     *koib_rx_cq;           /* receive completion queue */
-        struct ib_cq     *koib_tx_cq;           /* transmit completion queue */
-        void             *koib_listen_handle;   /* where I listen for connections */
-        struct ib_common_attrib_service koib_service; /* SM service */
+        struct ib_cq     *kib_cq;               /* completion queue */
+        void             *kib_listen_handle;    /* where I listen for connections */
         
-} koib_data_t;
-
-#define OPENIBNAL_INIT_NOTHING         0
-#define OPENIBNAL_INIT_DATA            1
-#define OPENIBNAL_INIT_LIB             2
-#define OPENIBNAL_INIT_PD              3
-#define OPENIBNAL_INIT_FMR             4
-#define OPENIBNAL_INIT_TXD             5
-#define OPENIBNAL_INIT_RX_CQ           6
-#define OPENIBNAL_INIT_TX_CQ           7
-#define OPENIBNAL_INIT_ALL             8
+} kib_data_t;
+
+#define IBNAL_INIT_NOTHING         0
+#define IBNAL_INIT_DATA            1
+#define IBNAL_INIT_LIB             2
+#define IBNAL_INIT_PD              3
+#define IBNAL_INIT_FMR             4
+#define IBNAL_INIT_TXD             5
+#define IBNAL_INIT_CQ              6
+#define IBNAL_INIT_ALL             7
 
 /************************************************************************
  * Wire message structs.
@@ -214,125 +210,125 @@ typedef struct
         __u32             md_lkey;
         __u32             md_rkey;
         __u64             md_addr;
-} koib_md_t;
+} kib_md_t;
 
 typedef struct
 {
         __u32                 rd_key;           /* remote key */
         __u32                 rd_nob;           /* # of bytes */
         __u64                 rd_addr;          /* remote io vaddr */
-} koib_rdma_desc_t;
+} kib_rdma_desc_t;
 
 
 typedef struct
 {
-        ptl_hdr_t         oibim_hdr;            /* portals header */
-        char              oibim_payload[0];     /* piggy-backed payload */
-} koib_immediate_msg_t;
+        ptl_hdr_t         ibim_hdr;             /* portals header */
+        char              ibim_payload[0];      /* piggy-backed payload */
+} kib_immediate_msg_t;
 
 typedef struct
 {
-        ptl_hdr_t         oibrm_hdr;            /* portals header */
-        __u64             oibrm_cookie;         /* opaque completion cookie */
-        koib_rdma_desc_t  oibrm_desc;           /* where to suck/blow */
-} koib_rdma_msg_t;
+        ptl_hdr_t         ibrm_hdr;             /* portals header */
+        __u64             ibrm_cookie;          /* opaque completion cookie */
+        kib_rdma_desc_t   ibrm_desc;            /* where to suck/blow */
+} kib_rdma_msg_t;
 
 typedef struct
 {
-        __u64             oibcm_cookie;         /* opaque completion cookie */
-        __u32             oibcm_status;         /* completion status */
-} koib_completion_msg_t;
+        __u64             ibcm_cookie;          /* opaque completion cookie */
+        __u32             ibcm_status;          /* completion status */
+} kib_completion_msg_t;
 
 typedef struct
 {
-        __u32              oibm_magic;          /* I'm an openibnal message */
-        __u16              oibm_version;        /* this is my version number */
-        __u8               oibm_type;           /* msg type */
-        __u8               oibm_credits;        /* returned credits */
-#if OPENIBNAL_CKSUM
-        __u32              oibm_nob;
-        __u32              oibm_cksum;
+        __u32              ibm_magic;           /* I'm an openibnal message */
+        __u16              ibm_version;         /* this is my version number */
+        __u8               ibm_type;            /* msg type */
+        __u8               ibm_credits;         /* returned credits */
+#if IBNAL_CKSUM
+        __u32              ibm_nob;
+        __u32              ibm_cksum;
 #endif
         union {
-                koib_immediate_msg_t   immediate;
-                koib_rdma_msg_t        rdma;
-                koib_completion_msg_t  completion;
-        }                    oibm_u;
-} koib_msg_t;
-
-#define OPENIBNAL_MSG_MAGIC       0x0be91b91    /* unique magic */
-#define OPENIBNAL_MSG_VERSION              1    /* current protocol version */
-
-#define OPENIBNAL_MSG_NOOP              0xd0    /* nothing (just credits) */
-#define OPENIBNAL_MSG_IMMEDIATE         0xd1    /* portals hdr + payload */
-#define OPENIBNAL_MSG_PUT_RDMA          0xd2    /* portals PUT hdr + source rdma desc */
-#define OPENIBNAL_MSG_PUT_DONE          0xd3    /* signal PUT rdma completion */
-#define OPENIBNAL_MSG_GET_RDMA          0xd4    /* portals GET hdr + sink rdma desc */
-#define OPENIBNAL_MSG_GET_DONE          0xd5    /* signal GET rdma completion */
+                kib_immediate_msg_t   immediate;
+                kib_rdma_msg_t        rdma;
+                kib_completion_msg_t  completion;
+        }                    ibm_u;
+} kib_msg_t;
+
+#define IBNAL_MSG_MAGIC       0x0be91b91        /* unique magic */
+#define IBNAL_MSG_VERSION              1        /* current protocol version */
+
+#define IBNAL_MSG_NOOP              0xd0        /* nothing (just credits) */
+#define IBNAL_MSG_IMMEDIATE         0xd1        /* portals hdr + payload */
+#define IBNAL_MSG_PUT_RDMA          0xd2        /* portals PUT hdr + source rdma desc */
+#define IBNAL_MSG_PUT_DONE          0xd3        /* signal PUT rdma completion */
+#define IBNAL_MSG_GET_RDMA          0xd4        /* portals GET hdr + sink rdma desc */
+#define IBNAL_MSG_GET_DONE          0xd5        /* signal GET rdma completion */
 
 /***********************************************************************/
 
-typedef struct koib_rx                          /* receive message */
+typedef struct kib_rx                           /* receive message */
 {
         struct list_head          rx_list;      /* queue for attention */
-        struct koib_conn         *rx_conn;      /* owning conn */
+        struct kib_conn          *rx_conn;      /* owning conn */
         int                       rx_rdma;      /* RDMA completion posted? */
         int                       rx_posted;    /* posted? */
         __u64                     rx_vaddr;     /* pre-mapped buffer (hca vaddr) */
-        koib_msg_t               *rx_msg;       /* pre-mapped buffer (host vaddr) */
+        kib_msg_t                *rx_msg;       /* pre-mapped buffer (host vaddr) */
         struct ib_receive_param   rx_sp;        /* receive work item */
         struct ib_gather_scatter  rx_gl;        /* and its memory */
-} koib_rx_t;
+} kib_rx_t;
 
-typedef struct koib_tx                          /* transmit message */
+typedef struct kib_tx                           /* transmit message */
 {
         struct list_head          tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
         int                       tx_isnblk;    /* I'm reserved for non-blocking sends */
-        struct koib_conn         *tx_conn;      /* owning conn */
+        struct kib_conn          *tx_conn;      /* owning conn */
         int                       tx_mapped;    /* mapped for RDMA? */
         int                       tx_sending;   /* # tx callbacks outstanding */
         int                       tx_status;    /* completion status */
-        int                       tx_passive_rdma; /* waiting for peer to RDMA? */
-        int                       tx_passive_rdma_wait; /* on ibc_rdma_queue */
-        unsigned long             tx_passive_rdma_deadline; /* completion deadline */
+        unsigned long             tx_deadline;  /* completion deadline */
+        int                       tx_passive_rdma; /* peer sucks/blows */
+        int                       tx_passive_rdma_wait; /* waiting for peer to complete */
         __u64                     tx_passive_rdma_cookie; /* completion cookie */
         lib_msg_t                *tx_libmsg[2]; /* lib msgs to finalize on completion */
-        koib_md_t                 tx_md;        /* RDMA mapping (active/passive) */
+        kib_md_t                  tx_md;        /* RDMA mapping (active/passive) */
         __u64                     tx_vaddr;     /* pre-mapped buffer (hca vaddr) */
-        koib_msg_t               *tx_msg;       /* pre-mapped buffer (host vaddr) */
+        kib_msg_t                *tx_msg;       /* pre-mapped buffer (host vaddr) */
         int                       tx_nsp;       /* # send work items */
         struct ib_send_param      tx_sp[2];     /* send work items... */
         struct ib_gather_scatter  tx_gl[2];     /* ...and their memory */
-} koib_tx_t;
+} kib_tx_t;
 
-#define KOIB_TX_UNMAPPED       0
-#define KOIB_TX_MAPPED         1
-#define KOIB_TX_MAPPED_FMR     2
+#define KIB_TX_UNMAPPED       0
+#define KIB_TX_MAPPED         1
+#define KIB_TX_MAPPED_FMR     2
 
-typedef struct koib_wire_connreq
+typedef struct kib_wire_connreq
 {
         __u32        wcr_magic;                 /* I'm an openibnal connreq */
         __u16        wcr_version;               /* this is my version number */
         __u16        wcr_queue_depth;           /* this is my receive queue size */
         __u64        wcr_nid;                   /* peer's NID */
         __u64        wcr_incarnation;           /* peer's incarnation */
-} koib_wire_connreq_t;
+} kib_wire_connreq_t;
 
-typedef struct koib_connreq
+typedef struct kib_connreq
 {
         /* connection-in-progress */
-        struct koib_conn                   *cr_conn;
-        koib_wire_connreq_t                 cr_wcr;
+        struct kib_conn                    *cr_conn;
+        kib_wire_connreq_t                  cr_wcr;
         __u64                               cr_tid;
         struct ib_common_attrib_service     cr_service;
         tTS_IB_GID                          cr_gid;
         struct ib_path_record               cr_path;
         struct ib_cm_active_param           cr_connparam;
-} koib_connreq_t;
+} kib_connreq_t;
 
-typedef struct koib_conn
+typedef struct kib_conn
 { 
-        struct koib_peer   *ibc_peer;           /* owning peer */
+        struct kib_peer    *ibc_peer;           /* owning peer */
         struct list_head    ibc_list;           /* stash on peer's conn list */
         __u64               ibc_incarnation;    /* which instance of the peer */
         atomic_t            ibc_refcount;       /* # users */
@@ -342,27 +338,27 @@ typedef struct koib_conn
         int                 ibc_credits;        /* # credits I have */
         int                 ibc_outstanding_credits; /* # credits to return */
         struct list_head    ibc_tx_queue;       /* send queue */
-        struct list_head    ibc_rdma_queue;     /* tx awaiting RDMA completion */
+        struct list_head    ibc_active_txs;     /* active tx awaiting completion */
         spinlock_t          ibc_lock;           /* serialise */
-        koib_rx_t          *ibc_rxs;            /* the rx descs */
-        koib_pages_t       *ibc_rx_pages;       /* premapped rx msg pages */
+        kib_rx_t           *ibc_rxs;            /* the rx descs */
+        kib_pages_t        *ibc_rx_pages;       /* premapped rx msg pages */
         struct ib_qp       *ibc_qp;             /* queue pair */
         __u32               ibc_qpn;            /* queue pair number */
         tTS_IB_CM_COMM_ID   ibc_comm_id;        /* connection ID? */
-        koib_connreq_t     *ibc_connreq;        /* connection request state */
-} koib_conn_t;
+        kib_connreq_t      *ibc_connreq;        /* connection request state */
+} kib_conn_t;
 
-#define OPENIBNAL_CONN_INIT_NOTHING      0      /* initial state */
-#define OPENIBNAL_CONN_INIT_QP           1      /* ibc_qp set up */
-#define OPENIBNAL_CONN_CONNECTING        2      /* started to connect */
-#define OPENIBNAL_CONN_ESTABLISHED       3      /* connection established */
-#define OPENIBNAL_CONN_DEATHROW          4      /* waiting to be closed */
-#define OPENIBNAL_CONN_ZOMBIE            5      /* waiting to be freed */
+#define IBNAL_CONN_INIT_NOTHING      0          /* initial state */
+#define IBNAL_CONN_INIT_QP           1          /* ibc_qp set up */
+#define IBNAL_CONN_CONNECTING        2          /* started to connect */
+#define IBNAL_CONN_ESTABLISHED       3          /* connection established */
+#define IBNAL_CONN_DEATHROW          4          /* waiting to be closed */
+#define IBNAL_CONN_ZOMBIE            5          /* waiting to be freed */
 
-typedef struct koib_peer
+typedef struct kib_peer
 {
         struct list_head    ibp_list;           /* stash on global peer list */
-        struct list_head    ibp_connd_list;     /* schedule on koib_connd_peers */
+        struct list_head    ibp_connd_list;     /* schedule on kib_connd_peers */
         ptl_nid_t           ibp_nid;            /* who's on the other end(s) */
         atomic_t            ibp_refcount;       /* # users */
         int                 ibp_persistence;    /* "known" peer refs */
@@ -371,30 +367,30 @@ typedef struct koib_peer
         int                 ibp_connecting;     /* connecting+accepting */
         unsigned long       ibp_reconnect_time; /* when reconnect may be attempted */
         unsigned long       ibp_reconnect_interval; /* exponential backoff */
-} koib_peer_t;
+} kib_peer_t;
 
 
-extern lib_nal_t        koibnal_lib;
-extern koib_data_t      koibnal_data;
-extern koib_tunables_t  koibnal_tunables;
+extern lib_nal_t       kibnal_lib;
+extern kib_data_t      kibnal_data;
+extern kib_tunables_t  kibnal_tunables;
 
 static inline struct list_head *
-koibnal_nid2peerlist (ptl_nid_t nid) 
+kibnal_nid2peerlist (ptl_nid_t nid) 
 {
-        unsigned int hash = ((unsigned int)nid) % koibnal_data.koib_peer_hash_size;
+        unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size;
         
-        return (&koibnal_data.koib_peers [hash]);
+        return (&kibnal_data.kib_peers [hash]);
 }
 
 static inline int
-koibnal_peer_active(koib_peer_t *peer)
+kibnal_peer_active(kib_peer_t *peer)
 {
         /* Am I in the peer hash table? */
         return (!list_empty(&peer->ibp_list));
 }
 
 static inline void
-koibnal_queue_tx_locked (koib_tx_t *tx, koib_conn_t *conn)
+kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
 {
         /* CAVEAT EMPTOR: tx takes caller's ref on conn */
 
@@ -402,40 +398,41 @@ koibnal_queue_tx_locked (koib_tx_t *tx, koib_conn_t *conn)
         LASSERT (tx->tx_conn == NULL);          /* only set here */
 
         tx->tx_conn = conn;
+        tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
         list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
 }
 
-#define KOIBNAL_SERVICE_KEY_MASK  (IB_SA_SERVICE_COMP_MASK_NAME |       \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_1 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_2 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_3 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_4 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_5 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_6 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_7 |    \
-                                   IB_SA_SERVICE_COMP_MASK_DATA8_8)
+#define KIBNAL_SERVICE_KEY_MASK  (IB_SA_SERVICE_COMP_MASK_NAME |        \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_1 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_2 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_3 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_4 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_5 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_6 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_7 |     \
+                                  IB_SA_SERVICE_COMP_MASK_DATA8_8)
 
 static inline __u64*
-koibnal_service_nid_field(struct ib_common_attrib_service *srv)
+kibnal_service_nid_field(struct ib_common_attrib_service *srv)
 {
-        /* must be consistent with KOIBNAL_SERVICE_KEY_MASK */
+        /* must be consistent with KIBNAL_SERVICE_KEY_MASK */
         return (__u64 *)srv->service_data8;
 }
 
 
 static inline void
-koibnal_set_service_keys(struct ib_common_attrib_service *srv, ptl_nid_t nid)
+kibnal_set_service_keys(struct ib_common_attrib_service *srv, ptl_nid_t nid)
 {
-        LASSERT (strlen (OPENIBNAL_SERVICE_NAME) < sizeof(srv->service_name));
+        LASSERT (strlen (IBNAL_SERVICE_NAME) < sizeof(srv->service_name));
         memset (srv->service_name, 0, sizeof(srv->service_name));
-        strcpy (srv->service_name, OPENIBNAL_SERVICE_NAME);
+        strcpy (srv->service_name, IBNAL_SERVICE_NAME);
 
-        *koibnal_service_nid_field(srv) = cpu_to_le64(nid);
+        *kibnal_service_nid_field(srv) = cpu_to_le64(nid);
 }
 
 #if 0
 static inline void
-koibnal_show_rdma_attr (koib_conn_t *conn)
+kibnal_show_rdma_attr (kib_conn_t *conn)
 {
         struct ib_qp_attribute qp_attr;
         int                    rc;
@@ -457,7 +454,7 @@ koibnal_show_rdma_attr (koib_conn_t *conn)
 
 #if CONFIG_X86
 static inline __u64
-koibnal_page2phys (struct page *p)
+kibnal_page2phys (struct page *p)
 {
         __u64 page_number = p - mem_map;
         
@@ -467,42 +464,69 @@ koibnal_page2phys (struct page *p)
 # error "no page->phys"
 #endif
 
-extern koib_peer_t *koibnal_create_peer (ptl_nid_t nid);
-extern void koibnal_put_peer (koib_peer_t *peer);
-extern int koibnal_del_peer (ptl_nid_t nid, int single_share);
-extern koib_peer_t *koibnal_find_peer_locked (ptl_nid_t nid);
-extern void koibnal_unlink_peer_locked (koib_peer_t *peer);
-extern int  koibnal_close_stale_conns_locked (koib_peer_t *peer, 
+/* CAVEAT EMPTOR:
+ * We rely on tx/rx descriptor alignment to allow us to use the lowest bit
+ * of the work request id as a flag to determine if the completion is for a
+ * transmit or a receive.  It seems that the CQ entry's 'op' field
+ * isn't always set correctly on completions that occur after QP teardown. */
+
+static inline __u64
+kibnal_ptr2wreqid (void *ptr, int isrx)
+{
+        unsigned long lptr = (unsigned long)ptr;
+
+        LASSERT ((lptr & 1) == 0);
+        return (__u64)(lptr | (isrx ? 1 : 0));
+}
+
+static inline void *
+kibnal_wreqid2ptr (__u64 wreqid)
+{
+        return (void *)(((unsigned long)wreqid) & ~1UL);
+}
+
+static inline int
+kibnal_wreqid_is_rx (__u64 wreqid)
+{
+        return (wreqid & 1) != 0;
+}
+
+extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid);
+extern void kibnal_put_peer (kib_peer_t *peer);
+extern int kibnal_del_peer (ptl_nid_t nid, int single_share);
+extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid);
+extern void kibnal_unlink_peer_locked (kib_peer_t *peer);
+extern int  kibnal_close_stale_conns_locked (kib_peer_t *peer, 
                                               __u64 incarnation);
-extern koib_conn_t *koibnal_create_conn (void);
-extern void koibnal_put_conn (koib_conn_t *conn);
-extern void koibnal_destroy_conn (koib_conn_t *conn);
-extern int koibnal_alloc_pages (koib_pages_t **pp, int npages, int access);
-extern void koibnal_free_pages (koib_pages_t *p);
+extern kib_conn_t *kibnal_create_conn (void);
+extern void kibnal_put_conn (kib_conn_t *conn);
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access);
+extern void kibnal_free_pages (kib_pages_t *p);
 
-extern void koibnal_check_sends (koib_conn_t *conn);
+extern void kibnal_check_sends (kib_conn_t *conn);
 
 extern tTS_IB_CM_CALLBACK_RETURN
-koibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid,
+kibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid,
                        void *param, void *arg);
 extern tTS_IB_CM_CALLBACK_RETURN 
-koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid,
+kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid,
                                void *param, void *arg);
 
-extern void koibnal_close_conn_locked (koib_conn_t *conn, int error);
-extern void koibnal_destroy_conn (koib_conn_t *conn);
-extern int  koibnal_thread_start (int (*fn)(void *arg), void *arg);
-extern int  koibnal_scheduler(void *arg);
-extern int  koibnal_connd (void *arg);
-extern void koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg);
-extern void koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg);
-extern void koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob);
-extern int  koibnal_close_conn (koib_conn_t *conn, int why);
-extern void koibnal_start_active_rdma (int type, int status,
-                                       koib_rx_t *rx, lib_msg_t *libmsg,
-                                       unsigned int niov, 
-                                       struct iovec *iov, ptl_kiov_t *kiov,
-                                       size_t offset, size_t nob);
+extern void kibnal_close_conn_locked (kib_conn_t *conn, int error);
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+extern int  kibnal_thread_start (int (*fn)(void *arg), void *arg);
+extern int  kibnal_scheduler(void *arg);
+extern int  kibnal_connd (void *arg);
+extern void kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg);
+extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob);
+extern int  kibnal_close_conn (kib_conn_t *conn, int why);
+extern void kibnal_start_active_rdma (int type, int status, 
+                                      kib_rx_t *rx, lib_msg_t *libmsg,
+                                      unsigned int niov,
+                                      struct iovec *iov, ptl_kiov_t *kiov,
+                                      size_t offset, size_t nob);
+
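
The kibnal_ptr2wreqid()/kibnal_wreqid2ptr() helpers above let the merged completion handler tell transmits from receives by stealing the low bit of the 64-bit work request id, which is safe because tx/rx descriptors are at least 2-byte aligned. A self-contained round trip of the same encoding, with uint64_t standing in for the kernel's __u64 and a local int as a stand-in descriptor:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Standalone copies of the wreqid helpers, using uint64_t in place of
 * the kernel's __u64 so this builds in userland. */
static uint64_t ptr2wreqid(void *ptr, int isrx)
{
        unsigned long lptr = (unsigned long)ptr;

        assert((lptr & 1) == 0);       /* descriptors are aligned */
        return (uint64_t)(lptr | (isrx ? 1 : 0));
}

static void *wreqid2ptr(uint64_t wreqid)
{
        return (void *)(((unsigned long)wreqid) & ~1UL);
}

static int wreqid_is_rx(uint64_t wreqid)
{
        return (wreqid & 1) != 0;
}

int main(void)
{
        int desc;                      /* stand-in for a kib_rx_t */
        uint64_t id = ptr2wreqid(&desc, 1);

        /* The completion callback recovers both the descriptor pointer
         * and the tx/rx flag from the single work request id. */
        assert(wreqid_is_rx(id));
        assert(wreqid2ptr(id) == (void *)&desc);
        printf("rx completion for %p\n", wreqid2ptr(id));
        return 0;
}

kibnal_post_rx() below uses exactly this encoding when it fills in rx_sp.work_request_id with kibnal_ptr2wreqid(rx, 1).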
 
 
 
index 79bf37a..d774853 100644
  *
  */
 void
-koibnal_schedule_tx_done (koib_tx_t *tx)
+kibnal_schedule_tx_done (kib_tx_t *tx)
 {
         unsigned long flags;
 
-        spin_lock_irqsave (&koibnal_data.koib_sched_lock, flags);
+        spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);
 
-        list_add_tail(&tx->tx_list, &koibnal_data.koib_sched_txq);
-        wake_up (&koibnal_data.koib_sched_waitq);
+        list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
+        wake_up (&kibnal_data.kib_sched_waitq);
 
-        spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags);
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
 }
 
 void
-koibnal_tx_done (koib_tx_t *tx)
+kibnal_tx_done (kib_tx_t *tx)
 {
         ptl_err_t        ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
         unsigned long    flags;
@@ -49,31 +49,31 @@ koibnal_tx_done (koib_tx_t *tx)
         int              rc;
 
         LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting callback */
-        LASSERT (!tx->tx_passive_rdma_wait);    /* mustn't be on ibc_rdma_queue */
+        LASSERT (!tx->tx_passive_rdma_wait);    /* mustn't be awaiting RDMA */
 
         switch (tx->tx_mapped) {
         default:
                 LBUG();
 
-        case KOIB_TX_UNMAPPED:
+        case KIB_TX_UNMAPPED:
                 break;
                 
-        case KOIB_TX_MAPPED:
+        case KIB_TX_MAPPED:
                 if (in_interrupt()) {
                         /* can't deregister memory in IRQ context... */
-                        koibnal_schedule_tx_done(tx);
+                        kibnal_schedule_tx_done(tx);
                         return;
                 }
                 rc = ib_memory_deregister(tx->tx_md.md_handle.mr);
                 LASSERT (rc == 0);
-                tx->tx_mapped = KOIB_TX_UNMAPPED;
+                tx->tx_mapped = KIB_TX_UNMAPPED;
                 break;
 
-#if OPENIBNAL_FMR
-        case KOIB_TX_MAPPED_FMR:
+#if IBNAL_FMR
+        case KIB_TX_MAPPED_FMR:
                 if (in_interrupt() && tx->tx_status != 0) {
                         /* can't flush FMRs in IRQ context... */
-                        koibnal_schedule_tx_done(tx);
+                        kibnal_schedule_tx_done(tx);
                         return;
                 }              
 
@@ -81,8 +81,8 @@ koibnal_tx_done (koib_tx_t *tx)
                 LASSERT (rc == 0);
 
                 if (tx->tx_status != 0)
-                        ib_fmr_pool_force_flush(koibnal_data.koib_fmr_pool);
-                tx->tx_mapped = KOIB_TX_UNMAPPED;
+                        ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
+                tx->tx_mapped = KIB_TX_UNMAPPED;
                 break;
 #endif
         }
@@ -92,12 +92,12 @@ koibnal_tx_done (koib_tx_t *tx)
                 if (tx->tx_libmsg[i] == NULL)
                         continue;
 
-                lib_finalize (&koibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
+                lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
                 tx->tx_libmsg[i] = NULL;
         }
         
         if (tx->tx_conn != NULL) {
-                koibnal_put_conn (tx->tx_conn);
+                kibnal_put_conn (tx->tx_conn);
                 tx->tx_conn = NULL;
         }
 
@@ -105,52 +105,52 @@ koibnal_tx_done (koib_tx_t *tx)
         tx->tx_passive_rdma = 0;
         tx->tx_status = 0;
 
-        spin_lock_irqsave (&koibnal_data.koib_tx_lock, flags);
+        spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
 
         if (tx->tx_isnblk) {
-                list_add_tail (&tx->tx_list, &koibnal_data.koib_idle_nblk_txs);
+                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
         } else {
-                list_add_tail (&tx->tx_list, &koibnal_data.koib_idle_txs);
-                wake_up (&koibnal_data.koib_idle_tx_waitq);
+                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
+                wake_up (&kibnal_data.kib_idle_tx_waitq);
         }
 
-        spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags);
+        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
 }
 
-koib_tx_t *
-koibnal_get_idle_tx (int may_block) 
+kib_tx_t *
+kibnal_get_idle_tx (int may_block) 
 {
-        unsigned long    flags;
-        koib_tx_t    *tx = NULL;
+        unsigned long  flags;
+        kib_tx_t      *tx = NULL;
         
         for (;;) {
-                spin_lock_irqsave (&koibnal_data.koib_tx_lock, flags);
+                spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
 
                 /* "normal" descriptor is free */
-                if (!list_empty (&koibnal_data.koib_idle_txs)) {
-                        tx = list_entry (koibnal_data.koib_idle_txs.next,
-                                         koib_tx_t, tx_list);
+                if (!list_empty (&kibnal_data.kib_idle_txs)) {
+                        tx = list_entry (kibnal_data.kib_idle_txs.next,
+                                         kib_tx_t, tx_list);
                         break;
                 }
 
                 if (!may_block) {
                         /* may dip into reserve pool */
-                        if (list_empty (&koibnal_data.koib_idle_nblk_txs)) {
+                        if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
                                 CERROR ("reserved tx desc pool exhausted\n");
                                 break;
                         }
 
-                        tx = list_entry (koibnal_data.koib_idle_nblk_txs.next,
-                                         koib_tx_t, tx_list);
+                        tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
+                                         kib_tx_t, tx_list);
                         break;
                 }
 
                 /* block for idle tx */
-                spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags);
+                spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
 
-                wait_event (koibnal_data.koib_idle_tx_waitq,
-                            !list_empty (&koibnal_data.koib_idle_txs) ||
-                            koibnal_data.koib_shutdown);
+                wait_event (kibnal_data.kib_idle_tx_waitq,
+                            !list_empty (&kibnal_data.kib_idle_txs) ||
+                            kibnal_data.kib_shutdown);
         }
 
         if (tx != NULL) {
@@ -159,9 +159,9 @@ koibnal_get_idle_tx (int may_block)
                 /* Allocate a new passive RDMA completion cookie.  It might
                  * not be needed, but we've got a lock right now and we're
                  * unlikely to wrap... */
-                tx->tx_passive_rdma_cookie = koibnal_data.koib_next_tx_cookie++;
+                tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
 
-                LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED);
+                LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
                 LASSERT (tx->tx_nsp == 0);
                 LASSERT (tx->tx_sending == 0);
                 LASSERT (tx->tx_status == 0);
@@ -172,15 +172,15 @@ koibnal_get_idle_tx (int may_block)
                 LASSERT (tx->tx_libmsg[1] == NULL);
         }
 
-        spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags);
+        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
         
         return (tx);
 }
 
 int
-koibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
+kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
 {
-        /* I would guess that if koibnal_get_peer (nid) == NULL,
+        /* I would guess that if kibnal_get_peer (nid) == NULL,
            and we're not routing, then 'nid' is very distant :) */
         if ( nal->libnal_ni.ni_pid.nid == nid ) {
                 *dist = 0;
@@ -192,7 +192,7 @@ koibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
 }
 
 void
-koibnal_complete_passive_rdma(koib_conn_t *conn, __u64 cookie, int status)
+kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
 {
         struct list_head *ttmp;
         unsigned long     flags;
@@ -200,30 +200,34 @@ koibnal_complete_passive_rdma(koib_conn_t *conn, __u64 cookie, int status)
 
         spin_lock_irqsave (&conn->ibc_lock, flags);
 
-        list_for_each (ttmp, &conn->ibc_rdma_queue) {
-                koib_tx_t *tx = list_entry(ttmp, koib_tx_t, tx_list);
-                
-                LASSERT (tx->tx_passive_rdma);
-                LASSERT (tx->tx_passive_rdma_wait);
+        list_for_each (ttmp, &conn->ibc_active_txs) {
+                kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);
 
-                if (tx->tx_passive_rdma_cookie != cookie)
-                        continue;
+                LASSERT (tx->tx_passive_rdma ||
+                         !tx->tx_passive_rdma_wait);
 
-                CDEBUG(D_NET, "Complete %p "LPD64"\n", tx, cookie);
+                LASSERT (tx->tx_passive_rdma_wait ||
+                         tx->tx_sending != 0);
 
-                list_del (&tx->tx_list);
+                if (!tx->tx_passive_rdma_wait ||
+                    tx->tx_passive_rdma_cookie != cookie)
+                        continue;
+
+                CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);
 
+                tx->tx_status = status;
                 tx->tx_passive_rdma_wait = 0;
                 idle = (tx->tx_sending == 0);
 
-                tx->tx_status = status;
+                if (idle)
+                        list_del (&tx->tx_list);
 
                 spin_unlock_irqrestore (&conn->ibc_lock, flags);
 
                 /* I could be racing with tx callbacks.  It's whoever
                  * _makes_ tx idle that frees it */
                 if (idle)
-                        koibnal_tx_done (tx);
+                        kibnal_tx_done (tx);
                 return;
         }
                 
@@ -234,32 +238,32 @@ koibnal_complete_passive_rdma(koib_conn_t *conn, __u64 cookie, int status)
 }
 
 void
-koibnal_post_rx (koib_rx_t *rx, int do_credits)
+kibnal_post_rx (kib_rx_t *rx, int do_credits)
 {
-        koib_conn_t  *conn = rx->rx_conn;
+        kib_conn_t   *conn = rx->rx_conn;
         int           rc;
         unsigned long flags;
 
         rx->rx_gl = (struct ib_gather_scatter) {
                 .address = rx->rx_vaddr,
-                .length  = OPENIBNAL_MSG_SIZE,
-                .key     = conn->ibc_rx_pages->oibp_lkey,
+                .length  = IBNAL_MSG_SIZE,
+                .key     = conn->ibc_rx_pages->ibp_lkey,
         };
-        
+
         rx->rx_sp = (struct ib_receive_param) {
-                .work_request_id        = (__u64)(unsigned long)rx,
+                .work_request_id        = kibnal_ptr2wreqid(rx, 1),
                 .scatter_list           = &rx->rx_gl,
                 .num_scatter_entries    = 1,
                 .device_specific        = NULL,
                 .signaled               = 1,
         };
 
-        LASSERT (conn->ibc_state >= OPENIBNAL_CONN_ESTABLISHED);
+        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
         LASSERT (!rx->rx_posted);
         rx->rx_posted = 1;
         mb();
 
-        if (conn->ibc_state != OPENIBNAL_CONN_ESTABLISHED)
+        if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
                 rc = -ECONNABORTED;
         else
                 rc = ib_receive (conn->ibc_qp, &rx->rx_sp, 1);
@@ -270,26 +274,26 @@ koibnal_post_rx (koib_rx_t *rx, int do_credits)
                         conn->ibc_outstanding_credits++;
                         spin_unlock_irqrestore(&conn->ibc_lock, flags);
 
-                        koibnal_check_sends(conn);
+                        kibnal_check_sends(conn);
                 }
                 return;
         }
 
-        if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) {
+        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
                 CERROR ("Error posting receive -> "LPX64": %d\n",
                         conn->ibc_peer->ibp_nid, rc);
-                koibnal_close_conn (rx->rx_conn, rc);
+                kibnal_close_conn (rx->rx_conn, rc);
         } else {
                 CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n",
                         conn->ibc_peer->ibp_nid, rc);
         }
 
         /* Drop rx's ref */
-        koibnal_put_conn (conn);
+        kibnal_put_conn (conn);
 }
 
-#if OPENIBNAL_CKSUM
-__u32 koibnal_cksum (void *ptr, int nob)
+#if IBNAL_CKSUM
+__u32 kibnal_cksum (void *ptr, int nob)
 {
         char  *c  = ptr;
         __u32  sum = 0;
@@ -302,17 +306,17 @@ __u32 koibnal_cksum (void *ptr, int nob)
 #endif
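
The hunk above elides the checksum loop body. A minimal sketch in the
same style, assuming a byte-wise rotate-and-add (the elided code may
differ):

__u32
kibnal_cksum (void *ptr, int nob)
{
        char  *c   = ptr;
        __u32  sum = 0;

        /* rotate left by one and add each byte: cheap, debug-only */
        while (nob-- > 0)
                sum = ((sum << 1) | (sum >> 31)) + *c++;

        return (sum);
}
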
 
 void
-koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
+kibnal_rx_callback (struct ib_cq_entry *e)
 {
-        koib_rx_t    *rx = (koib_rx_t *)((unsigned long)e->work_request_id);
-        koib_msg_t   *msg = rx->rx_msg;
-        koib_conn_t  *conn = rx->rx_conn;
+        kib_rx_t     *rx = (kib_rx_t *)kibnal_wreqid2ptr(e->work_request_id);
+        kib_msg_t    *msg = rx->rx_msg;
+        kib_conn_t   *conn = rx->rx_conn;
         int           nob = e->bytes_transferred;
-        const int     base_nob = offsetof(koib_msg_t, oibm_u);
+        const int     base_nob = offsetof(kib_msg_t, ibm_u);
         int           credits;
         int           flipped;
         unsigned long flags;
-#if OPENIBNAL_CKSUM
+#if IBNAL_CKSUM
         __u32         msg_cksum;
         __u32         computed_cksum;
 #endif
@@ -324,11 +328,11 @@ koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
 
         /* receives complete with error in any case after we've started
          * closing the QP */
-        if (conn->ibc_state >= OPENIBNAL_CONN_DEATHROW)
+        if (conn->ibc_state >= IBNAL_CONN_DEATHROW)
                 goto failed;
 
         /* We don't post receives until the conn is established */
-        LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED);
+        LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
 
         if (e->status != IB_COMPLETION_STATUS_SUCCESS) {
                 CERROR("Rx from "LPX64" failed: %d\n", 
@@ -344,35 +348,35 @@ koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
 
         /* Receiver does any byte flipping if necessary... */
 
-        if (msg->oibm_magic == OPENIBNAL_MSG_MAGIC) {
+        if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
                 flipped = 0;
         } else {
-                if (msg->oibm_magic != __swab32(OPENIBNAL_MSG_MAGIC)) {
+                if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
                         CERROR ("Unrecognised magic: %08x from "LPX64"\n", 
-                                msg->oibm_magic, conn->ibc_peer->ibp_nid);
+                                msg->ibm_magic, conn->ibc_peer->ibp_nid);
                         goto failed;
                 }
                 flipped = 1;
-                __swab16s (&msg->oibm_version);
-                LASSERT (sizeof(msg->oibm_type) == 1);
-                LASSERT (sizeof(msg->oibm_credits) == 1);
+                __swab16s (&msg->ibm_version);
+                LASSERT (sizeof(msg->ibm_type) == 1);
+                LASSERT (sizeof(msg->ibm_credits) == 1);
         }
 
-        if (msg->oibm_version != OPENIBNAL_MSG_VERSION) {
+        if (msg->ibm_version != IBNAL_MSG_VERSION) {
                 CERROR ("Incompatible msg version %d (%d expected)\n",
-                        msg->oibm_version, OPENIBNAL_MSG_VERSION);
+                        msg->ibm_version, IBNAL_MSG_VERSION);
                 goto failed;
         }
 
-#if OPENIBNAL_CKSUM
-        if (nob != msg->oibm_nob) {
-                CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->oibm_nob);
+#if IBNAL_CKSUM
+        if (nob != msg->ibm_nob) {
+                CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob);
                 goto failed;
         }
 
-        msg_cksum = le32_to_cpu(msg->oibm_cksum);
-        msg->oibm_cksum = 0;
-        computed_cksum = koibnal_cksum (msg, nob);
+        msg_cksum = le32_to_cpu(msg->ibm_cksum);
+        msg->ibm_cksum = 0;
+        computed_cksum = kibnal_cksum (msg, nob);
         
         if (msg_cksum != computed_cksum) {
                 CERROR ("Checksum failure %d: (%d expected)\n",
@@ -383,101 +387,101 @@ koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
 #endif
 
         /* Have I received credits that will let me send? */
-        credits = msg->oibm_credits;
+        credits = msg->ibm_credits;
         if (credits != 0) {
                 spin_lock_irqsave(&conn->ibc_lock, flags);
                 conn->ibc_credits += credits;
                 spin_unlock_irqrestore(&conn->ibc_lock, flags);
                 
-                koibnal_check_sends(conn);
+                kibnal_check_sends(conn);
         }
 
-        switch (msg->oibm_type) {
-        case OPENIBNAL_MSG_NOOP:
-                koibnal_post_rx (rx, 1);
+        switch (msg->ibm_type) {
+        case IBNAL_MSG_NOOP:
+                kibnal_post_rx (rx, 1);
                 return;
 
-        case OPENIBNAL_MSG_IMMEDIATE:
-                if (nob < base_nob + sizeof (koib_immediate_msg_t)) {
+        case IBNAL_MSG_IMMEDIATE:
+                if (nob < base_nob + sizeof (kib_immediate_msg_t)) {
                         CERROR ("Short IMMEDIATE from "LPX64": %d\n",
                                 conn->ibc_peer->ibp_nid, nob);
                         goto failed;
                 }
                 break;
                 
-        case OPENIBNAL_MSG_PUT_RDMA:
-        case OPENIBNAL_MSG_GET_RDMA:
-                if (nob < base_nob + sizeof (koib_rdma_msg_t)) {
+        case IBNAL_MSG_PUT_RDMA:
+        case IBNAL_MSG_GET_RDMA:
+                if (nob < base_nob + sizeof (kib_rdma_msg_t)) {
                         CERROR ("Short RDMA msg from "LPX64": %d\n",
                                 conn->ibc_peer->ibp_nid, nob);
                         goto failed;
                 }
                 if (flipped) {
-                        __swab32s(&msg->oibm_u.rdma.oibrm_desc.rd_key);
-                        __swab32s(&msg->oibm_u.rdma.oibrm_desc.rd_nob);
-                        __swab64s(&msg->oibm_u.rdma.oibrm_desc.rd_addr);
+                        __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_key);
+                        __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_nob);
+                        __swab64s(&msg->ibm_u.rdma.ibrm_desc.rd_addr);
                 }
                 CDEBUG(D_NET, "%d RDMA: cookie "LPX64", key %x, addr "LPX64", nob %d\n",
-                       msg->oibm_type, msg->oibm_u.rdma.oibrm_cookie,
-                       msg->oibm_u.rdma.oibrm_desc.rd_key,
-                       msg->oibm_u.rdma.oibrm_desc.rd_addr,
-                       msg->oibm_u.rdma.oibrm_desc.rd_nob);
+                       msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie,
+                       msg->ibm_u.rdma.ibrm_desc.rd_key,
+                       msg->ibm_u.rdma.ibrm_desc.rd_addr,
+                       msg->ibm_u.rdma.ibrm_desc.rd_nob);
                 break;
                 
-        case OPENIBNAL_MSG_PUT_DONE:
-        case OPENIBNAL_MSG_GET_DONE:
-                if (nob < base_nob + sizeof (koib_completion_msg_t)) {
+        case IBNAL_MSG_PUT_DONE:
+        case IBNAL_MSG_GET_DONE:
+                if (nob < base_nob + sizeof (kib_completion_msg_t)) {
                         CERROR ("Short COMPLETION msg from "LPX64": %d\n",
                                 conn->ibc_peer->ibp_nid, nob);
                         goto failed;
                 }
                 if (flipped)
-                        __swab32s(&msg->oibm_u.completion.oibcm_status);
+                        __swab32s(&msg->ibm_u.completion.ibcm_status);
                 
                 CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
-                       msg->oibm_type, msg->oibm_u.completion.oibcm_cookie,
-                       msg->oibm_u.completion.oibcm_status);
+                       msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
+                       msg->ibm_u.completion.ibcm_status);
 
-                koibnal_complete_passive_rdma (conn, 
-                                               msg->oibm_u.completion.oibcm_cookie,
-                                               msg->oibm_u.completion.oibcm_status);
-                koibnal_post_rx (rx, 1);
+                kibnal_complete_passive_rdma (conn, 
+                                              msg->ibm_u.completion.ibcm_cookie,
+                                              msg->ibm_u.completion.ibcm_status);
+                kibnal_post_rx (rx, 1);
                 return;
                         
         default:
                 CERROR ("Can't parse type from "LPX64": %d\n",
-                        conn->ibc_peer->ibp_nid, msg->oibm_type);
+                        conn->ibc_peer->ibp_nid, msg->ibm_type);
                 goto failed;
         }
 
-        /* schedule for koibnal_rx() in thread context */
-        spin_lock_irqsave(&koibnal_data.koib_sched_lock, flags);
+        /* schedule for kibnal_rx() in thread context */
+        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
         
-        list_add_tail (&rx->rx_list, &koibnal_data.koib_sched_rxq);
-        wake_up (&koibnal_data.koib_sched_waitq);
+        list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
+        wake_up (&kibnal_data.kib_sched_waitq);
         
-        spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags);
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
         return;
         
  failed:
         CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
-        koibnal_close_conn(conn, -ECONNABORTED);
+        kibnal_close_conn(conn, -ECONNABORTED);
 
         /* Don't re-post rx & drop its ref on conn */
-        koibnal_put_conn(conn);
+        kibnal_put_conn(conn);
 }
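
kibnal_rx_callback() follows the receiver-makes-right convention:
messages travel in the sender's native byte order, and an opposite-endian
peer is detected by seeing the magic byte-reversed. The detection step
in isolation (hypothetical helper; __swab32() is the kernel's byte-swap):

static int
example_msg_endianness (__u32 magic)
{
        if (magic == IBNAL_MSG_MAGIC)
                return (0);                     /* peer matches my endianness */

        if (magic == __swab32 (IBNAL_MSG_MAGIC))
                return (1);                     /* opposite: flip every field */

        return (-EPROTO);                       /* not an IBNAL message */
}
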
 
 void
-koibnal_rx (koib_rx_t *rx)
+kibnal_rx (kib_rx_t *rx)
 {
-        koib_msg_t   *msg = rx->rx_msg;
+        kib_msg_t   *msg = rx->rx_msg;
 
         /* Clear flag so I can detect if I've sent an RDMA completion */
         rx->rx_rdma = 0;
 
-        switch (msg->oibm_type) {
-        case OPENIBNAL_MSG_GET_RDMA:
-                lib_parse(&koibnal_lib, &msg->oibm_u.rdma.oibrm_hdr, rx);
+        switch (msg->ibm_type) {
+        case IBNAL_MSG_GET_RDMA:
+                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
                 /* If the incoming get was matched, I'll have initiated the
                  * RDMA and the completion message... */
                 if (rx->rx_rdma)
@@ -487,12 +491,12 @@ koibnal_rx (koib_rx_t *rx)
                  * the peer's GET blocking for the full timeout. */
                 CERROR ("Completing unmatched RDMA GET from "LPX64"\n",
                         rx->rx_conn->ibc_peer->ibp_nid);
-                koibnal_start_active_rdma (OPENIBNAL_MSG_GET_DONE, -EIO,
-                                           rx, NULL, 0, NULL, NULL, 0, 0);
+                kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO,
+                                          rx, NULL, 0, NULL, NULL, 0, 0);
                 break;
                 
-        case OPENIBNAL_MSG_PUT_RDMA:
-                lib_parse(&koibnal_lib, &msg->oibm_u.rdma.oibrm_hdr, rx);
+        case IBNAL_MSG_PUT_RDMA:
+                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
                 if (rx->rx_rdma)
                         break;
                 /* This is most unusual, since even if lib_parse() didn't
@@ -505,8 +509,8 @@ koibnal_rx (koib_rx_t *rx)
                         rx->rx_conn->ibc_peer->ibp_nid);
                 break;
 
-        case OPENIBNAL_MSG_IMMEDIATE:
-                lib_parse(&koibnal_lib, &msg->oibm_u.immediate.oibim_hdr, rx);
+        case IBNAL_MSG_IMMEDIATE:
+                lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
                 LASSERT (!rx->rx_rdma);
                 break;
                 
@@ -515,12 +519,12 @@ koibnal_rx (koib_rx_t *rx)
                 break;
         }
 
-        koibnal_post_rx (rx, 1);
+        kibnal_post_rx (rx, 1);
 }
 
 #if 0
 int
-koibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)
+kibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)
 {
         struct page *page;
 
@@ -531,7 +535,7 @@ koibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)
         else if (vaddr >= PKMAP_BASE &&
                  vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
                 page = vmalloc_to_page ((void *)vaddr);
-                /* in 2.4 ^ just walks the page tables */
+        /* in 2.4 ^ just walks the page tables */
 #endif
         else
                 page = virt_to_page (vaddr);
@@ -540,13 +544,13 @@ koibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)
             !VALID_PAGE (page))
                 return (-EFAULT);
 
-        *physp = koibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1));
+        *physp = kibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1));
         return (0);
 }
 #endif
 
 int
-koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access,
+kibnal_map_iov (kib_tx_t *tx, enum ib_memory_access access,
                  int niov, struct iovec *iov, int offset, int nob)
                  
 {
@@ -555,7 +559,7 @@ koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access,
 
         LASSERT (nob > 0);
         LASSERT (niov > 0);
-        LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED);
+        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
 
         while (offset >= iov->iov_len) {
                 offset -= iov->iov_len;
@@ -572,7 +576,7 @@ koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access,
         vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
         tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
 
-        rc = ib_memory_register (koibnal_data.koib_pd,
+        rc = ib_memory_register (kibnal_data.kib_pd,
                                  vaddr, nob,
                                  access,
                                  &tx->tx_md.md_handle.mr,
@@ -584,21 +588,21 @@ koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access,
                 return (rc);
         }
 
-        tx->tx_mapped = KOIB_TX_MAPPED;
+        tx->tx_mapped = KIB_TX_MAPPED;
         return (0);
 }
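
Both mapping functions begin by walking 'offset' into the fragment list
before registering memory. The idiom, isolated as a hypothetical helper:

static struct iovec *
example_skip_offset (struct iovec *iov, int *niov, int *offset)
{
        /* consume whole fragments until 'offset' lands inside one */
        while (*offset >= (int)iov->iov_len) {
                *offset -= iov->iov_len;
                iov++;
                (*niov)--;
                LASSERT (*niov > 0);            /* offset must stay in range */
        }
        return (iov);
}
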
 
 int
-koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
+kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access,
                   int nkiov, ptl_kiov_t *kiov,
                   int offset, int nob)
 {
-#if OPENIBNAL_FMR
+#if IBNAL_FMR
         __u64                      *phys;
-        const int                   mapped = KOIB_TX_MAPPED_FMR;
+        const int                   mapped = KIB_TX_MAPPED_FMR;
 #else
         struct ib_physical_buffer  *phys;
-        const int                   mapped = KOIB_TX_MAPPED;
+        const int                   mapped = KIB_TX_MAPPED;
 #endif
         int                         page_offset;
         int                         nphys;
@@ -610,7 +614,7 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
 
         LASSERT (nob > 0);
         LASSERT (nkiov > 0);
-        LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED);
+        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
 
         while (offset >= kiov->kiov_len) {
                 offset -= kiov->kiov_len;
@@ -627,10 +631,10 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
         }
 
         page_offset = kiov->kiov_offset + offset;
-#if OPENIBNAL_FMR
-        phys[0] = koibnal_page2phys(kiov->kiov_page);
+#if IBNAL_FMR
+        phys[0] = kibnal_page2phys(kiov->kiov_page);
 #else
-        phys[0].address = koibnal_page2phys(kiov->kiov_page);
+        phys[0].address = kibnal_page2phys(kiov->kiov_page);
         phys[0].size = PAGE_SIZE;
 #endif
         nphys = 1;
@@ -667,10 +671,10 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
                 }
 
                 LASSERT (nphys * sizeof (*phys) < phys_size);
-#if OPENIBNAL_FMR
-                phys[nphys] = koibnal_page2phys(kiov->kiov_page);
+#if IBNAL_FMR
+                phys[nphys] = kibnal_page2phys(kiov->kiov_page);
 #else
-                phys[nphys].address = koibnal_page2phys(kiov->kiov_page);
+                phys[nphys].address = kibnal_page2phys(kiov->kiov_page);
                 phys[nphys].size = PAGE_SIZE;
 #endif
                 nphys++;
@@ -683,10 +687,10 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
         for (rc = 0; rc < nphys; rc++)
                 CWARN ("   [%d] "LPX64" / %d\n", rc, phys[rc].address, phys[rc].size);
 #endif
-        tx->tx_md.md_addr = OPENIBNAL_RDMA_BASE;
+        tx->tx_md.md_addr = IBNAL_RDMA_BASE;
 
-#if OPENIBNAL_FMR
-        rc = ib_fmr_register_physical (koibnal_data.koib_fmr_pool,
+#if IBNAL_FMR
+        rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
                                        phys, nphys,
                                        &tx->tx_md.md_addr,
                                        page_offset,
@@ -694,7 +698,7 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
                                        &tx->tx_md.md_lkey,
                                        &tx->tx_md.md_rkey);
 #else
-        rc = ib_memory_register_physical (koibnal_data.koib_pd,
+        rc = ib_memory_register_physical (kibnal_data.kib_pd,
                                           phys, nphys,
                                           &tx->tx_md.md_addr,
                                           nob, page_offset,
@@ -717,24 +721,24 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
         return (rc);
 }
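
kibnal_map_kiov() hands the HCA a flat page array, so the fragment list
must tile pages exactly: interior fragments cover whole pages, the first
must run to the end of its page, and the last must start at offset zero.
A hypothetical checker for the interior-fragment part of that constraint:

static int
example_kiov_interior_ok (ptl_kiov_t *kiov, int nkiov)
{
        int i;

        for (i = 1; i < nkiov - 1; i++)
                if (kiov[i].kiov_offset != 0 ||
                    kiov[i].kiov_len != PAGE_SIZE)
                        return (0);             /* would leave a hole */

        return (1);
}
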
 
-koib_conn_t *
-koibnal_find_conn_locked (koib_peer_t *peer)
+kib_conn_t *
+kibnal_find_conn_locked (kib_peer_t *peer)
 {
         struct list_head *tmp;
 
         /* just return the first connection */
         list_for_each (tmp, &peer->ibp_conns) {
-                return (list_entry(tmp, koib_conn_t, ibc_list));
+                return (list_entry(tmp, kib_conn_t, ibc_list));
         }
 
         return (NULL);
 }
 
 void
-koibnal_check_sends (koib_conn_t *conn)
+kibnal_check_sends (kib_conn_t *conn)
 {
         unsigned long   flags;
-        koib_tx_t      *tx;
+        kib_tx_t       *tx;
         int             rc;
         int             i;
         int             done;
@@ -742,39 +746,39 @@ koibnal_check_sends (koib_conn_t *conn)
 
         spin_lock_irqsave (&conn->ibc_lock, flags);
 
+        LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
+
         if (list_empty(&conn->ibc_tx_queue) &&
-            conn->ibc_outstanding_credits >= OPENIBNAL_CREDIT_HIGHWATER) {
+            conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
                 spin_unlock_irqrestore(&conn->ibc_lock, flags);
-
-                tx = koibnal_get_idle_tx(0);     /* don't block */
+                
+                tx = kibnal_get_idle_tx(0);     /* don't block */
                 if (tx != NULL)
-                        koibnal_init_tx_msg(tx, OPENIBNAL_MSG_NOOP, 0);
+                        kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
 
                 spin_lock_irqsave(&conn->ibc_lock, flags);
-
+                
                 if (tx != NULL) {
                         atomic_inc(&conn->ibc_refcount);
-                        koibnal_queue_tx_locked(tx, conn);
+                        kibnal_queue_tx_locked(tx, conn);
                 }
         }
 
-        LASSERT (conn->ibc_nsends_posted <= OPENIBNAL_MSG_QUEUE_SIZE);
-
         while (!list_empty (&conn->ibc_tx_queue)) {
-                tx = list_entry (conn->ibc_tx_queue.next, koib_tx_t, tx_list);
+                tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
 
                 /* We rely on this for QP sizing */
                 LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= 2);
 
                 LASSERT (conn->ibc_outstanding_credits >= 0);
-                LASSERT (conn->ibc_outstanding_credits <= OPENIBNAL_MSG_QUEUE_SIZE);
+                LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
                 LASSERT (conn->ibc_credits >= 0);
-                LASSERT (conn->ibc_credits <= OPENIBNAL_MSG_QUEUE_SIZE);
+                LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
 
                /* Not on ibc_active_txs */
                 LASSERT (!tx->tx_passive_rdma_wait);
 
-                if (conn->ibc_nsends_posted == OPENIBNAL_MSG_QUEUE_SIZE)
+                if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
                         break;
 
                 if (conn->ibc_credits == 0)     /* no credits */
@@ -786,37 +790,29 @@ koibnal_check_sends (koib_conn_t *conn)
 
                 list_del (&tx->tx_list);
 
-                if (tx->tx_msg->oibm_type == OPENIBNAL_MSG_NOOP &&
+                if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
                     (!list_empty(&conn->ibc_tx_queue) ||
-                     conn->ibc_outstanding_credits < OPENIBNAL_CREDIT_HIGHWATER)) {
-                        /* Redundant NOOP */
+                     conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
+                        /* redundant NOOP */
                         spin_unlock_irqrestore(&conn->ibc_lock, flags);
-                        koibnal_tx_done(tx);
+                        kibnal_tx_done(tx);
                         spin_lock_irqsave(&conn->ibc_lock, flags);
                         continue;
                 }
-                
-                /* incoming RDMA completion can find this one now */
-                if (tx->tx_passive_rdma) {
-                        list_add (&tx->tx_list, &conn->ibc_rdma_queue);
-                        tx->tx_passive_rdma_wait = 1;
-                        tx->tx_passive_rdma_deadline = 
-                                jiffies + koibnal_tunables.koib_io_timeout * HZ;
-                }
 
-                tx->tx_msg->oibm_credits = conn->ibc_outstanding_credits;
+                tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits;
                 conn->ibc_outstanding_credits = 0;
 
-                /* use the free memory barrier when we unlock to ensure
-                 * sending set before we can get the tx callback. */
                 conn->ibc_nsends_posted++;
                 conn->ibc_credits--;
-                tx->tx_sending = tx->tx_nsp;
 
-#if OPENIBNAL_CKSUM
-                tx->tx_msg->oibm_cksum = 0;
-                tx->tx_msg->oibm_cksum = koibnal_cksum(tx->tx_msg, tx->tx_msg->oibm_nob);
-                CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->oibm_cksum, tx->tx_msg->oibm_nob);
+                tx->tx_sending = tx->tx_nsp;
+                tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
+                list_add (&tx->tx_list, &conn->ibc_active_txs);
+#if IBNAL_CKSUM
+                tx->tx_msg->ibm_cksum = 0;
+                tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob);
+                CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob);
 #endif
                 spin_unlock_irqrestore (&conn->ibc_lock, flags);
 
@@ -827,7 +823,7 @@ koibnal_check_sends (koib_conn_t *conn)
 
                 rc = -ECONNABORTED;
                 nwork = 0;
-                if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) {
+                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
                         tx->tx_status = 0;
                         /* Driver only accepts 1 item at a time */
                         for (i = 0; i < tx->tx_nsp; i++) {
@@ -842,31 +838,31 @@ koibnal_check_sends (koib_conn_t *conn)
                 if (rc != 0) {
                         /* NB credits are transferred in the actual
                          * message, which can only be the last work item */
-                        conn->ibc_outstanding_credits += tx->tx_msg->oibm_credits;
+                        conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
                         conn->ibc_credits++;
                         conn->ibc_nsends_posted--;
-                        tx->tx_sending -= tx->tx_nsp - nwork;
+
                         tx->tx_status = rc;
+                        tx->tx_passive_rdma_wait = 0;
+                        tx->tx_sending -= tx->tx_nsp - nwork;
+
                         done = (tx->tx_sending == 0);
-                        
-                        if (tx->tx_passive_rdma) {
-                                tx->tx_passive_rdma_wait = 0;
+                        if (done)
                                 list_del (&tx->tx_list);
-                        }
                         
                         spin_unlock_irqrestore (&conn->ibc_lock, flags);
                         
-                        if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED)
+                        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
                                 CERROR ("Error %d posting transmit to "LPX64"\n", 
                                         rc, conn->ibc_peer->ibp_nid);
                         else
                                 CDEBUG (D_NET, "Error %d posting transmit to "
                                         LPX64"\n", rc, conn->ibc_peer->ibp_nid);
 
-                        koibnal_close_conn (conn, rc);
+                        kibnal_close_conn (conn, rc);
 
                         if (done)
-                                koibnal_tx_done (tx);
+                                kibnal_tx_done (tx);
                         return;
                 }
                 
@@ -876,10 +872,10 @@ koibnal_check_sends (koib_conn_t *conn)
 }
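
kibnal_check_sends() is where the connection's credit flow control
happens: ibc_credits counts receive buffers the peer still has posted
for me, ibc_outstanding_credits counts buffers I have re-posted but not
yet advertised, and every outgoing message returns the latter in
ibm_credits. The per-message accounting, extracted as a hypothetical
helper (the function above open-codes it under ibc_lock):

static int
example_post_ok_locked (kib_conn_t *conn, kib_tx_t *tx)
{
        if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
                return (0);                     /* my send queue is full */

        if (conn->ibc_credits == 0)
                return (0);                     /* peer has no rx buffer */

        /* piggy-back a credit for every rx re-posted since my last send */
        tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits;
        conn->ibc_outstanding_credits = 0;

        conn->ibc_nsends_posted++;
        conn->ibc_credits--;
        return (1);
}
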
 
 void
-koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
+kibnal_tx_callback (struct ib_cq_entry *e)
 {
-        koib_tx_t    *tx = (koib_tx_t *)((unsigned long)e->work_request_id);
-        koib_conn_t  *conn;
+        kib_tx_t     *tx = (kib_tx_t *)kibnal_wreqid2ptr(e->work_request_id);
+        kib_conn_t   *conn;
         unsigned long flags;
         int           idle;
 
@@ -901,6 +897,8 @@ koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
         tx->tx_sending--;
         idle = (tx->tx_sending == 0) &&         /* This is the final callback */
                (!tx->tx_passive_rdma_wait);     /* Not waiting for RDMA completion */
+        if (idle)
+                list_del(&tx->tx_list);
 
         CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
@@ -917,53 +915,62 @@ koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
         spin_unlock_irqrestore(&conn->ibc_lock, flags);
 
         if (idle)
-                koibnal_tx_done (tx);
+                kibnal_tx_done (tx);
 
         if (e->status != IB_COMPLETION_STATUS_SUCCESS) {
                 CERROR ("Tx completion to "LPX64" failed: %d\n", 
                         conn->ibc_peer->ibp_nid, e->status);
-                koibnal_close_conn (conn, -ENETDOWN);
+                kibnal_close_conn (conn, -ENETDOWN);
         } else {
                 /* can I shovel some more sends out the door? */
-                koibnal_check_sends(conn);
+                kibnal_check_sends(conn);
         }
 
-        koibnal_put_conn (conn);
+        kibnal_put_conn (conn);
 }
 
 void
-koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob)
+kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
+{
+        if (kibnal_wreqid_is_rx(e->work_request_id))
+                kibnal_rx_callback (e);
+        else
+                kibnal_tx_callback (e);
+}
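
kibnal_callback() can demultiplex completions because every work request
id is a tagged pointer: kibnal_ptr2wreqid() packs an rx/tx flag beside
the descriptor address, and kibnal_wreqid2ptr() / kibnal_wreqid_is_rx()
unpack it. Those helpers live in the header, outside this diff; a
minimal sketch consistent with their use here, assuming the tag sits in
the low bit (free, since the descriptors are word-aligned):

static inline __u64
kibnal_ptr2wreqid (void *ptr, int isrx)
{
        unsigned long lptr = (unsigned long)ptr;

        LASSERT ((lptr & 1) == 0);              /* low bit free for the tag */
        return ((__u64)(lptr | (isrx ? 1 : 0)));
}

static inline void *
kibnal_wreqid2ptr (__u64 wreqid)
{
        return ((void *)(((unsigned long)wreqid) & ~1UL));
}

static inline int
kibnal_wreqid_is_rx (__u64 wreqid)
{
        return ((wreqid & 1) != 0);
}
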
+
+void
+kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
 {
         struct ib_gather_scatter *gl = &tx->tx_gl[tx->tx_nsp];
         struct ib_send_param     *sp = &tx->tx_sp[tx->tx_nsp];
         int                       fence;
-        int                       nob = offsetof (koib_msg_t, oibm_u) + body_nob;
+        int                       nob = offsetof (kib_msg_t, ibm_u) + body_nob;
 
         LASSERT (tx->tx_nsp >= 0 && 
                  tx->tx_nsp < sizeof(tx->tx_sp)/sizeof(tx->tx_sp[0]));
-        LASSERT (nob <= OPENIBNAL_MSG_SIZE);
+        LASSERT (nob <= IBNAL_MSG_SIZE);
         
-        tx->tx_msg->oibm_magic = OPENIBNAL_MSG_MAGIC;
-        tx->tx_msg->oibm_version = OPENIBNAL_MSG_VERSION;
-        tx->tx_msg->oibm_type = type;
-#if OPENIBNAL_CKSUM
-        tx->tx_msg->oibm_nob = nob;
+        tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC;
+        tx->tx_msg->ibm_version = IBNAL_MSG_VERSION;
+        tx->tx_msg->ibm_type = type;
+#if IBNAL_CKSUM
+        tx->tx_msg->ibm_nob = nob;
 #endif
         /* Fence the message if it's bundled with an RDMA read */
         fence = (tx->tx_nsp > 0) &&
-                (type == OPENIBNAL_MSG_PUT_DONE);
+                (type == IBNAL_MSG_PUT_DONE);
 
         *gl = (struct ib_gather_scatter) {
                 .address = tx->tx_vaddr,
                 .length  = nob,
-                .key     = koibnal_data.koib_tx_pages->oibp_lkey,
+                .key     = kibnal_data.kib_tx_pages->ibp_lkey,
         };
 
         /* NB If this is an RDMA read, the completion message must wait for
          * the RDMA to complete.  Sends wait for previous RDMA writes
          * anyway... */
         *sp = (struct ib_send_param) {
-                .work_request_id      = (__u64)((unsigned long)tx),
+                .work_request_id      = kibnal_ptr2wreqid(tx, 0),
                 .op                   = IB_OP_SEND,
                 .gather_list          = gl,
                 .num_gather_entries   = 1,
@@ -979,26 +986,26 @@ koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob)
 }
 
 void
-koibnal_queue_tx (koib_tx_t *tx, koib_conn_t *conn)
+kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
 {
         unsigned long         flags;
 
         spin_lock_irqsave(&conn->ibc_lock, flags);
 
-        koibnal_queue_tx_locked (tx, conn);
+        kibnal_queue_tx_locked (tx, conn);
         
         spin_unlock_irqrestore(&conn->ibc_lock, flags);
         
-        koibnal_check_sends(conn);
+        kibnal_check_sends(conn);
 }
 
 void
-koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
+kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
 {
         unsigned long    flags;
-        koib_peer_t     *peer;
-        koib_conn_t     *conn;
-        rwlock_t        *g_lock = &koibnal_data.koib_global_lock;
+        kib_peer_t      *peer;
+        kib_conn_t      *conn;
+        rwlock_t        *g_lock = &kibnal_data.kib_global_lock;
 
         /* If I get here, I've committed to send, so I complete the tx with
          * failure on any problems */
@@ -1008,15 +1015,15 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
 
         read_lock (g_lock);
         
-        peer = koibnal_find_peer_locked (nid);
+        peer = kibnal_find_peer_locked (nid);
         if (peer == NULL) {
                 read_unlock (g_lock);
                 tx->tx_status = -EHOSTUNREACH;
-                koibnal_tx_done (tx);
+                kibnal_tx_done (tx);
                 return;
         }
 
-        conn = koibnal_find_conn_locked (peer);
+        conn = kibnal_find_conn_locked (peer);
         if (conn != NULL) {
                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                        conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
@@ -1024,7 +1031,7 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
                 atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
                 read_unlock (g_lock);
                 
-                koibnal_queue_tx (tx, conn);
+                kibnal_queue_tx (tx, conn);
                 return;
         }
         
@@ -1032,15 +1039,15 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
         read_unlock (g_lock);
         write_lock_irqsave (g_lock, flags);
 
-        peer = koibnal_find_peer_locked (nid);
+        peer = kibnal_find_peer_locked (nid);
         if (peer == NULL) {
                 write_unlock_irqrestore (g_lock, flags);
                 tx->tx_status = -EHOSTUNREACH;
-                koibnal_tx_done (tx);
+                kibnal_tx_done (tx);
                 return;
         }
 
-        conn = koibnal_find_conn_locked (peer);
+        conn = kibnal_find_conn_locked (peer);
         if (conn != NULL) {
                 /* Connection exists; queue message on it */
                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
@@ -1049,7 +1056,7 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
                 atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
                 write_unlock_irqrestore (g_lock, flags);
                 
-                koibnal_queue_tx (tx, conn);
+                kibnal_queue_tx (tx, conn);
                 return;
         }
 
@@ -1057,20 +1064,20 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
                 if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
                         write_unlock_irqrestore (g_lock, flags);
                         tx->tx_status = -EHOSTUNREACH;
-                        koibnal_tx_done (tx);
+                        kibnal_tx_done (tx);
                         return;
                 }
         
                 peer->ibp_connecting = 1;
                 atomic_inc (&peer->ibp_refcount); /* extra ref for connd */
         
-                spin_lock (&koibnal_data.koib_connd_lock);
+                spin_lock (&kibnal_data.kib_connd_lock);
         
                 list_add_tail (&peer->ibp_connd_list,
-                               &koibnal_data.koib_connd_peers);
-                wake_up (&koibnal_data.koib_connd_waitq);
+                               &kibnal_data.kib_connd_peers);
+                wake_up (&kibnal_data.kib_connd_waitq);
         
-                spin_unlock (&koibnal_data.koib_connd_lock);
+                spin_unlock (&kibnal_data.kib_connd_lock);
         }
         
         /* A connection is being established; queue the message... */
@@ -1080,49 +1087,49 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
 }
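
kibnal_launch_tx() uses the standard rwlock upgrade idiom: an rwlock
cannot be upgraded in place, so after dropping the read lock and taking
the write lock every lookup is repeated, because peers and connections
may have appeared or vanished in the window. The shape of the idiom,
with hypothetical names:

static rwlock_t example_lock = RW_LOCK_UNLOCKED;

static void *
example_lookup_locked (int key)
{
        return (NULL);                          /* stub for illustration */
}

static void
example_upgrade_pattern (int key)
{
        unsigned long flags;
        void         *obj;

        read_lock (&example_lock);
        obj = example_lookup_locked (key);      /* fast path: shared lock */
        read_unlock (&example_lock);

        if (obj != NULL)
                return;

        write_lock_irqsave (&example_lock, flags);
        /* must look up again: another thread may have created the
         * object between the unlock and the lock above */
        obj = example_lookup_locked (key);
        if (obj == NULL) {
                /* still absent: safe to create/queue while exclusive */
        }
        write_unlock_irqrestore (&example_lock, flags);
}
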
 
 ptl_err_t
-koibnal_start_passive_rdma (int type, ptl_nid_t nid,
+kibnal_start_passive_rdma (int type, ptl_nid_t nid,
                             lib_msg_t *libmsg, ptl_hdr_t *hdr)
 {
         int         nob = libmsg->md->length;
-        koib_tx_t  *tx;
-        koib_msg_t *oibmsg;
+        kib_tx_t   *tx;
+        kib_msg_t  *ibmsg;
         int         rc;
         int         access;
         
-        LASSERT (type == OPENIBNAL_MSG_PUT_RDMA || 
-                 type == OPENIBNAL_MSG_GET_RDMA);
+        LASSERT (type == IBNAL_MSG_PUT_RDMA || 
+                 type == IBNAL_MSG_GET_RDMA);
         LASSERT (nob > 0);
         LASSERT (!in_interrupt());              /* Mapping could block */
 
-        if (type == OPENIBNAL_MSG_PUT_RDMA) {
+        if (type == IBNAL_MSG_PUT_RDMA) {
                 access = IB_ACCESS_REMOTE_READ;
         } else {
                 access = IB_ACCESS_REMOTE_WRITE |
                          IB_ACCESS_LOCAL_WRITE;
         }
 
-        tx = koibnal_get_idle_tx (1);           /* May block; caller is an app thread */
+        tx = kibnal_get_idle_tx (1);           /* May block; caller is an app thread */
         LASSERT (tx != NULL);
 
         if ((libmsg->md->options & PTL_MD_KIOV) == 0) 
-                rc = koibnal_map_iov (tx, access,
-                                      libmsg->md->md_niov,
-                                      libmsg->md->md_iov.iov,
-                                      0, nob);
+                rc = kibnal_map_iov (tx, access,
+                                     libmsg->md->md_niov,
+                                     libmsg->md->md_iov.iov,
+                                     0, nob);
         else
-                rc = koibnal_map_kiov (tx, access,
-                                       libmsg->md->md_niov, 
-                                       libmsg->md->md_iov.kiov,
-                                       0, nob);
+                rc = kibnal_map_kiov (tx, access,
+                                      libmsg->md->md_niov, 
+                                      libmsg->md->md_iov.kiov,
+                                      0, nob);
 
         if (rc != 0) {
                 CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc);
                 goto failed;
         }
         
-        if (type == OPENIBNAL_MSG_GET_RDMA) {
+        if (type == IBNAL_MSG_GET_RDMA) {
                 /* reply gets finalized when tx completes */
-                tx->tx_libmsg[1] = lib_create_reply_msg(&koibnal_lib, 
+                tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, 
                                                         nid, libmsg);
                 if (tx->tx_libmsg[1] == NULL) {
                         CERROR ("Can't create reply for GET -> "LPX64"\n",
@@ -1134,15 +1141,15 @@ koibnal_start_passive_rdma (int type, ptl_nid_t nid,
         
         tx->tx_passive_rdma = 1;
 
-        oibmsg = tx->tx_msg;
+        ibmsg = tx->tx_msg;
 
-        oibmsg->oibm_u.rdma.oibrm_hdr = *hdr;
-        oibmsg->oibm_u.rdma.oibrm_cookie = tx->tx_passive_rdma_cookie;
-        oibmsg->oibm_u.rdma.oibrm_desc.rd_key = tx->tx_md.md_rkey;
-        oibmsg->oibm_u.rdma.oibrm_desc.rd_addr = tx->tx_md.md_addr;
-        oibmsg->oibm_u.rdma.oibrm_desc.rd_nob = nob;
+        ibmsg->ibm_u.rdma.ibrm_hdr = *hdr;
+        ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
+        ibmsg->ibm_u.rdma.ibrm_desc.rd_key = tx->tx_md.md_rkey;
+        ibmsg->ibm_u.rdma.ibrm_desc.rd_addr = tx->tx_md.md_addr;
+        ibmsg->ibm_u.rdma.ibrm_desc.rd_nob = nob;
 
-        koibnal_init_tx_msg (tx, type, sizeof (koib_rdma_msg_t));
+        kibnal_init_tx_msg (tx, type, sizeof (kib_rdma_msg_t));
 
         CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
                LPX64", nob %d\n",
@@ -1152,25 +1159,25 @@ koibnal_start_passive_rdma (int type, ptl_nid_t nid,
         /* libmsg gets finalized when tx completes. */
         tx->tx_libmsg[0] = libmsg;
 
-        koibnal_launch_tx(tx, nid);
+        kibnal_launch_tx(tx, nid);
         return (PTL_OK);
 
  failed:
         tx->tx_status = rc;
-        koibnal_tx_done (tx);
+        kibnal_tx_done (tx);
         return (PTL_FAIL);
 }
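
For reference, the complete passive RDMA exchange this function starts
(a summary of the surrounding code, not text from the patch):

/*
 *   initiator (passive side)               peer (active side)
 *   ------------------------               ------------------
 *   map buffer; get rd_key/rd_addr
 *   send PUT_RDMA or GET_RDMA
 *     { ibrm_cookie, ibrm_desc }   ---->   lib_parse() matches the hdr
 *                                          RDMA READ (PUT) / WRITE (GET)
 *                                          direct to/from the mapped buffer
 *   match ibcm_cookie against      <----   send PUT_DONE or GET_DONE
 *   ibc_active_txs; tx completes             { ibcm_cookie, ibcm_status }
 *   once it is also done sending
 */
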
 
 void
-koibnal_start_active_rdma (int type, int status,
-                           koib_rx_t *rx, lib_msg_t *libmsg, 
+kibnal_start_active_rdma (int type, int status,
+                           kib_rx_t *rx, lib_msg_t *libmsg, 
                            unsigned int niov,
                            struct iovec *iov, ptl_kiov_t *kiov,
                            size_t offset, size_t nob)
 {
-        koib_msg_t   *rxmsg = rx->rx_msg;
-        koib_msg_t   *txmsg;
-        koib_tx_t    *tx;
+        kib_msg_t    *rxmsg = rx->rx_msg;
+        kib_msg_t    *txmsg;
+        kib_tx_t     *tx;
         int           access;
         int           rdma_op;
         int           rc;
@@ -1187,8 +1194,8 @@ koibnal_start_active_rdma (int type, int status,
         /* No data if we're completing with failure */
         LASSERT (status == 0 || nob == 0);
 
-        LASSERT (type == OPENIBNAL_MSG_GET_DONE ||
-                 type == OPENIBNAL_MSG_PUT_DONE);
+        LASSERT (type == IBNAL_MSG_GET_DONE ||
+                 type == IBNAL_MSG_PUT_DONE);
 
         /* Flag I'm completing the RDMA.  Even if I fail to send the
          * completion message, I will have tried my best so further
@@ -1196,22 +1203,22 @@ koibnal_start_active_rdma (int type, int status,
         LASSERT (!rx->rx_rdma);
         rx->rx_rdma = 1;
 
-        if (type == OPENIBNAL_MSG_GET_DONE) {
+        if (type == IBNAL_MSG_GET_DONE) {
                 access   = 0;
                 rdma_op  = IB_OP_RDMA_WRITE;
-                LASSERT (rxmsg->oibm_type == OPENIBNAL_MSG_GET_RDMA);
+                LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);
         } else {
                 access   = IB_ACCESS_LOCAL_WRITE;
                 rdma_op  = IB_OP_RDMA_READ;
-                LASSERT (rxmsg->oibm_type == OPENIBNAL_MSG_PUT_RDMA);
+                LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
         }
 
-        tx = koibnal_get_idle_tx (0);           /* Mustn't block */
+        tx = kibnal_get_idle_tx (0);           /* Mustn't block */
         if (tx == NULL) {
                 CERROR ("tx descs exhausted on RDMA from "LPX64
                         " completing locally with failure\n",
-                         rx->rx_conn->ibc_peer->ibp_nid);
-                lib_finalize (&koibnal_lib, NULL, libmsg, PTL_NO_SPACE);
+                        rx->rx_conn->ibc_peer->ibp_nid);
+                lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE);
                 return;
         }
         LASSERT (tx->tx_nsp == 0);
@@ -1222,11 +1229,11 @@ koibnal_start_active_rdma (int type, int status,
                  * message is matched) */
 
                 if (kiov != NULL)
-                        rc = koibnal_map_kiov (tx, access,
-                                               niov, kiov, offset, nob);
+                        rc = kibnal_map_kiov (tx, access,
+                                              niov, kiov, offset, nob);
                 else
-                        rc = koibnal_map_iov (tx, access,
-                                              niov, iov, offset, nob);
+                        rc = kibnal_map_iov (tx, access,
+                                             niov, iov, offset, nob);
                 
                 if (rc != 0) {
                         CERROR ("Can't map RDMA -> "LPX64": %d\n", 
@@ -1242,12 +1249,12 @@ koibnal_start_active_rdma (int type, int status,
                         };
                 
                         tx->tx_sp[0] = (struct ib_send_param) {
-                                .work_request_id      = (__u64)((unsigned long)tx),
+                                .work_request_id      = kibnal_ptr2wreqid(tx, 0),
                                 .op                   = rdma_op,
                                 .gather_list          = &tx->tx_gl[0],
                                 .num_gather_entries   = 1,
-                                .remote_address       = rxmsg->oibm_u.rdma.oibrm_desc.rd_addr,
-                                .rkey                 = rxmsg->oibm_u.rdma.oibrm_desc.rd_key,
+                                .remote_address       = rxmsg->ibm_u.rdma.ibrm_desc.rd_addr,
+                                .rkey                 = rxmsg->ibm_u.rdma.ibrm_desc.rd_key,
                                 .device_specific      = NULL,
                                 .solicited_event      = 0,
                                 .signaled             = 1,
@@ -1262,10 +1269,10 @@ koibnal_start_active_rdma (int type, int status,
 
         txmsg = tx->tx_msg;
 
-        txmsg->oibm_u.completion.oibcm_cookie = rxmsg->oibm_u.rdma.oibrm_cookie;
-        txmsg->oibm_u.completion.oibcm_status = status;
+        txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
+        txmsg->ibm_u.completion.ibcm_status = status;
         
-        koibnal_init_tx_msg(tx, type, sizeof (koib_completion_msg_t));
+        kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
 
         if (status == 0 && nob != 0) {
                 LASSERT (tx->tx_nsp > 1);
@@ -1277,7 +1284,7 @@ koibnal_start_active_rdma (int type, int status,
                 LASSERT (tx->tx_nsp == 1);
                 /* No RDMA: local completion happens now! */
                 CDEBUG(D_WARNING,"No data: immediate completion\n");
-                lib_finalize (&koibnal_lib, NULL, libmsg,
+                lib_finalize (&kibnal_lib, NULL, libmsg,
                               status == 0 ? PTL_OK : PTL_FAIL);
         }
 
@@ -1288,11 +1295,11 @@ koibnal_start_active_rdma (int type, int status,
                atomic_read (&rx->rx_conn->ibc_refcount));
         atomic_inc (&rx->rx_conn->ibc_refcount);
         /* ...and queue it up */
-        koibnal_queue_tx(tx, rx->rx_conn);
+        kibnal_queue_tx(tx, rx->rx_conn);
 }
 
 ptl_err_t
-koibnal_sendmsg(lib_nal_t    *nal, 
+kibnal_sendmsg(lib_nal_t    *nal, 
                 void         *private,
                 lib_msg_t    *libmsg,
                 ptl_hdr_t    *hdr, 
@@ -1305,8 +1312,8 @@ koibnal_sendmsg(lib_nal_t    *nal,
                 size_t        payload_offset,
                 size_t        payload_nob)
 {
-        koib_msg_t *oibmsg;
-        koib_tx_t  *tx;
+        kib_msg_t  *ibmsg;
+        kib_tx_t   *tx;
         int         nob;
 
         /* NB 'private' is different depending on what we're sending.... */
@@ -1329,27 +1336,27 @@ koibnal_sendmsg(lib_nal_t    *nal,
                 
         case PTL_MSG_REPLY: {
                 /* reply's 'private' is the incoming receive */
-                koib_rx_t *rx = private;
+                kib_rx_t *rx = private;
 
                 /* RDMA reply expected? */
-                if (rx->rx_msg->oibm_type == OPENIBNAL_MSG_GET_RDMA) {
-                        koibnal_start_active_rdma(OPENIBNAL_MSG_GET_DONE, 0,
-                                                  rx, libmsg, payload_niov, 
-                                                  payload_iov, payload_kiov,
-                                                  payload_offset, payload_nob);
+                if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
+                        kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
+                                                 rx, libmsg, payload_niov, 
+                                                 payload_iov, payload_kiov,
+                                                 payload_offset, payload_nob);
                         return (PTL_OK);
                 }
                 
                 /* Incoming message consistent with immediate reply? */
-                if (rx->rx_msg->oibm_type != OPENIBNAL_MSG_IMMEDIATE) {
+                if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
                         CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n",
-                                nid, rx->rx_msg->oibm_type);
+                                nid, rx->rx_msg->ibm_type);
                         return (PTL_FAIL);
                 }
 
                 /* Will it fit in a message? */
-                nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[payload_nob]);
-                if (nob >= OPENIBNAL_MSG_SIZE) {
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+                if (nob >= IBNAL_MSG_SIZE) {
                         CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n", 
                                nid, payload_nob);
                         return (PTL_FAIL);
@@ -1359,10 +1366,10 @@ koibnal_sendmsg(lib_nal_t    *nal,
 
         case PTL_MSG_GET:
                 /* might the REPLY message be big enough to need RDMA? */
-                nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[libmsg->md->length]);
-                if (nob > OPENIBNAL_MSG_SIZE)
-                        return (koibnal_start_passive_rdma(OPENIBNAL_MSG_GET_RDMA, 
-                                                           nid, libmsg, hdr));
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
+                if (nob > IBNAL_MSG_SIZE)
+                        return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, 
+                                                          nid, libmsg, hdr));
                 break;
 
         case PTL_MSG_ACK:
@@ -1371,181 +1378,181 @@ koibnal_sendmsg(lib_nal_t    *nal,
 
         case PTL_MSG_PUT:
                 /* Is the payload big enough to need RDMA? */
-                nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[payload_nob]);
-                if (nob > OPENIBNAL_MSG_SIZE)
-                        return (koibnal_start_passive_rdma(OPENIBNAL_MSG_PUT_RDMA,
-                                                           nid, libmsg, hdr));
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+                if (nob > IBNAL_MSG_SIZE)
+                        return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
+                                                          nid, libmsg, hdr));
                 
                 break;
         }
 
-        tx = koibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
-                                   type == PTL_MSG_REPLY ||
-                                   in_interrupt()));
+        tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
+                                  type == PTL_MSG_REPLY ||
+                                  in_interrupt()));
         if (tx == NULL) {
                 CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", 
                         type, nid, in_interrupt() ? " (intr)" : "");
                 return (PTL_NO_SPACE);
         }
 
-        oibmsg = tx->tx_msg;
-        oibmsg->oibm_u.immediate.oibim_hdr = *hdr;
+        ibmsg = tx->tx_msg;
+        ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
 
         if (payload_nob > 0) {
                 if (payload_kiov != NULL)
-                        lib_copy_kiov2buf(oibmsg->oibm_u.immediate.oibim_payload,
+                        lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
                                           payload_niov, payload_kiov,
                                           payload_offset, payload_nob);
                 else
-                        lib_copy_iov2buf(oibmsg->oibm_u.immediate.oibim_payload,
+                        lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
                                          payload_niov, payload_iov,
                                          payload_offset, payload_nob);
         }
 
-        koibnal_init_tx_msg (tx, OPENIBNAL_MSG_IMMEDIATE,
-                             offsetof(koib_immediate_msg_t, 
-                                      oibim_payload[payload_nob]));
+        kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
+                            offsetof(kib_immediate_msg_t, 
+                                     ibim_payload[payload_nob]));
 
         /* libmsg gets finalized when tx completes */
         tx->tx_libmsg[0] = libmsg;
 
-        koibnal_launch_tx(tx, nid);
+        kibnal_launch_tx(tx, nid);
         return (PTL_OK);
 }
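
The PUT, GET and REPLY cases above all make the same size decision; in
isolation (hypothetical helper built from the real message layout):

static int
example_payload_needs_rdma (size_t payload_nob)
{
        /* header plus inline payload must fit in one pre-posted
         * IBNAL_MSG_SIZE receive buffer to travel as IMMEDIATE */
        int nob = offsetof (kib_msg_t,
                            ibm_u.immediate.ibim_payload[payload_nob]);

        return (nob > IBNAL_MSG_SIZE);
}
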
 
 ptl_err_t
-koibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
+kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
                ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
                unsigned int payload_niov, struct iovec *payload_iov,
                size_t payload_offset, size_t payload_len)
 {
-        return (koibnal_sendmsg(nal, private, cookie,
-                                 hdr, type, nid, pid,
-                                 payload_niov, payload_iov, NULL,
-                                 payload_offset, payload_len));
+        return (kibnal_sendmsg(nal, private, cookie,
+                               hdr, type, nid, pid,
+                               payload_niov, payload_iov, NULL,
+                               payload_offset, payload_len));
 }
 
 ptl_err_t
-koibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
+kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
                      ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
                      unsigned int payload_niov, ptl_kiov_t *payload_kiov, 
                      size_t payload_offset, size_t payload_len)
 {
-        return (koibnal_sendmsg(nal, private, cookie,
-                                 hdr, type, nid, pid,
-                                 payload_niov, NULL, payload_kiov,
-                                 payload_offset, payload_len));
+        return (kibnal_sendmsg(nal, private, cookie,
+                               hdr, type, nid, pid,
+                               payload_niov, NULL, payload_kiov,
+                               payload_offset, payload_len));
 }
 
 ptl_err_t
-koibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
+kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
                  unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
                  size_t offset, size_t mlen, size_t rlen)
 {
-        koib_rx_t                *rx = private;
-        koib_msg_t               *rxmsg = rx->rx_msg;
-        int                       msg_nob;
+        kib_rx_t    *rx = private;
+        kib_msg_t   *rxmsg = rx->rx_msg;
+        int          msg_nob;
         
         LASSERT (mlen <= rlen);
         LASSERT (!in_interrupt ());
         /* Either all pages or all vaddrs */
         LASSERT (!(kiov != NULL && iov != NULL));
 
-        switch (rxmsg->oibm_type) {
+        switch (rxmsg->ibm_type) {
         default:
                 LBUG();
                 return (PTL_FAIL);
                 
-        case OPENIBNAL_MSG_IMMEDIATE:
-                msg_nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[rlen]);
-                if (msg_nob > OPENIBNAL_MSG_SIZE) {
+        case IBNAL_MSG_IMMEDIATE:
+                msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+                if (msg_nob > IBNAL_MSG_SIZE) {
                         CERROR ("Immediate message from "LPX64" too big: %d\n",
-                                rxmsg->oibm_u.immediate.oibim_hdr.src_nid, rlen);
+                                rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
                         return (PTL_FAIL);
                 }
 
                 if (kiov != NULL)
                         lib_copy_buf2kiov(niov, kiov, offset,
-                                          rxmsg->oibm_u.immediate.oibim_payload,
+                                          rxmsg->ibm_u.immediate.ibim_payload,
                                           mlen);
                 else
                         lib_copy_buf2iov(niov, iov, offset,
-                                         rxmsg->oibm_u.immediate.oibim_payload,
+                                         rxmsg->ibm_u.immediate.ibim_payload,
                                          mlen);
 
                 lib_finalize (nal, NULL, libmsg, PTL_OK);
                 return (PTL_OK);
 
-        case OPENIBNAL_MSG_GET_RDMA:
+        case IBNAL_MSG_GET_RDMA:
                 /* We get called here just to discard any junk after the
                  * GET hdr. */
                 LASSERT (libmsg == NULL);
                 lib_finalize (nal, NULL, libmsg, PTL_OK);
                 return (PTL_OK);
 
-        case OPENIBNAL_MSG_PUT_RDMA:
-                koibnal_start_active_rdma (OPENIBNAL_MSG_PUT_DONE, 0,
-                                           rx, libmsg, 
-                                           niov, iov, kiov, offset, mlen);
+        case IBNAL_MSG_PUT_RDMA:
+                kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0,
+                                          rx, libmsg, 
+                                          niov, iov, kiov, offset, mlen);
                 return (PTL_OK);
         }
 }
 
 ptl_err_t
-koibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
+kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
               unsigned int niov, struct iovec *iov, 
               size_t offset, size_t mlen, size_t rlen)
 {
-        return (koibnal_recvmsg (nal, private, msg, niov, iov, NULL,
-                                 offset, mlen, rlen));
+        return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
+                                offset, mlen, rlen));
 }
 
 ptl_err_t
-koibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
+kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
                      unsigned int niov, ptl_kiov_t *kiov, 
                      size_t offset, size_t mlen, size_t rlen)
 {
-        return (koibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
-                                 offset, mlen, rlen));
+        return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
+                                offset, mlen, rlen));
 }
 
 int
-koibnal_thread_start (int (*fn)(void *arg), void *arg)
+kibnal_thread_start (int (*fn)(void *arg), void *arg)
 {
         long    pid = kernel_thread (fn, arg, 0);
 
         if (pid < 0)
                 return ((int)pid);
 
-        atomic_inc (&koibnal_data.koib_nthreads);
+        atomic_inc (&kibnal_data.kib_nthreads);
         return (0);
 }
 
 void
-koibnal_thread_fini (void)
+kibnal_thread_fini (void)
 {
-        atomic_dec (&koibnal_data.koib_nthreads);
+        atomic_dec (&kibnal_data.kib_nthreads);
 }
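
kibnal_thread_start()/kibnal_thread_fini() bracket each daemon's lifetime with the kib_nthreads counter, which is what shutdown waits on. A minimal sketch of a spawn path using these helpers; the real one lives in the module setup code outside this hunk, and `nscheds` is an assumed name with an illustrative value:

        int     rc;
        int     i;
        int     nscheds = 4;                    /* assumed count, illustrative */

        /* Sketch only: spawn the connd plus a few schedulers.  Every
         * successful start bumps kib_nthreads; every thread drops it
         * again via kibnal_thread_fini() on exit. */
        rc = kibnal_thread_start (kibnal_connd, NULL);
        if (rc != 0)
                CERROR ("Can't spawn connd: %d\n", rc);

        for (i = 0; i < nscheds; i++) {
                rc = kibnal_thread_start (kibnal_scheduler, (void *)((long)i));
                if (rc != 0)
                        CERROR ("Can't spawn scheduler[%d]: %d\n", i, rc);
        }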
 
 void
-koibnal_close_conn_locked (koib_conn_t *conn, int error)
+kibnal_close_conn_locked (kib_conn_t *conn, int error)
 {
         /* This just does the immediate housekeeping, and schedules the
          * connection for the connd to finish off.
-         * Caller holds koib_global_lock exclusively in irq context */
-        koib_peer_t   *peer = conn->ibc_peer;
+         * Caller holds kib_global_lock exclusively in irq context */
+        kib_peer_t   *peer = conn->ibc_peer;
 
         CDEBUG (error == 0 ? D_NET : D_ERROR,
                 "closing conn to "LPX64": error %d\n", peer->ibp_nid, error);
         
-        LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED ||
-                 conn->ibc_state == OPENIBNAL_CONN_CONNECTING);
+        LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED ||
+                 conn->ibc_state == IBNAL_CONN_CONNECTING);
 
-        if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) {
-                /* koib_connd_conns takes ibc_list's ref */
+        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+                /* kib_connd_conns takes ibc_list's ref */
                 list_del (&conn->ibc_list);
         } else {
-                /* new ref for koib_connd_conns */
+                /* new ref for kib_connd_conns */
                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                        conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
                        atomic_read (&conn->ibc_refcount));
@@ -1555,57 +1562,57 @@ koibnal_close_conn_locked (koib_conn_t *conn, int error)
         if (list_empty (&peer->ibp_conns) &&
             peer->ibp_persistence == 0) {
                 /* Non-persistent peer with no more conns... */
-                koibnal_unlink_peer_locked (peer);
+                kibnal_unlink_peer_locked (peer);
         }
 
-        conn->ibc_state = OPENIBNAL_CONN_DEATHROW;
+        conn->ibc_state = IBNAL_CONN_DEATHROW;
 
         /* Schedule conn for closing/destruction */
-        spin_lock (&koibnal_data.koib_connd_lock);
+        spin_lock (&kibnal_data.kib_connd_lock);
 
-        list_add_tail (&conn->ibc_list, &koibnal_data.koib_connd_conns);
-        wake_up (&koibnal_data.koib_connd_waitq);
+        list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+        wake_up (&kibnal_data.kib_connd_waitq);
                 
-        spin_unlock (&koibnal_data.koib_connd_lock);
+        spin_unlock (&kibnal_data.kib_connd_lock);
 }
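
For orientation, the states referenced here form a small lifecycle; this diagram is inferred from the LASSERTs and assignments in this file, not taken from the patch itself:

        /*
         *  IBNAL_CONN_INIT_QP --> IBNAL_CONN_CONNECTING --> IBNAL_CONN_ESTABLISHED
         *       |                        |                        |
         *       | connreq failed        +------------+-----------+
         *       | before the CM                      |  kibnal_close_conn_locked()
         *       | was engaged                        v
         *       |                          IBNAL_CONN_DEATHROW   (queued for connd)
         *       |                                    |  kibnal_terminate_conn()
         *       v                                    v
         *  IBNAL_CONN_ZOMBIE <-----------------------+   (destroyed on last ref)
         */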
 
 int
-koibnal_close_conn (koib_conn_t *conn, int why)
+kibnal_close_conn (kib_conn_t *conn, int why)
 {
         unsigned long     flags;
         int               count = 0;
 
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
-        LASSERT (conn->ibc_state >= OPENIBNAL_CONN_CONNECTING);
+        LASSERT (conn->ibc_state >= IBNAL_CONN_CONNECTING);
         
-        if (conn->ibc_state <= OPENIBNAL_CONN_ESTABLISHED) {
+        if (conn->ibc_state <= IBNAL_CONN_ESTABLISHED) {
                 count = 1;
-                koibnal_close_conn_locked (conn, why);
+                kibnal_close_conn_locked (conn, why);
         }
         
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
         return (count);
 }
 
 void
-koibnal_peer_connect_failed (koib_peer_t *peer, int active, int rc)
+kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
 {
         LIST_HEAD        (zombies);
-        koib_tx_t        *tx;
+        kib_tx_t         *tx;
         unsigned long     flags;
 
         LASSERT (rc != 0);
-        LASSERT (peer->ibp_reconnect_interval >= OPENIBNAL_MIN_RECONNECT_INTERVAL);
+        LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
 
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
         LASSERT (peer->ibp_connecting != 0);
         peer->ibp_connecting--;
 
         if (peer->ibp_connecting != 0) {
                 /* another connection attempt under way (loopback?)... */
-                write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
                 return;
         }
 
@@ -1614,50 +1621,50 @@ koibnal_peer_connect_failed (koib_peer_t *peer, int active, int rc)
                 peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
                 /* Increase reconnection interval */
                 peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
-                                                    OPENIBNAL_MAX_RECONNECT_INTERVAL);
+                                                    IBNAL_MAX_RECONNECT_INTERVAL);
         
                /* Take peer's blocked transmits; I'll complete
                  * them with error */
                 while (!list_empty (&peer->ibp_tx_queue)) {
                         tx = list_entry (peer->ibp_tx_queue.next,
-                                         koib_tx_t, tx_list);
+                                         kib_tx_t, tx_list);
                         
                         list_del (&tx->tx_list);
                         list_add_tail (&tx->tx_list, &zombies);
                 }
                 
-                if (koibnal_peer_active(peer) &&
+                if (kibnal_peer_active(peer) &&
                     (peer->ibp_persistence == 0)) {
                         /* failed connection attempt on non-persistent peer */
-                        koibnal_unlink_peer_locked (peer);
+                        kibnal_unlink_peer_locked (peer);
                 }
         } else {
                 /* Can't have blocked transmits if there are connections */
                 LASSERT (list_empty(&peer->ibp_tx_queue));
         }
         
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
         if (!list_empty (&zombies))
                 CERROR ("Deleting messages for "LPX64": connection failed\n",
                         peer->ibp_nid);
 
         while (!list_empty (&zombies)) {
-                tx = list_entry (zombies.next, koib_tx_t, tx_list);
+                tx = list_entry (zombies.next, kib_tx_t, tx_list);
 
                 list_del (&tx->tx_list);
                 /* complete now */
                 tx->tx_status = -EHOSTUNREACH;
-                koibnal_tx_done (tx);
+                kibnal_tx_done (tx);
         }
 }
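
The interval handling above is plain exponential backoff. As a worked instance, with an assumed 1-second minimum (the real bounds are IBNAL_MIN_RECONNECT_INTERVAL and IBNAL_MAX_RECONNECT_INTERVAL, defined elsewhere): successive failed attempts are retried after 1, 2, 4, 8, ... seconds until the MAX cap is reached, and kibnal_connreq_done() resets the interval to the minimum again as soon as a connection is established.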
 
 void
-koibnal_connreq_done (koib_conn_t *conn, int active, int status)
+kibnal_connreq_done (kib_conn_t *conn, int active, int status)
 {
         int               state = conn->ibc_state;
-        koib_peer_t      *peer = conn->ibc_peer;
-        koib_tx_t        *tx;
+        kib_peer_t       *peer = conn->ibc_peer;
+        kib_tx_t         *tx;
         unsigned long     flags;
         int               rc;
         int               i;
@@ -1669,31 +1676,31 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status)
                 conn->ibc_connreq = NULL;
         }
 
-        if (state == OPENIBNAL_CONN_CONNECTING) {
+        if (state == IBNAL_CONN_CONNECTING) {
                 /* Install common (active/passive) callback for
                  * disconnect/idle notification if I got as far as getting
                  * a CM comm_id */
                 rc = tsIbCmCallbackModify(conn->ibc_comm_id, 
-                                          koibnal_conn_callback, conn);
+                                          kibnal_conn_callback, conn);
                 LASSERT (rc == 0);
         }
         
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
         LASSERT (peer->ibp_connecting != 0);
         
         if (status == 0) {                         
                 /* connection established... */
-                LASSERT (state == OPENIBNAL_CONN_CONNECTING);
-                conn->ibc_state = OPENIBNAL_CONN_ESTABLISHED;
+                LASSERT (state == IBNAL_CONN_CONNECTING);
+                conn->ibc_state = IBNAL_CONN_ESTABLISHED;
 
-                if (!koibnal_peer_active(peer)) {
+                if (!kibnal_peer_active(peer)) {
                         /* ...but peer deleted meantime */
                         status = -ECONNABORTED;
                 }
         } else {
-                LASSERT (state == OPENIBNAL_CONN_INIT_QP ||
-                         state == OPENIBNAL_CONN_CONNECTING);
+                LASSERT (state == IBNAL_CONN_INIT_QP ||
+                         state == IBNAL_CONN_CONNECTING);
         }
 
         if (status == 0) {
@@ -1710,14 +1717,14 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status)
                 list_add (&conn->ibc_list, &peer->ibp_conns);
                 
                 /* reset reconnect interval for next attempt */
-                peer->ibp_reconnect_interval = OPENIBNAL_MIN_RECONNECT_INTERVAL;
+                peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
 
                 /* post blocked sends to the new connection */
                 spin_lock (&conn->ibc_lock);
                 
                 while (!list_empty (&peer->ibp_tx_queue)) {
                         tx = list_entry (peer->ibp_tx_queue.next, 
-                                         koib_tx_t, tx_list);
+                                         kib_tx_t, tx_list);
                         
                         list_del (&tx->tx_list);
 
@@ -1726,19 +1733,19 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status)
                                conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
                                atomic_read (&conn->ibc_refcount));
                         atomic_inc (&conn->ibc_refcount);
-                        koibnal_queue_tx_locked (tx, conn);
+                        kibnal_queue_tx_locked (tx, conn);
                 }
                 
                 spin_unlock (&conn->ibc_lock);
 
                 /* Nuke any dangling conns from a different peer instance... */
-                koibnal_close_stale_conns_locked (conn->ibc_peer,
-                                                  conn->ibc_incarnation);
+                kibnal_close_stale_conns_locked (conn->ibc_peer,
+                                                 conn->ibc_incarnation);
 
-                write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
                 /* queue up all the receives */
-                for (i = 0; i < OPENIBNAL_RX_MSGS; i++) {
+                for (i = 0; i < IBNAL_RX_MSGS; i++) {
                         /* +1 ref for rx desc */
                         CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                                conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
@@ -1749,71 +1756,71 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status)
                                i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg,
                                conn->ibc_rxs[i].rx_vaddr);
 
-                        koibnal_post_rx (&conn->ibc_rxs[i], 0);
+                        kibnal_post_rx (&conn->ibc_rxs[i], 0);
                 }
 
-                koibnal_check_sends (conn);
+                kibnal_check_sends (conn);
                 return;
         }
 
         /* connection failed */
-        if (state == OPENIBNAL_CONN_CONNECTING) {
+        if (state == IBNAL_CONN_CONNECTING) {
                 /* schedule for connd to close */
-                koibnal_close_conn_locked (conn, status);
+                kibnal_close_conn_locked (conn, status);
         } else {
                 /* Don't have a CM comm_id; just wait for refs to drain */
-                conn->ibc_state = OPENIBNAL_CONN_ZOMBIE;
+                conn->ibc_state = IBNAL_CONN_ZOMBIE;
         } 
 
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
-        koibnal_peer_connect_failed (conn->ibc_peer, active, status);
+        kibnal_peer_connect_failed (conn->ibc_peer, active, status);
 
-        if (state != OPENIBNAL_CONN_CONNECTING) {
+        if (state != IBNAL_CONN_CONNECTING) {
                 /* drop caller's ref if we're not waiting for the
                  * IB_CM_IDLE callback */
-                koibnal_put_conn (conn);
+                kibnal_put_conn (conn);
         }
 }
 
 int
-koibnal_accept (koib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
+kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
                 ptl_nid_t nid, __u64 incarnation, int queue_depth)
 {
-        koib_conn_t   *conn = koibnal_create_conn();
-        koib_peer_t   *peer;
-        koib_peer_t   *peer2;
+        kib_conn_t    *conn = kibnal_create_conn();
+        kib_peer_t    *peer;
+        kib_peer_t    *peer2;
         unsigned long  flags;
 
         if (conn == NULL)
                 return (-ENOMEM);
 
-        if (queue_depth != OPENIBNAL_MSG_QUEUE_SIZE) {
+        if (queue_depth != IBNAL_MSG_QUEUE_SIZE) {
                 CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n",
-                       nid, queue_depth, OPENIBNAL_MSG_QUEUE_SIZE);
+                       nid, queue_depth, IBNAL_MSG_QUEUE_SIZE);
                 return (-EPROTO);
         }
         
         /* assume 'nid' is a new peer */
-        peer = koibnal_create_peer (nid);
+        peer = kibnal_create_peer (nid);
         if (peer == NULL) {
                CDEBUG(D_NET, "--conn[%p] state %d (%d)\n",
                       conn, conn->ibc_state,
                       atomic_read (&conn->ibc_refcount));
                 atomic_dec (&conn->ibc_refcount);
-                koibnal_destroy_conn(conn);
+                kibnal_destroy_conn(conn);
                 return (-ENOMEM);
         }
         
-        write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
-        peer2 = koibnal_find_peer_locked(nid);
+        peer2 = kibnal_find_peer_locked(nid);
         if (peer2 == NULL) {
                 /* peer table takes my ref on peer */
                 list_add_tail (&peer->ibp_list,
-                               koibnal_nid2peerlist(nid));
+                               kibnal_nid2peerlist(nid));
         } else {
-                koibnal_put_peer (peer);
+                kibnal_put_peer (peer);
                 peer = peer2;
         }
 
@@ -1821,20 +1828,20 @@ koibnal_accept (koib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
         atomic_inc (&peer->ibp_refcount);
         peer->ibp_connecting++;
 
-        write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
         conn->ibc_peer = peer;
-        conn->ibc_state = OPENIBNAL_CONN_CONNECTING;
+        conn->ibc_state = IBNAL_CONN_CONNECTING;
         conn->ibc_comm_id = cid;
         conn->ibc_incarnation = incarnation;
-        conn->ibc_credits = OPENIBNAL_MSG_QUEUE_SIZE;
+        conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
 
         *connp = conn;
         return (0);
 }
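
kibnal_accept() uses the usual optimistic-allocation idiom: create the peer before taking kib_global_lock (allocation can sleep), then re-look it up under the lock and either install the new peer or drop it in favour of one that raced in. A generic sketch of the pattern; every name below is illustrative, not from the patch:

        unsigned long  flags;
        obj_t         *obj, *obj2;              /* obj_t: illustrative type */

        obj = create_obj (key);                 /* may sleep: no lock held */
        write_lock_irqsave (&table_lock, flags);
        obj2 = find_obj_locked (key);
        if (obj2 == NULL) {
                insert_obj_locked (obj);        /* table takes my ref */
        } else {
                put_obj (obj);                  /* lost the race: drop mine */
                obj = obj2;
        }
        write_unlock_irqrestore (&table_lock, flags);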
 
 tTS_IB_CM_CALLBACK_RETURN
-koibnal_idle_conn_callback (tTS_IB_CM_EVENT event,
+kibnal_idle_conn_callback (tTS_IB_CM_EVENT event,
                             tTS_IB_CM_COMM_ID cid,
                             void *param,
                             void *arg)
@@ -1846,13 +1853,19 @@ koibnal_idle_conn_callback (tTS_IB_CM_EVENT event,
 }
 
 tTS_IB_CM_CALLBACK_RETURN
-koibnal_conn_callback (tTS_IB_CM_EVENT event,
+kibnal_conn_callback (tTS_IB_CM_EVENT event,
                        tTS_IB_CM_COMM_ID cid,
                        void *param,
                        void *arg)
 {
-        koib_conn_t *conn = arg;
-        int          rc;
+        kib_conn_t       *conn = arg;
+        LIST_HEAD        (zombies); 
+        struct list_head *tmp;
+        struct list_head *nxt;
+        kib_tx_t         *tx;
+        unsigned long     flags;
+        int               done;
+        int               rc;
 
         /* Established Connection Notifier */
 
@@ -1860,24 +1873,72 @@ koibnal_conn_callback (tTS_IB_CM_EVENT event,
         default:
                 CERROR("Connection %p -> "LPX64" ERROR %d\n",
                        conn, conn->ibc_peer->ibp_nid, event);
-                koibnal_close_conn (conn, -ECONNABORTED);
+                kibnal_close_conn (conn, -ECONNABORTED);
                 break;
                 
         case TS_IB_CM_DISCONNECTED:
                 CDEBUG(D_WARNING, "Connection %p -> "LPX64" DISCONNECTED.\n",
                        conn, conn->ibc_peer->ibp_nid);
-                koibnal_close_conn (conn, 0);
+                kibnal_close_conn (conn, 0);
                 break;
 
         case TS_IB_CM_IDLE:
                 CDEBUG(D_NET, "Connection %p -> "LPX64" IDLE.\n",
                        conn, conn->ibc_peer->ibp_nid);
-                koibnal_put_conn (conn);        /* Lose CM's ref */
+                kibnal_put_conn (conn);        /* Lose CM's ref */
 
                 /* LASSERT (no further callbacks) */
                 rc = tsIbCmCallbackModify(cid, 
-                                          koibnal_idle_conn_callback, conn);
+                                          kibnal_idle_conn_callback, conn);
                 LASSERT (rc == 0);
+
+                /* NB we wait until the connection has closed before
+                 * completing outstanding passive RDMAs so we can be sure
+                 * the network can't touch the mapped memory any more. */
+
+                spin_lock_irqsave (&conn->ibc_lock, flags);
+
+                /* grab passive RDMAs not waiting for the tx callback */
+                list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
+                        tx = list_entry (tmp, kib_tx_t, tx_list);
+
+                        LASSERT (tx->tx_passive_rdma ||
+                                 !tx->tx_passive_rdma_wait);
+
+                        LASSERT (tx->tx_passive_rdma_wait ||
+                                 tx->tx_sending != 0);
+
+                        /* still waiting for tx callback? */
+                        if (!tx->tx_passive_rdma_wait)
+                                continue;
+
+                        tx->tx_status = -ECONNABORTED;
+                        tx->tx_passive_rdma_wait = 0;
+                        done = (tx->tx_sending == 0);
+
+                        if (!done)
+                                continue;
+
+                        list_del (&tx->tx_list);
+                        list_add (&tx->tx_list, &zombies);
+                }
+
+                /* grab all blocked transmits */
+                list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
+                        tx = list_entry (tmp, kib_tx_t, tx_list);
+                        
+                        list_del (&tx->tx_list);
+                        list_add (&tx->tx_list, &zombies);
+                }
+                
+                spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+                while (!list_empty(&zombies)) {
+                        tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+                        list_del(&tx->tx_list);
+                        kibnal_tx_done (tx);
+                }
                 break;
         }
 
@@ -1885,12 +1946,12 @@ koibnal_conn_callback (tTS_IB_CM_EVENT event,
 }
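
The new TS_IB_CM_IDLE handling gathers every doomed tx onto the private `zombies` list while ibc_lock is held, and only calls kibnal_tx_done() once the lock is dropped. A generic sketch of this detach-then-complete idiom (list_splice_init() is the stock list.h helper; the other names are illustrative):

        LIST_HEAD     (mine);
        unsigned long  flags;
        item_t        *item;                    /* item_t: illustrative type */

        spin_lock_irqsave (&lock, flags);
        list_splice_init (&shared, &mine);      /* detach under the lock */
        spin_unlock_irqrestore (&lock, flags);

        while (!list_empty (&mine)) {           /* complete with no lock held */
                item = list_entry (mine.next, item_t, it_list);
                list_del (&item->it_list);
                complete_item (item);
        }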
 
 tTS_IB_CM_CALLBACK_RETURN
-koibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
+kibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
                                tTS_IB_CM_COMM_ID cid,
                                void *param,
                                void *arg)
 {
-        koib_conn_t *conn = arg;
+        kib_conn_t *conn = arg;
         int          rc;
         
         switch (event) {
@@ -1903,12 +1964,12 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
                 
                 CERROR ("Unexpected event %p -> "LPX64": %d\n", 
                         conn, conn->ibc_peer->ibp_nid, event);
-                koibnal_connreq_done (conn, 0, -ECONNABORTED);
+                kibnal_connreq_done (conn, 0, -ECONNABORTED);
                 break;
                 
         case TS_IB_CM_REQ_RECEIVED: {
                 struct ib_cm_req_received_param *req = param;
-                koib_wire_connreq_t             *wcr = req->remote_private_data;
+                kib_wire_connreq_t             *wcr = req->remote_private_data;
 
                 LASSERT (conn == NULL);
 
@@ -1920,23 +1981,23 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
                         return TS_IB_CM_CALLBACK_ABORT;
                 }
 
-                if (wcr->wcr_magic != cpu_to_le32(OPENIBNAL_MSG_MAGIC)) {
+                if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
                         CERROR ("Can't accept LID %04x: bad magic %08x\n",
                                 req->dlid, le32_to_cpu(wcr->wcr_magic));
                         return TS_IB_CM_CALLBACK_ABORT;
                 }
                 
-                if (wcr->wcr_version != cpu_to_le16(OPENIBNAL_MSG_VERSION)) {
+                if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
                         CERROR ("Can't accept LID %04x: bad version %d\n",
                                req->dlid, le16_to_cpu(wcr->wcr_version));
                         return TS_IB_CM_CALLBACK_ABORT;
                 }
                                 
-                rc = koibnal_accept(&conn,
-                                    cid,
-                                    le64_to_cpu(wcr->wcr_nid),
-                                    le64_to_cpu(wcr->wcr_incarnation),
-                                    le16_to_cpu(wcr->wcr_queue_depth));
+                rc = kibnal_accept(&conn,
+                                   cid,
+                                   le64_to_cpu(wcr->wcr_nid),
+                                   le64_to_cpu(wcr->wcr_incarnation),
+                                   le16_to_cpu(wcr->wcr_queue_depth));
                 if (rc != 0) {
                         CERROR ("Can't accept "LPX64": %d\n",
                                 le64_to_cpu(wcr->wcr_nid), rc);
@@ -1945,23 +2006,23 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
 
                 /* update 'arg' for next callback */
                 rc = tsIbCmCallbackModify(cid, 
-                                          koibnal_passive_conn_callback, conn);
+                                          kibnal_passive_conn_callback, conn);
                 LASSERT (rc == 0);
 
                 req->accept_param.qp                     = conn->ibc_qp;
-                *((koib_wire_connreq_t *)req->accept_param.reply_private_data)
-                        = (koib_wire_connreq_t) {
-                                .wcr_magic       = cpu_to_le32(OPENIBNAL_MSG_MAGIC),
-                                .wcr_version     = cpu_to_le16(OPENIBNAL_MSG_VERSION),
-                                .wcr_queue_depth = cpu_to_le32(OPENIBNAL_MSG_QUEUE_SIZE),
-                                .wcr_nid         = cpu_to_le64(koibnal_data.koib_nid),
-                                .wcr_incarnation = cpu_to_le64(koibnal_data.koib_incarnation),
+                *((kib_wire_connreq_t *)req->accept_param.reply_private_data)
+                        = (kib_wire_connreq_t) {
+                                .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
+                                .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
+                                .wcr_queue_depth = cpu_to_le32(IBNAL_MSG_QUEUE_SIZE),
+                                .wcr_nid         = cpu_to_le64(kibnal_data.kib_nid),
+                                .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
                         };
-                req->accept_param.reply_private_data_len = sizeof(koib_wire_connreq_t);
-                req->accept_param.responder_resources    = OPENIBNAL_RESPONDER_RESOURCES;
-                req->accept_param.initiator_depth        = OPENIBNAL_RESPONDER_RESOURCES;
-                req->accept_param.rnr_retry_count        = OPENIBNAL_RNR_RETRY;
-                req->accept_param.flow_control           = OPENIBNAL_FLOW_CONTROL;
+                req->accept_param.reply_private_data_len = sizeof(kib_wire_connreq_t);
+                req->accept_param.responder_resources    = IBNAL_RESPONDER_RESOURCES;
+                req->accept_param.initiator_depth        = IBNAL_RESPONDER_RESOURCES;
+                req->accept_param.rnr_retry_count        = IBNAL_RNR_RETRY;
+                req->accept_param.flow_control           = IBNAL_FLOW_CONTROL;
 
                 CDEBUG(D_NET, "Proceeding\n");
                 break;
@@ -1972,60 +2033,60 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
                 CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n",
                        conn, conn->ibc_peer->ibp_nid);
 
-                koibnal_connreq_done (conn, 0, 0);
+                kibnal_connreq_done (conn, 0, 0);
                 break;
         }
 
-        /* NB if the connreq is done, we switch to koibnal_conn_callback */
+        /* NB if the connreq is done, we switch to kibnal_conn_callback */
         return TS_IB_CM_CALLBACK_PROCEED;
 }
 
 tTS_IB_CM_CALLBACK_RETURN
-koibnal_active_conn_callback (tTS_IB_CM_EVENT event,
+kibnal_active_conn_callback (tTS_IB_CM_EVENT event,
                               tTS_IB_CM_COMM_ID cid,
                               void *param,
                               void *arg)
 {
-        koib_conn_t *conn = arg;
+        kib_conn_t *conn = arg;
 
         switch (event) {
         case TS_IB_CM_REP_RECEIVED: {
                 struct ib_cm_rep_received_param *rep = param;
-                koib_wire_connreq_t             *wcr = rep->remote_private_data;
+                kib_wire_connreq_t             *wcr = rep->remote_private_data;
 
                 if (rep->remote_private_data_len < sizeof (*wcr)) {
                         CERROR ("Short reply from "LPX64": %d\n",
                                 conn->ibc_peer->ibp_nid,
                                 rep->remote_private_data_len);
-                        koibnal_connreq_done (conn, 1, -EPROTO);
+                        kibnal_connreq_done (conn, 1, -EPROTO);
                         break;
                 }
 
-                if (wcr->wcr_magic != cpu_to_le32(OPENIBNAL_MSG_MAGIC)) {
+                if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
                         CERROR ("Can't connect "LPX64": bad magic %08x\n",
                                 conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic));
-                        koibnal_connreq_done (conn, 1, -EPROTO);
+                        kibnal_connreq_done (conn, 1, -EPROTO);
                         break;
                 }
                 
-                if (wcr->wcr_version != cpu_to_le16(OPENIBNAL_MSG_VERSION)) {
+                if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
                         CERROR ("Can't connect "LPX64": bad version %d\n",
                                conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_version));
-                        koibnal_connreq_done (conn, 1, -EPROTO);
+                        kibnal_connreq_done (conn, 1, -EPROTO);
                         break;
                 }
                                 
-                if (wcr->wcr_queue_depth != cpu_to_le16(OPENIBNAL_MSG_QUEUE_SIZE)) {
+                if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
                         CERROR ("Can't connect "LPX64": bad queue depth %d\n",
                                 conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_queue_depth));
-                        koibnal_connreq_done (conn, 1, -EPROTO);
+                        kibnal_connreq_done (conn, 1, -EPROTO);
                         break;
                 }
                                 
                 if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) {
                         CERROR ("Unexpected NID "LPX64" from "LPX64"\n",
                                 le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid);
-                        koibnal_connreq_done (conn, 1, -EPROTO);
+                        kibnal_connreq_done (conn, 1, -EPROTO);
                         break;
                 }
 
@@ -2033,7 +2094,7 @@ koibnal_active_conn_callback (tTS_IB_CM_EVENT event,
                        conn, conn->ibc_peer->ibp_nid);
 
                 conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation);
-                conn->ibc_credits = OPENIBNAL_MSG_QUEUE_SIZE;
+                conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
                 break;
         }
 
@@ -2041,86 +2102,86 @@ koibnal_active_conn_callback (tTS_IB_CM_EVENT event,
                 CDEBUG(D_WARNING, "Connection %p -> "LPX64" Established\n",
                        conn, conn->ibc_peer->ibp_nid);
 
-                koibnal_connreq_done (conn, 1, 0);
+                kibnal_connreq_done (conn, 1, 0);
                 break;
 
         case TS_IB_CM_IDLE:
                 CERROR("Connection %p -> "LPX64" IDLE\n",
                        conn, conn->ibc_peer->ibp_nid);
                 /* Back out state change: I'm disengaged from CM */
-                conn->ibc_state = OPENIBNAL_CONN_INIT_QP;
+                conn->ibc_state = IBNAL_CONN_INIT_QP;
                 
-                koibnal_connreq_done (conn, 1, -ECONNABORTED);
+                kibnal_connreq_done (conn, 1, -ECONNABORTED);
                 break;
 
         default:
                 CERROR("Connection %p -> "LPX64" ERROR %d\n",
                        conn, conn->ibc_peer->ibp_nid, event);
-                koibnal_connreq_done (conn, 1, -ECONNABORTED);
+                kibnal_connreq_done (conn, 1, -ECONNABORTED);
                 break;
         }
 
-        /* NB if the connreq is done, we switch to koibnal_conn_callback */
+        /* NB if the connreq is done, we switch to kibnal_conn_callback */
         return TS_IB_CM_CALLBACK_PROCEED;
 }
 
 int
-koibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
+kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
                           struct ib_path_record *resp, int remaining,
                           void *arg)
 {
-        koib_conn_t *conn = arg;
+        kib_conn_t *conn = arg;
         
         if (status != 0) {
                 CERROR ("status %d\n", status);
-                koibnal_connreq_done (conn, 1, status);
+                kibnal_connreq_done (conn, 1, status);
                 goto out;
         }
 
         conn->ibc_connreq->cr_path = *resp;
 
-        conn->ibc_connreq->cr_wcr = (koib_wire_connreq_t) {
-                .wcr_magic       = cpu_to_le32(OPENIBNAL_MSG_MAGIC),
-                .wcr_version     = cpu_to_le16(OPENIBNAL_MSG_VERSION),
-                .wcr_queue_depth = cpu_to_le16(OPENIBNAL_MSG_QUEUE_SIZE),
-                .wcr_nid         = cpu_to_le64(koibnal_data.koib_nid),
-                .wcr_incarnation = cpu_to_le64(koibnal_data.koib_incarnation),
+        conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) {
+                .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
+                .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
+                .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
+                .wcr_nid         = cpu_to_le64(kibnal_data.kib_nid),
+                .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
         };
 
         conn->ibc_connreq->cr_connparam = (struct ib_cm_active_param) {
                 .qp                   = conn->ibc_qp,
                 .req_private_data     = &conn->ibc_connreq->cr_wcr,
                 .req_private_data_len = sizeof(conn->ibc_connreq->cr_wcr),
-                .responder_resources  = OPENIBNAL_RESPONDER_RESOURCES,
-                .initiator_depth      = OPENIBNAL_RESPONDER_RESOURCES,
-                .retry_count          = OPENIBNAL_RETRY,
-                .rnr_retry_count      = OPENIBNAL_RNR_RETRY,
-                .cm_response_timeout  = koibnal_tunables.koib_io_timeout,
-                .max_cm_retries       = OPENIBNAL_CM_RETRY,
-                .flow_control         = OPENIBNAL_FLOW_CONTROL,
+                .responder_resources  = IBNAL_RESPONDER_RESOURCES,
+                .initiator_depth      = IBNAL_RESPONDER_RESOURCES,
+                .retry_count          = IBNAL_RETRY,
+                .rnr_retry_count      = IBNAL_RNR_RETRY,
+                .cm_response_timeout  = kibnal_tunables.kib_io_timeout,
+                .max_cm_retries       = IBNAL_CM_RETRY,
+                .flow_control         = IBNAL_FLOW_CONTROL,
         };
 
         /* XXX set timeout just like SDP!!! */
         conn->ibc_connreq->cr_path.packet_life = 13;
         
         /* Flag I'm getting involved with the CM... */
-        conn->ibc_state = OPENIBNAL_CONN_CONNECTING;
+        conn->ibc_state = IBNAL_CONN_CONNECTING;
 
         CDEBUG(D_NET, "Connecting to service id "LPX64", on "LPX64"\n",
                conn->ibc_connreq->cr_service.service_id, 
-               *koibnal_service_nid_field(&conn->ibc_connreq->cr_service));
+               *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
 
-        /* koibnal_connect_callback gets my conn ref */
+        /* kibnal_connect_callback gets my conn ref */
         status = ib_cm_connect (&conn->ibc_connreq->cr_connparam, 
                                 &conn->ibc_connreq->cr_path, NULL,
                                 conn->ibc_connreq->cr_service.service_id, 0,
-                                koibnal_active_conn_callback, conn,
+                                kibnal_active_conn_callback, conn,
                                 &conn->ibc_comm_id);
         if (status != 0) {
                 CERROR ("Connect: %d\n", status);
                 /* Back out state change: I've not got a CM comm_id yet... */
-                conn->ibc_state = OPENIBNAL_CONN_INIT_QP;
-                koibnal_connreq_done (conn, 1, status);
+                conn->ibc_state = IBNAL_CONN_INIT_QP;
+                kibnal_connreq_done (conn, 1, status);
         }
         
  out:
@@ -2129,58 +2190,58 @@ koibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
 }
 
 void
-koibnal_service_get_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
-                              struct ib_common_attrib_service *resp, void *arg)
+kibnal_service_get_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
+                             struct ib_common_attrib_service *resp, void *arg)
 {
-        koib_conn_t *conn = arg;
+        kib_conn_t *conn = arg;
         
         if (status != 0) {
                 CERROR ("status %d\n", status);
-                koibnal_connreq_done (conn, 1, status);
+                kibnal_connreq_done (conn, 1, status);
                 return;
         }
 
         CDEBUG(D_NET, "Got status %d, service id "LPX64", on "LPX64"\n",
                status, resp->service_id, 
-               *koibnal_service_nid_field(resp));
+               *kibnal_service_nid_field(resp));
 
         conn->ibc_connreq->cr_service = *resp;
 
-        status = ib_cached_gid_get(koibnal_data.koib_device,
-                                   koibnal_data.koib_port, 0,
+        status = ib_cached_gid_get(kibnal_data.kib_device,
+                                   kibnal_data.kib_port, 0,
                                    conn->ibc_connreq->cr_gid);
         LASSERT (status == 0);
 
-        /* koibnal_pathreq_callback gets my conn ref */
-        status = tsIbPathRecordRequest (koibnal_data.koib_device,
-                                        koibnal_data.koib_port,
+        /* kibnal_pathreq_callback gets my conn ref */
+        status = tsIbPathRecordRequest (kibnal_data.kib_device,
+                                        kibnal_data.kib_port,
                                         conn->ibc_connreq->cr_gid,
                                         conn->ibc_connreq->cr_service.service_gid,
                                         conn->ibc_connreq->cr_service.service_pkey,
                                         0,
-                                        koibnal_tunables.koib_io_timeout * HZ,
+                                        kibnal_tunables.kib_io_timeout * HZ,
                                         0,
-                                        koibnal_pathreq_callback, conn, 
+                                        kibnal_pathreq_callback, conn, 
                                         &conn->ibc_connreq->cr_tid);
 
         if (status == 0)
                 return;
 
         CERROR ("Path record request: %d\n", status);
-        koibnal_connreq_done (conn, 1, status);
+        kibnal_connreq_done (conn, 1, status);
 }
 
 void
-koibnal_connect_peer (koib_peer_t *peer)
+kibnal_connect_peer (kib_peer_t *peer)
 {
-        koib_conn_t *conn = koibnal_create_conn();
+        kib_conn_t  *conn = kibnal_create_conn();
         int          rc;
 
         LASSERT (peer->ibp_connecting != 0);
 
         if (conn == NULL) {
                 CERROR ("Can't allocate conn\n");
-                koibnal_peer_connect_failed (peer, 1, -ENOMEM);
+                kibnal_peer_connect_failed (peer, 1, -ENOMEM);
                 return;
         }
 
@@ -2190,85 +2251,101 @@ koibnal_connect_peer (koib_peer_t *peer)
         PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
         if (conn->ibc_connreq == NULL) {
                 CERROR ("Can't allocate connreq\n");
-                koibnal_connreq_done (conn, 1, -ENOMEM);
+                kibnal_connreq_done (conn, 1, -ENOMEM);
                 return;
         }
 
         memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq));
 
-        koibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);
+        kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);
 
-        /* koibnal_service_get_callback gets my conn ref */
-        rc = ib_service_get (koibnal_data.koib_device, 
-                             koibnal_data.koib_port,
+        /* kibnal_service_get_callback gets my conn ref */
+        rc = ib_service_get (kibnal_data.kib_device, 
+                             kibnal_data.kib_port,
                              &conn->ibc_connreq->cr_service,
-                             KOIBNAL_SERVICE_KEY_MASK,
-                             koibnal_tunables.koib_io_timeout * HZ,
-                             koibnal_service_get_callback, conn, 
+                             KIBNAL_SERVICE_KEY_MASK,
+                             kibnal_tunables.kib_io_timeout * HZ,
+                             kibnal_service_get_callback, conn, 
                              &conn->ibc_connreq->cr_tid);
         
         if (rc == 0)
                 return;
 
         CERROR ("ib_service_get: %d\n", rc);
-        koibnal_connreq_done (conn, 1, rc);
+        kibnal_connreq_done (conn, 1, rc);
 }
 
 int
-koibnal_conn_timed_out (koib_conn_t *conn)
+kibnal_conn_timed_out (kib_conn_t *conn)
 {
-        koib_tx_t         *tx;
+        kib_tx_t          *tx;
         struct list_head  *ttmp;
         unsigned long      flags;
-        int                rc = 0;
 
         spin_lock_irqsave (&conn->ibc_lock, flags);
 
-        list_for_each (ttmp, &conn->ibc_rdma_queue) {
-                tx = list_entry (ttmp, koib_tx_t, tx_list);
+        list_for_each (ttmp, &conn->ibc_tx_queue) {
+                tx = list_entry (ttmp, kib_tx_t, tx_list);
 
-                LASSERT (tx->tx_passive_rdma);
-                LASSERT (tx->tx_passive_rdma_wait);
+                LASSERT (!tx->tx_passive_rdma_wait);
+                LASSERT (tx->tx_sending == 0);
 
-                if (time_after_eq (jiffies, tx->tx_passive_rdma_deadline)) {
-                        rc = 1;
-                        break;
+                if (time_after_eq (jiffies, tx->tx_deadline)) {
+                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        return 1;
                 }
         }
+
+        list_for_each (ttmp, &conn->ibc_active_txs) {
+                tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+                LASSERT (tx->tx_passive_rdma ||
+                         !tx->tx_passive_rdma_wait);
+
+                LASSERT (tx->tx_passive_rdma_wait ||
+                         tx->tx_sending != 0);
+
+                if (time_after_eq (jiffies, tx->tx_deadline)) {
+                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        return 1;
+                }
+        }
+
         spin_unlock_irqrestore (&conn->ibc_lock, flags);
 
-        return rc;
+        return 0;
 }
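
This rewrite reflects the patch's queue split: ibc_tx_queue now holds only sends that have not been posted yet (hence tx_sending == 0 and no passive-rdma wait), ibc_active_txs holds everything in flight, and both are timed against a single tx_deadline rather than the old passive-RDMA-only deadline. The early returns also unlock ibc_lock before reporting a timeout, instead of falling through with a flag.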
 
 void
-koibnal_check_conns (int idx)
+kibnal_check_conns (int idx)
 {
-        struct list_head  *peers = &koibnal_data.koib_peers[idx];
+        struct list_head  *peers = &kibnal_data.kib_peers[idx];
         struct list_head  *ptmp;
-        koib_peer_t       *peer;
-        koib_conn_t       *conn;
+        kib_peer_t        *peer;
+        kib_conn_t        *conn;
         struct list_head  *ctmp;
 
  again:
         /* NB. We expect to have a look at all the peers and not find any
          * rdmas to time out, so we just use a shared lock while we
          * take a look... */
-        read_lock (&koibnal_data.koib_global_lock);
+        read_lock (&kibnal_data.kib_global_lock);
 
         list_for_each (ptmp, peers) {
-                peer = list_entry (ptmp, koib_peer_t, ibp_list);
+                peer = list_entry (ptmp, kib_peer_t, ibp_list);
 
                 list_for_each (ctmp, &peer->ibp_conns) {
-                        conn = list_entry (ctmp, koib_conn_t, ibc_list);
+                        conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+                        LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
 
-                        LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED);
 
                         /* In case we have enough credits to return via a
                          * NOOP, but there were no non-blocking tx descs
                          * free to do it last time... */
-                        koibnal_check_sends(conn);
+                        kibnal_check_sends(conn);
 
-                        if (!koibnal_conn_timed_out(conn))
+                        if (!kibnal_conn_timed_out(conn))
                                 continue;
                         
                         CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
@@ -2276,108 +2353,76 @@ koibnal_check_conns (int idx)
                                atomic_read (&conn->ibc_refcount));
 
                         atomic_inc (&conn->ibc_refcount);
-                        read_unlock (&koibnal_data.koib_global_lock);
+                        read_unlock (&kibnal_data.kib_global_lock);
 
                         CERROR("Timed out RDMA with "LPX64"\n",
                                peer->ibp_nid);
 
-                        koibnal_close_conn (conn, -ETIMEDOUT);
-                        koibnal_put_conn (conn);
+                        kibnal_close_conn (conn, -ETIMEDOUT);
+                        kibnal_put_conn (conn);
 
                         /* start again now I've dropped the lock */
                         goto again;
                 }
         }
 
-        read_unlock (&koibnal_data.koib_global_lock);
+        read_unlock (&kibnal_data.kib_global_lock);
 }
 
 void
-koibnal_terminate_conn (koib_conn_t *conn)
+kibnal_terminate_conn (kib_conn_t *conn)
 {
-        unsigned long flags;
         int           rc;
-        int           done;
 
         CDEBUG(D_NET, "conn %p\n", conn);
-        LASSERT (conn->ibc_state == OPENIBNAL_CONN_DEATHROW);
-        conn->ibc_state = OPENIBNAL_CONN_ZOMBIE;
+        LASSERT (conn->ibc_state == IBNAL_CONN_DEATHROW);
+        conn->ibc_state = IBNAL_CONN_ZOMBIE;
 
         rc = ib_cm_disconnect (conn->ibc_comm_id);
         if (rc != 0)
                 CERROR ("Error %d disconnecting conn %p -> "LPX64"\n",
                         rc, conn, conn->ibc_peer->ibp_nid);
-
-        /* complete blocked passive RDMAs */
-        spin_lock_irqsave (&conn->ibc_lock, flags);
-        
-        while (!list_empty (&conn->ibc_rdma_queue)) {
-                koib_tx_t *tx = list_entry (conn->ibc_rdma_queue.next,
-                                            koib_tx_t, tx_list);
-
-                LASSERT (tx->tx_passive_rdma);
-                LASSERT (tx->tx_passive_rdma_wait);
-                
-                list_del (&tx->tx_list);
-
-                tx->tx_passive_rdma_wait = 0;
-                done = (tx->tx_sending == 0);
-                
-                tx->tx_status = -ECONNABORTED;
-
-                spin_unlock_irqrestore (&conn->ibc_lock, flags);
-
-                if (done)
-                        koibnal_tx_done (tx);
-
-                spin_lock_irqsave (&conn->ibc_lock, flags);
-        }
-        
-        spin_unlock_irqrestore (&conn->ibc_lock, flags);
-
-        /* Complete all blocked transmits */
-        koibnal_check_sends(conn);
 }
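
Note that the drain loop removed here is not lost: this patch moves completion of blocked passive RDMAs into kibnal_conn_callback's TS_IB_CM_IDLE case earlier in this file, so mapped buffers are only finalized once the CM guarantees the network can no longer touch them.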
 
 int
-koibnal_connd (void *arg)
+kibnal_connd (void *arg)
 {
         wait_queue_t       wait;
         unsigned long      flags;
-        koib_conn_t       *conn;
-        koib_peer_t       *peer;
+        kib_conn_t        *conn;
+        kib_peer_t        *peer;
         int                timeout;
         int                i;
         int                peer_index = 0;
         unsigned long      deadline = jiffies;
         
-        kportal_daemonize ("koibnal_connd");
+        kportal_daemonize ("kibnal_connd");
         kportal_blockallsigs ();
 
         init_waitqueue_entry (&wait, current);
 
-        spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
 
         for (;;) {
-                if (!list_empty (&koibnal_data.koib_connd_conns)) {
-                        conn = list_entry (koibnal_data.koib_connd_conns.next,
-                                           koib_conn_t, ibc_list);
+                if (!list_empty (&kibnal_data.kib_connd_conns)) {
+                        conn = list_entry (kibnal_data.kib_connd_conns.next,
+                                           kib_conn_t, ibc_list);
                         list_del (&conn->ibc_list);
                         
-                        spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 
                         switch (conn->ibc_state) {
-                        case OPENIBNAL_CONN_DEATHROW:
+                        case IBNAL_CONN_DEATHROW:
                                 LASSERT (conn->ibc_comm_id != TS_IB_CM_COMM_ID_INVALID);
                                 /* Disconnect: conn becomes a zombie in the
                                  * callback and last ref reschedules it
                                  * here... */
-                                koibnal_terminate_conn(conn);
-                                koibnal_put_conn (conn);
+                                kibnal_terminate_conn(conn);
+                                kibnal_put_conn (conn);
                                 break;
                                 
-                        case OPENIBNAL_CONN_ZOMBIE:
-                                koibnal_destroy_conn (conn);
+                        case IBNAL_CONN_ZOMBIE:
+                                kibnal_destroy_conn (conn);
                                 break;
                                 
                         default:
@@ -2386,35 +2431,35 @@ koibnal_connd (void *arg)
                                 LBUG();
                         }
 
-                        spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
                         continue;
                 }
 
-                if (!list_empty (&koibnal_data.koib_connd_peers)) {
-                        peer = list_entry (koibnal_data.koib_connd_peers.next,
-                                           koib_peer_t, ibp_connd_list);
+                if (!list_empty (&kibnal_data.kib_connd_peers)) {
+                        peer = list_entry (kibnal_data.kib_connd_peers.next,
+                                           kib_peer_t, ibp_connd_list);
                         
                         list_del_init (&peer->ibp_connd_list);
-                        spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 
-                        koibnal_connect_peer (peer);
-                        koibnal_put_peer (peer);
+                        kibnal_connect_peer (peer);
+                        kibnal_put_peer (peer);
 
-                        spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
                 }
 
                 /* shut down and nobody left to reap... */
-                if (koibnal_data.koib_shutdown &&
-                    atomic_read(&koibnal_data.koib_nconns) == 0)
+                if (kibnal_data.kib_shutdown &&
+                    atomic_read(&kibnal_data.kib_nconns) == 0)
                         break;
 
-                spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+                spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 
                 /* careful with the jiffy wrap... */
                 while ((timeout = (int)(deadline - jiffies)) <= 0) {
                         const int n = 4;
                         const int p = 1;
-                        int       chunk = koibnal_data.koib_peer_hash_size;
+                        int       chunk = kibnal_data.kib_peer_hash_size;
                         
                         /* Time to check for RDMA timeouts on a few more
                          * peers: I do checks every 'p' seconds on a
@@ -2424,129 +2469,129 @@ koibnal_connd (void *arg)
                          * connection within (n+1)/n times the timeout
                          * interval. */
 
-                        if (koibnal_tunables.koib_io_timeout > n * p)
+                        if (kibnal_tunables.kib_io_timeout > n * p)
                                 chunk = (chunk * n * p) / 
-                                        koibnal_tunables.koib_io_timeout;
+                                        kibnal_tunables.kib_io_timeout;
                         if (chunk == 0)
                                 chunk = 1;
 
                         for (i = 0; i < chunk; i++) {
-                                koibnal_check_conns (peer_index);
+                                kibnal_check_conns (peer_index);
                                 peer_index = (peer_index + 1) % 
-                                             koibnal_data.koib_peer_hash_size;
+                                             kibnal_data.kib_peer_hash_size;
                         }
 
                         deadline += p * HZ;
                 }
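
A worked instance of the chunk arithmetic above, with assumed numbers (kib_peer_hash_size and kib_io_timeout are configured elsewhere; 101 buckets and a 50-second timeout are illustrative only):

        /* chunk = (hash_size * n * p) / io_timeout
         *       = (101 * 4 * 1) / 50  = 8 buckets per 1-second wakeup,
         * so the whole table is swept in ~13 wakeups (~13s), comfortably
         * inside the promised (n+1)/n * timeout bound of ~62.5s. */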
 
-                koibnal_data.koib_connd_waketime = jiffies + timeout;
+                kibnal_data.kib_connd_waketime = jiffies + timeout;
 
                 set_current_state (TASK_INTERRUPTIBLE);
-                add_wait_queue (&koibnal_data.koib_connd_waitq, &wait);
+                add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
 
-                if (!koibnal_data.koib_shutdown &&
-                    list_empty (&koibnal_data.koib_connd_conns) &&
-                    list_empty (&koibnal_data.koib_connd_peers))
+                if (!kibnal_data.kib_shutdown &&
+                    list_empty (&kibnal_data.kib_connd_conns) &&
+                    list_empty (&kibnal_data.kib_connd_peers))
                         schedule_timeout (timeout);
 
                 set_current_state (TASK_RUNNING);
-                remove_wait_queue (&koibnal_data.koib_connd_waitq, &wait);
+                remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
 
-                spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+                spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
         }
 
-        spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 
-        koibnal_thread_fini ();
+        kibnal_thread_fini ();
         return (0);
 }
 
 int
-koibnal_scheduler(void *arg)
+kibnal_scheduler(void *arg)
 {
         long            id = (long)arg;
         char            name[16];
-        koib_rx_t      *rx;
-        koib_tx_t      *tx;
+        kib_rx_t       *rx;
+        kib_tx_t       *tx;
         unsigned long   flags;
         int             rc;
         int             counter = 0;
         int             did_something;
 
-        snprintf(name, sizeof(name), "koibnal_sd_%02ld", id);
+        snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
         kportal_daemonize(name);
         kportal_blockallsigs();
 
-        spin_lock_irqsave(&koibnal_data.koib_sched_lock, flags);
+        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
 
         for (;;) {
                 did_something = 0;
 
-                while (!list_empty(&koibnal_data.koib_sched_txq)) {
-                        tx = list_entry(koibnal_data.koib_sched_txq.next,
-                                        koib_tx_t, tx_list);
+                while (!list_empty(&kibnal_data.kib_sched_txq)) {
+                        tx = list_entry(kibnal_data.kib_sched_txq.next,
+                                        kib_tx_t, tx_list);
                         list_del(&tx->tx_list);
-                        spin_unlock_irqrestore(&koibnal_data.koib_sched_lock,
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                                flags);
-                        koibnal_tx_done(tx);
+                        kibnal_tx_done(tx);
 
-                        spin_lock_irqsave(&koibnal_data.koib_sched_lock,
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
                                           flags);
                 }
 
-                if (!list_empty(&koibnal_data.koib_sched_rxq)) {
-                        rx = list_entry(koibnal_data.koib_sched_rxq.next,
-                                        koib_rx_t, rx_list);
+                if (!list_empty(&kibnal_data.kib_sched_rxq)) {
+                        rx = list_entry(kibnal_data.kib_sched_rxq.next,
+                                        kib_rx_t, rx_list);
                         list_del(&rx->rx_list);
-                        spin_unlock_irqrestore(&koibnal_data.koib_sched_lock,
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                                flags);
 
-                        koibnal_rx(rx);
+                        kibnal_rx(rx);
 
                         did_something = 1;
-                        spin_lock_irqsave(&koibnal_data.koib_sched_lock,
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
                                           flags);
                 }
 
                 /* shut down and no receives to complete... */
-                if (koibnal_data.koib_shutdown &&
-                    atomic_read(&koibnal_data.koib_nconns) == 0)
+                if (kibnal_data.kib_shutdown &&
+                    atomic_read(&kibnal_data.kib_nconns) == 0)
                         break;
 
                 /* nothing to do or hogging CPU */
-                if (!did_something || counter++ == OPENIBNAL_RESCHED) {
-                        spin_unlock_irqrestore(&koibnal_data.koib_sched_lock,
+                if (!did_something || counter++ == IBNAL_RESCHED) {
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                                flags);
                         counter = 0;
 
                         if (!did_something) {
                                 rc = wait_event_interruptible(
-                                        koibnal_data.koib_sched_waitq,
-                                        !list_empty(&koibnal_data.koib_sched_txq) || 
-                                        !list_empty(&koibnal_data.koib_sched_rxq) || 
-                                        (koibnal_data.koib_shutdown &&
-                                         atomic_read (&koibnal_data.koib_nconns) == 0));
+                                        kibnal_data.kib_sched_waitq,
+                                        !list_empty(&kibnal_data.kib_sched_txq) || 
+                                        !list_empty(&kibnal_data.kib_sched_rxq) || 
+                                        (kibnal_data.kib_shutdown &&
+                                         atomic_read (&kibnal_data.kib_nconns) == 0));
                         } else {
                                 our_cond_resched();
                         }
 
-                        spin_lock_irqsave(&koibnal_data.koib_sched_lock,
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
                                           flags);
                 }
         }
 
-        spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags);
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
 
-        koibnal_thread_fini();
+        kibnal_thread_fini();
         return (0);
 }
 
 
-lib_nal_t koibnal_lib = {
-        libnal_data:        &koibnal_data,      /* NAL private data */
-        libnal_send:         koibnal_send,
-        libnal_send_pages:   koibnal_send_pages,
-        libnal_recv:         koibnal_recv,
-        libnal_recv_pages:   koibnal_recv_pages,
-        libnal_dist:         koibnal_dist
+lib_nal_t kibnal_lib = {
+        libnal_data:        &kibnal_data,      /* NAL private data */
+        libnal_send:         kibnal_send,
+        libnal_send_pages:   kibnal_send_pages,
+        libnal_recv:         kibnal_recv,
+        libnal_recv_pages:   kibnal_recv_pages,
+        libnal_dist:         kibnal_dist
 };
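
The scheduler above shows a locking pattern these NALs use throughout: drain work with the spinlock held, run each callback with it dropped, sleep when idle, and yield after IBNAL_RESCHED consecutive items so a busy scheduler thread cannot monopolize a CPU. A user-space analogue of just that control flow, sketched with pthreads; queue_empty(), pop_work(), run_work() and shutting_down() are illustrative stubs, not names from the source:

#include <pthread.h>
#include <sched.h>

#define RESCHED 100                     /* plays the role of IBNAL_RESCHED */

static int   queue_empty(void)   { return 1; }  /* stub: no work queued */
static void *pop_work(void)      { return (void *)0; }
static void  run_work(void *w)   { (void)w; }
static int   shutting_down(void) { return 1; }  /* stub: exit immediately */

static pthread_mutex_t lock  = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  waitq = PTHREAD_COND_INITIALIZER;

void *scheduler(void *arg)
{
        int counter = 0;

        pthread_mutex_lock(&lock);
        for (;;) {
                int did_something = 0;

                if (!queue_empty()) {
                        void *w = pop_work();

                        pthread_mutex_unlock(&lock);
                        run_work(w);            /* callbacks run unlocked */
                        did_something = 1;
                        pthread_mutex_lock(&lock);
                }

                if (shutting_down())
                        break;

                if (!did_something || counter++ == RESCHED) {
                        counter = 0;
                        if (!did_something) {   /* idle: sleep until poked */
                                pthread_cond_wait(&waitq, &lock);
                        } else {                /* busy too long: yield CPU */
                                pthread_mutex_unlock(&lock);
                                sched_yield();
                                pthread_mutex_lock(&lock);
                        }
                }
        }
        pthread_mutex_unlock(&lock);
        return arg;
}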
index 16123c2..5aff4e9 100644
@@ -40,10 +40,10 @@ kpr_nal_interface_t kqswnal_router_interface = {
 #define QSWNAL_SYSCTL  201
 
 #define QSWNAL_SYSCTL_OPTIMIZED_GETS     1
-#define QSWNAL_SYSCTL_COPY_SMALL_FWD     2
+#define QSWNAL_SYSCTL_OPTIMIZED_PUTS     2
 
 static ctl_table kqswnal_ctl_table[] = {
-       {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_puts",
+       {QSWNAL_SYSCTL_OPTIMIZED_PUTS, "optimized_puts",
         &kqswnal_tunables.kqn_optimized_puts, sizeof (int),
         0644, NULL, &proc_dointvec},
        {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_gets",
@@ -121,6 +121,8 @@ static void
 kqswnal_shutdown(nal_t *nal)
 {
        unsigned long flags;
+       kqswnal_tx_t *ktx;
+       kqswnal_rx_t *krx;
        int           do_lib_fini = 0;
 
        /* NB The first ref was this module! */
@@ -267,37 +269,25 @@ kqswnal_shutdown(nal_t *nal)
         * ep_dvma_release() get fixed (and releases any mappings in the
         * region), we can delete all the code from here -------->  */
 
-       if (kqswnal_data.kqn_txds != NULL) {
-               int  i;
+       for (ktx = kqswnal_data.kqn_txds; ktx != NULL; ktx = ktx->ktx_alloclist) {
+               /* If ktx has a buffer, it got mapped; unmap now.  NB only
+                * the pre-mapped stuff is still mapped since all tx descs
+                * must be idle */
 
-               for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++) {
-                       kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
-
-                       /* If ktx has a buffer, it got mapped; unmap now.
-                        * NB only the pre-mapped stuff is still mapped
-                        * since all tx descs must be idle */
-
-                       if (ktx->ktx_buffer != NULL)
-                               ep_dvma_unload(kqswnal_data.kqn_ep,
-                                              kqswnal_data.kqn_ep_tx_nmh,
-                                              &ktx->ktx_ebuffer);
-               }
+               if (ktx->ktx_buffer != NULL)
+                       ep_dvma_unload(kqswnal_data.kqn_ep,
+                                      kqswnal_data.kqn_ep_tx_nmh,
+                                      &ktx->ktx_ebuffer);
        }
 
-       if (kqswnal_data.kqn_rxds != NULL) {
-               int   i;
-
-               for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) {
-                       kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
-
-                       /* If krx_kiov[0].kiov_page got allocated, it got mapped.  
-                        * NB subsequent pages get merged */
+       for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
+               /* If krx_kiov[0].kiov_page got allocated, it got mapped.  
+                * NB subsequent pages get merged */
 
-                       if (krx->krx_kiov[0].kiov_page != NULL)
-                               ep_dvma_unload(kqswnal_data.kqn_ep,
-                                              kqswnal_data.kqn_ep_rx_nmh,
-                                              &krx->krx_elanbuffer);
-               }
+               if (krx->krx_kiov[0].kiov_page != NULL)
+                       ep_dvma_unload(kqswnal_data.kqn_ep,
+                                      kqswnal_data.kqn_ep_rx_nmh,
+                                      &krx->krx_elanbuffer);
        }
        /* <----------- to here */
 
@@ -330,41 +320,26 @@ kqswnal_shutdown(nal_t *nal)
        }
 #endif
 
-       if (kqswnal_data.kqn_txds != NULL)
-       {
-               int   i;
+       while (kqswnal_data.kqn_txds != NULL) {
+               ktx = kqswnal_data.kqn_txds;
 
-               for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++)
-               {
-                       kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
-
-                       if (ktx->ktx_buffer != NULL)
-                               PORTAL_FREE(ktx->ktx_buffer,
-                                           KQSW_TX_BUFFER_SIZE);
-               }
+               if (ktx->ktx_buffer != NULL)
+                       PORTAL_FREE(ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
 
-               PORTAL_FREE(kqswnal_data.kqn_txds,
-                           sizeof (kqswnal_tx_t) * (KQSW_NTXMSGS +
-                                                    KQSW_NNBLK_TXMSGS));
+               kqswnal_data.kqn_txds = ktx->ktx_alloclist;
+               PORTAL_FREE(ktx, sizeof(*ktx));
        }
 
-       if (kqswnal_data.kqn_rxds != NULL)
-       {
-               int   i;
-               int   j;
+       while (kqswnal_data.kqn_rxds != NULL) {
+               int           i;
 
-               for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
-               {
-                       kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+               krx = kqswnal_data.kqn_rxds;
+               for (i = 0; i < krx->krx_npages; i++)
+                       if (krx->krx_kiov[i].kiov_page != NULL)
+                               __free_page (krx->krx_kiov[i].kiov_page);
 
-                       for (j = 0; j < krx->krx_npages; j++)
-                               if (krx->krx_kiov[j].kiov_page != NULL)
-                                       __free_page (krx->krx_kiov[j].kiov_page);
-               }
-
-               PORTAL_FREE(kqswnal_data.kqn_rxds,
-                           sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL +
-                                                   KQSW_NRXMSGS_LARGE));
+               kqswnal_data.kqn_rxds = krx->krx_alloclist;
+               PORTAL_FREE(krx, sizeof (*krx));
        }
 
        /* resets flags, pointers to NULL etc */
@@ -388,6 +363,8 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid,
 #endif
        int               rc;
        int               i;
+       kqswnal_rx_t     *krx;
+       kqswnal_tx_t     *ktx;
        int               elan_page_idx;
        ptl_process_id_t  my_process_id;
        int               pkmem = atomic_read(&portal_kmemory);
@@ -560,23 +537,22 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid,
        /**********************************************************************/
        /* Allocate/Initialise transmit descriptors */
 
-       PORTAL_ALLOC(kqswnal_data.kqn_txds,
-                    sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
-       if (kqswnal_data.kqn_txds == NULL)
-       {
-               kqswnal_shutdown (nal);
-               return (PTL_NO_SPACE);
-       }
-
-       /* clear flags, null pointers etc */
-       memset(kqswnal_data.kqn_txds, 0,
-              sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
+       kqswnal_data.kqn_txds = NULL;
        for (i = 0; i < (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS); i++)
        {
                int           premapped_pages;
-               kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
                int           basepage = i * KQSW_NTXMSGPAGES;
 
+               PORTAL_ALLOC (ktx, sizeof(*ktx));
+               if (ktx == NULL) {
+                       kqswnal_shutdown (nal);
+                       return (PTL_NO_SPACE);
+               }
+
+               memset(ktx, 0, sizeof(*ktx));   /* NULL pointers; zero flags */
+               ktx->ktx_alloclist = kqswnal_data.kqn_txds;
+               kqswnal_data.kqn_txds = ktx;
+
                PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
                if (ktx->ktx_buffer == NULL)
                {
@@ -615,18 +591,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid,
 
        /**********************************************************************/
        /* Allocate/Initialise receive descriptors */
-
-       PORTAL_ALLOC (kqswnal_data.kqn_rxds,
-                     sizeof (kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE));
-       if (kqswnal_data.kqn_rxds == NULL)
-       {
-               kqswnal_shutdown (nal);
-               return (PTL_NO_SPACE);
-       }
-
-       memset(kqswnal_data.kqn_rxds, 0, /* clear flags, null pointers etc */
-              sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL+KQSW_NRXMSGS_LARGE));
-
+       kqswnal_data.kqn_rxds = NULL;
        elan_page_idx = 0;
        for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
        {
@@ -636,7 +601,16 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid,
                E3_Addr       elanbuffer;
 #endif
                int           j;
-               kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+               PORTAL_ALLOC(krx, sizeof(*krx));
+               if (krx == NULL) {
+                       kqswnal_shutdown(nal);
+                       return (PTL_NO_SPACE);
+               }
+
+               memset(krx, 0, sizeof(*krx)); /* clear flags, null pointers etc */
+               krx->krx_alloclist = kqswnal_data.kqn_rxds;
+               kqswnal_data.kqn_rxds = krx;
 
                if (i < KQSW_NRXMSGS_SMALL)
                {
@@ -717,10 +691,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid,
        /**********************************************************************/
        /* Queue receives, now that it's OK to run their completion callbacks */
 
-       for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
-       {
-               kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
-
+       for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
                /* NB this enqueue can allocate/sleep (attr == 0) */
                krx->krx_state = KRX_POSTED;
 #if MULTIRAIL_EKC
index 438edc6..b08d710 100644
@@ -99,10 +99,10 @@ typedef unsigned long kqsw_csum_t;
 #define KQSW_TX_MAXCONTIG               (1<<10) /* largest payload that gets made contiguous on transmit */
 
 #define KQSW_NTXMSGS                    8       /* # normal transmit messages */
-#define KQSW_NNBLK_TXMSGS               256     /* # reserved transmit messages if can't block */
+#define KQSW_NNBLK_TXMSGS               512     /* # reserved transmit messages if can't block */
 
 #define KQSW_NRXMSGS_LARGE              64      /* # large receive buffers */
-#define KQSW_EP_ENVELOPES_LARGE         128     /* # large ep envelopes */
+#define KQSW_EP_ENVELOPES_LARGE         256     /* # large ep envelopes */
 
 #define KQSW_NRXMSGS_SMALL              256     /* # small receive buffers */
 #define KQSW_EP_ENVELOPES_SMALL         2048    /* # small ep envelopes */
@@ -144,9 +144,10 @@ typedef struct
 #endif
 } kqswnal_remotemd_t;
 
-typedef struct 
+typedef struct kqswnal_rx
 {
         struct list_head krx_list;              /* enqueue -> thread */
+        struct kqswnal_rx *krx_alloclist;       /* stack in kqn_rxds */
         EP_RCVR         *krx_eprx;              /* port to post receives to */
         EP_RXD          *krx_rxd;               /* receive descriptor (for repost) */
 #if MULTIRAIL_EKC
@@ -169,10 +170,11 @@ typedef struct
 #define KRX_COMPLETING   3                      /* waiting to be completed */
 
 
-typedef struct
+typedef struct kqswnal_tx
 {
         struct list_head  ktx_list;             /* enqueue idle/active */
         struct list_head  ktx_delayed_list;     /* enqueue delayedtxds */
+        struct kqswnal_tx *ktx_alloclist;       /* stack in kqn_txds */
         unsigned int      ktx_isnblk:1;         /* reserved descriptor? */
         unsigned int      ktx_state:7;          /* What I'm doing */
         unsigned int      ktx_firsttmpfrag:1;   /* ktx_frags[0] is in my ebuffer ? 0 : 1 */
@@ -222,8 +224,8 @@ typedef struct
         char               kqn_shuttingdown;    /* I'm trying to shut down */
         atomic_t           kqn_nthreads;        /* # threads running */
 
-        kqswnal_rx_t      *kqn_rxds;            /* all the receive descriptors */
-        kqswnal_tx_t      *kqn_txds;            /* all the transmit descriptors */
+        kqswnal_rx_t      *kqn_rxds;            /* stack of all the receive descriptors */
+        kqswnal_tx_t      *kqn_txds;            /* stack of all the transmit descriptors */
 
         struct list_head   kqn_idletxds;        /* transmit descriptors free to use */
         struct list_head   kqn_nblk_idletxds;   /* reserved free transmit descriptors */
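
The qswnal rework above replaces the kqn_txds/kqn_rxds descriptor arrays with singly linked stacks threaded through the new ktx_alloclist/krx_alloclist fields: startup pushes each descriptor as soon as it is allocated, so shutdown can unwind exactly the subset that exists, even after a partial failure. A minimal user-space sketch of the same idiom, under illustrative names (struct desc, desc_alloc_one, desc_free_all):

#include <stdlib.h>

struct desc {
        struct desc *alloclist;         /* stack link, like ktx_alloclist */
        char        *buffer;            /* payload that may fail to allocate */
};

static struct desc *all_descs;          /* stack head, like kqn_txds */

/* Push the descriptor on the stack *before* allocating its payload, so a
 * failure at any point leaves a list the cleanup path can walk. */
static int desc_alloc_one(size_t bufsize)
{
        struct desc *d = calloc(1, sizeof(*d));

        if (d == NULL)
                return -1;
        d->alloclist = all_descs;
        all_descs = d;

        d->buffer = malloc(bufsize);
        return (d->buffer == NULL) ? -1 : 0;
}

/* Pop and free everything that made it onto the stack. */
static void desc_free_all(void)
{
        while (all_descs != NULL) {
                struct desc *d = all_descs;

                all_descs = d->alloclist;
                free(d->buffer);        /* free(NULL) is a no-op */
                free(d);
        }
}

int main(void)
{
        int i;

        for (i = 0; i < 8; i++)
                if (desc_alloc_one(4096) != 0)
                        break;          /* partial failure: still cleanable */
        desc_free_all();
        return 0;
}

Because the list itself records what was allocated, the shutdown path no longer needs the KQSW_NTXMSGS-style counts to know how far startup got.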
index 75188e9..e77bd8e 100644
@@ -205,7 +205,7 @@ static int kscimacnal_startup(nal_t *nal, ptl_pid_t requested_pid,
         }
         kscimacnal_data.ksci_nid = (ptl_nid_t)(ntohl(mac_physaddr));
 
-        process_id.pid = requested_pid;
+        process_id.pid = 0;
         process_id.nid = kscimacnal_data.ksci_nid;
 
         CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n",
index 2a0ef11..7642770 100644
@@ -1226,9 +1226,9 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
                             conn2->ksnc_type != conn->ksnc_type ||
                             conn2->ksnc_incarnation != incarnation)
                                 continue;
-                        
+
                         CWARN("Not creating duplicate connection to "
-                              "%u.%u.%u.%u type %d\n", 
+                              "%u.%u.%u.%u type %d\n",
                               HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_type);
                         rc = -EALREADY;
                         goto failed_2;
@@ -1260,6 +1260,9 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
                 break;
         }
 
+        /* Give conn a ref on sock->file since we're going to return success */
+        get_file(sock->file);
+
         conn->ksnc_peer = peer;                 /* conn takes my ref on peer */
         conn->ksnc_incarnation = incarnation;
         peer->ksnp_last_alive = jiffies;
@@ -1311,9 +1314,9 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
                 ksocknal_putconnsock(conn);
         }
 
-        CWARN("New conn nid:"LPX64" [type:%d] %u.%u.%u.%u -> %u.%u.%u.%u/%d"
+        CWARN("New conn nid:"LPX64" %u.%u.%u.%u -> %u.%u.%u.%u/%d"
               " incarnation:"LPX64" sched[%d]/%d\n",
-              nid, conn->ksnc_type, HIPQUAD(conn->ksnc_myipaddr), 
+              nid, HIPQUAD(conn->ksnc_myipaddr), 
               HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, incarnation,
               (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq);
 
@@ -2054,8 +2057,7 @@ ksocknal_cmd(struct portals_cfg *pcfg, void * private)
                         rc = -EINVAL;
                         break;
                 }
-                if (rc != 0)
-                        fput (sock->file);
+                fput (sock->file);
                 break;
         }
         case NAL_CMD_CLOSE_CONNECTION: {
index 0a5266a..b8bbefd 100644
@@ -66,9 +66,7 @@
 #include <portals/lib-p30.h>
 #include <portals/nal.h>
 #include <portals/socknal.h>
-#include <linux/lustre_idl.h>
 
-#include <linux/lustre_idl.h>
 #define SOCKNAL_N_AUTOCONNECTD  4               /* # socknal autoconnect daemons */
 
 #define SOCKNAL_MIN_RECONNECT_INTERVAL HZ      /* first failed connection retry... */
index b22d501..762133e 100644
@@ -2324,17 +2324,34 @@ ksocknal_setup_sock (struct socket *sock)
         return (0);
 }
 
-int
-ksocknal_connect_peer (ksock_route_t *route, int type)
+static int
+ksocknal_connect_sock(struct socket **sockp, int *may_retry, 
+                      ksock_route_t *route, int local_port)
 {
-        struct sockaddr_in  ipaddr;
-        mm_segment_t        oldmm = get_fs();
-        struct timeval      tv;
-        int                 fd;
+        struct sockaddr_in  locaddr;
+        struct sockaddr_in  srvaddr;
         struct socket      *sock;
         int                 rc;
-        
+        int                 option;
+        mm_segment_t        oldmm = get_fs();
+        struct timeval      tv;
+
+        memset(&locaddr, 0, sizeof(locaddr)); 
+        locaddr.sin_family = AF_INET; 
+        locaddr.sin_port = htons(local_port);
+        locaddr.sin_addr.s_addr = 
+                (route->ksnr_myipaddr != 0) ? htonl(route->ksnr_myipaddr) 
+                                            : INADDR_ANY;
+        memset (&srvaddr, 0, sizeof (srvaddr));
+        srvaddr.sin_family = AF_INET;
+        srvaddr.sin_port = htons (route->ksnr_port);
+        srvaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr);
+
+        *may_retry = 0;
+
         rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
+        *sockp = sock;
         if (rc != 0) {
                 CERROR ("Can't create autoconnect socket: %d\n", rc);
                 return (rc);
@@ -2344,17 +2361,23 @@ ksocknal_connect_peer (ksock_route_t *route, int type)
          * from userspace.  And we actually need the sock->file refcounting
          * that this gives you :) */
 
-        fd = sock_map_fd (sock);
-        if (fd < 0) {
+        rc = sock_map_fd (sock);
+        if (rc < 0) {
                 sock_release (sock);
-                CERROR ("sock_map_fd error %d\n", fd);
-                return (fd);
+                CERROR ("sock_map_fd error %d\n", rc);
+                return (rc);
         }
 
-        /* NB the fd now owns the ref on sock->file */
+        /* NB the file descriptor (rc) now owns the ref on sock->file */
         LASSERT (sock->file != NULL);
         LASSERT (file_count(sock->file) == 1);
 
+        get_file(sock->file);                /* extra ref makes sock->file */
+        sys_close(rc);                       /* survive this close */
+
+        /* Still got a single ref on sock->file */
+        LASSERT (file_count(sock->file) == 1);
+
         /* Set the socket timeouts, so our connection attempt completes in
          * finite time */
         tv.tv_sec = ksocknal_tunables.ksnd_io_timeout;
@@ -2367,7 +2390,7 @@ ksocknal_connect_peer (ksock_route_t *route, int type)
         if (rc != 0) {
                 CERROR ("Can't set send timeout %d: %d\n", 
                         ksocknal_tunables.ksnd_io_timeout, rc);
-                goto out;
+                goto failed;
         }
         
         set_fs (KERNEL_DS);
@@ -2377,53 +2400,83 @@ ksocknal_connect_peer (ksock_route_t *route, int type)
         if (rc != 0) {
                 CERROR ("Can't set receive timeout %d: %d\n",
                         ksocknal_tunables.ksnd_io_timeout, rc);
-                goto out;
+                goto failed;
         }
 
-        if (route->ksnr_myipaddr != 0) {
-                /* Bind to the local IP address */
-                memset (&ipaddr, 0, sizeof (ipaddr));
-                ipaddr.sin_family = AF_INET;
-                ipaddr.sin_port = htons (0); /* ANY */
-                ipaddr.sin_addr.s_addr = htonl(route->ksnr_myipaddr);
+        set_fs (KERNEL_DS);
+        option = 1;
+        rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, 
+                             (char *)&option, sizeof (option)); 
+        set_fs (oldmm);
+        if (rc != 0) {
+                CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc);
+                goto failed;
+        }
 
-                rc = sock->ops->bind (sock, (struct sockaddr *)&ipaddr,
-                                      sizeof (ipaddr));
-                if (rc != 0) {
-                        CERROR ("Can't bind to local IP %u.%u.%u.%u: %d\n",
-                                HIPQUAD(route->ksnr_myipaddr), rc);
-                        goto out;
-                }
+        rc = sock->ops->bind(sock, 
+                             (struct sockaddr *)&locaddr, sizeof(locaddr));
+        if (rc == -EADDRINUSE) {
+                CDEBUG(D_NET, "Port %d already in use\n", local_port);
+                *may_retry = 1;
+                goto failed;
         }
-        
-        memset (&ipaddr, 0, sizeof (ipaddr));
-        ipaddr.sin_family = AF_INET;
-        ipaddr.sin_port = htons (route->ksnr_port);
-        ipaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr);
-        
-        rc = sock->ops->connect (sock, (struct sockaddr *)&ipaddr, 
-                                 sizeof (ipaddr), sock->file->f_flags);
         if (rc != 0) {
-                CERROR ("Can't connect to nid "LPX64
-                        " local IP: %u.%u.%u.%u,"
-                        " remote IP: %u.%u.%u.%u/%d: %d\n", 
-                        route->ksnr_peer->ksnp_nid,
-                        HIPQUAD(route->ksnr_myipaddr),
-                        HIPQUAD(route->ksnr_ipaddr),
-                        route->ksnr_port, rc);
-                goto out;
+                CERROR("Error trying to bind to reserved port %d: %d\n",
+                       local_port, rc);
+                goto failed;
         }
 
-        rc = ksocknal_create_conn (route, sock, type);
-        if (rc == 0) {
-                /* Take an extra ref on sock->file to compensate for the
-                 * upcoming close which will lose fd's ref on it. */
-                get_file (sock->file);
+        rc = sock->ops->connect(sock,
+                                (struct sockaddr *)&srvaddr, sizeof(srvaddr),
+                                sock->file->f_flags);
+        if (rc == 0)
+                return 0;
+
+        /* EADDRNOTAVAIL probably means we're already connected to the same
+         * peer/port on the same local port on a differently typed
+         * connection.  Let our caller retry with a different local
+         * port... */
+        *may_retry = (rc == -EADDRNOTAVAIL);
+
+        CDEBUG(*may_retry ? D_NET : D_ERROR,
+               "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc,
+               HIPQUAD(route->ksnr_myipaddr), local_port,
+               HIPQUAD(route->ksnr_ipaddr), route->ksnr_port);
+
+ failed:
+        fput(sock->file);
+        return rc;
+}
+
+int
+ksocknal_connect_peer (ksock_route_t *route, int type)
+{
+        struct socket      *sock;
+        int                 rc;
+        int                 port;
+        int                 may_retry;
+        
+        /* Iterate through reserved ports.  When typed connections are
+         * used, we will need to bind to multiple ports, but we only know
+         * this at connect time.  But, by that time we've already called
+         * bind() so we need a new socket. */
+
+        for (port = 1023; port > 512; --port) {
+
+                rc = ksocknal_connect_sock(&sock, &may_retry, route, port);
+
+                if (rc == 0) {
+                        rc = ksocknal_create_conn(route, sock, type);
+                        fput(sock->file);
+                        return rc;
+                }
+
+                if (!may_retry)
+                        return rc;
         }
 
- out:
-        sys_close (fd);
-        return (rc);
+        CERROR("Out of ports trying to bind to a reserved port\n");
+        return (-EADDRINUSE);
 }
 
 void
@@ -2443,7 +2496,6 @@ ksocknal_autoconnect (ksock_route_t *route)
                 LASSERT (type < SOCKNAL_CONN_NTYPES);
 
                 rc = ksocknal_connect_peer (route, type);
-
                 if (rc != 0)
                         break;
                 
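Both ksocknal_connect_sock() above and the tcpnal loop later in this commit implement the classic rresvport-style dance: walk reserved source ports downward from 1023, binding and connecting until one works, and retry only on the errno values that mean the port (not the peer) was the problem. A self-contained user-space sketch, with connect_privileged() as an illustrative name:

#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* Returns a connected fd bound to a reserved local port, or -1. */
static int connect_privileged(in_addr_t peer_ip, int peer_port)
{
        struct sockaddr_in locaddr, srvaddr;
        int port, fd, option = 1;

        memset(&srvaddr, 0, sizeof(srvaddr));
        srvaddr.sin_family = AF_INET;
        srvaddr.sin_port = htons(peer_port);
        srvaddr.sin_addr.s_addr = peer_ip;

        for (port = 1023; port > 512; --port) {
                fd = socket(AF_INET, SOCK_STREAM, 0);
                if (fd < 0)
                        return -1;
                setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
                           &option, sizeof(option));

                memset(&locaddr, 0, sizeof(locaddr));
                locaddr.sin_family = AF_INET;
                locaddr.sin_port = htons(port);
                locaddr.sin_addr.s_addr = INADDR_ANY;

                if (bind(fd, (struct sockaddr *)&locaddr,
                         sizeof(locaddr)) == 0 &&
                    connect(fd, (struct sockaddr *)&srvaddr,
                            sizeof(srvaddr)) == 0)
                        return fd;      /* connected from a reserved port */

                /* EADDRINUSE: port taken; EADDRNOTAVAIL: already connected
                 * to this peer/port from this local port.  Anything else
                 * is a hard failure. */
                if (errno != EADDRINUSE && errno != EADDRNOTAVAIL) {
                        close(fd);
                        return -1;
                }
                close(fd);              /* retry with the next port down */
        }
        errno = EADDRINUSE;
        return -1;                      /* out of reserved ports */
}

The kernel version has one extra wrinkle visible above: sock_map_fd() makes the fd own the reference on sock->file, so the code takes an extra get_file() reference before sys_close() so the socket survives losing its fd.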
index c56f76f..f571958 100644
@@ -60,7 +60,7 @@
 #endif
 
 unsigned int portal_subsystem_debug = ~0 - (S_PORTALS | S_QSWNAL | S_SOCKNAL |
-                                            S_GMNAL | S_OPENIBNAL);
+                                            S_GMNAL | S_IBNAL);
 EXPORT_SYMBOL(portal_subsystem_debug);
 
 unsigned int portal_debug = (D_WARNING | D_DLMTRACE | D_ERROR | D_EMERG | D_HA |
@@ -97,6 +97,7 @@ int portals_do_debug_dumplog(void *arg)
 
         snprintf(debug_file_name, sizeof(debug_file_path) - 1,
                  "%s.%ld.%ld", debug_file_path, CURRENT_SECONDS, (long)arg);
+        printk(KERN_ALERT "LustreError: dumping log to %s\n", debug_file_name);
         tracefile_dump_all_pages(debug_file_name);
 
         current->journal_info = journal_info;
@@ -180,7 +181,7 @@ int portals_debug_clear_buffer(void)
 int portals_debug_mark_buffer(char *text)
 {
         CDEBUG(D_TRACE,"***************************************************\n");
-        CWARN("DEBUG MARKER: %s\n", text);
+        CDEBUG(D_WARNING, "DEBUG MARKER: %s\n", text);
         CDEBUG(D_TRACE,"***************************************************\n");
 
         return 0;
@@ -251,62 +252,46 @@ void portals_run_lbug_upcall(char *file, const char *fn, const int line)
 char *portals_nid2str(int nal, ptl_nid_t nid, char *str)
 {
         if (nid == PTL_NID_ANY) {
-                snprintf(str, PTL_NALFMT_SIZE - 1, "%s",
-                         "PTL_NID_ANY");
+                snprintf(str, PTL_NALFMT_SIZE, "%s", "PTL_NID_ANY");
                 return str;
         }
 
         switch(nal){
 /* XXX this could be a nal method of some sort, 'cept it's config
  * dependent whether (say) socknal NIDs are actually IP addresses... */
-#ifndef CRAY_PORTALS 
+#if !CRAY_PORTALS 
         case TCPNAL:
                 /* userspace NAL */
+        case IIBNAL:
         case OPENIBNAL:
         case SOCKNAL:
-                snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u.%u.%u.%u",
+                snprintf(str, PTL_NALFMT_SIZE, "%u:%u.%u.%u.%u",
                          (__u32)(nid >> 32), HIPQUAD(nid));
                 break;
         case QSWNAL:
         case GMNAL:
-                snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u",
+                snprintf(str, PTL_NALFMT_SIZE, "%u:%u",
                          (__u32)(nid >> 32), (__u32)nid);
                 break;
 #endif
         default:
-                snprintf(str, PTL_NALFMT_SIZE - 1, "?%d? %llx",
+                snprintf(str, PTL_NALFMT_SIZE, "?%x? %llx",
                          nal, (long long)nid);
                 break;
         }
         return str;
 }
-/*      bug #4615       */
+
 char *portals_id2str(int nal, ptl_process_id_t id, char *str)
 {
-        switch(nal){
-#ifndef CRAY_PORTALS
-        case TCPNAL:
-                /* userspace NAL */
-        case OPENIBNAL:
-        case SOCKNAL:
-                snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u.%u.%u.%u,%u",
-                         (__u32)(id.nid >> 32), HIPQUAD((id.nid)) , id.pid);
-                break;
-        case QSWNAL:
-        case GMNAL:
-                snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u,%u",
-                         (__u32)(id.nid >> 32), (__u32)id.nid, id.pid);
-                break;
-#endif
-        default:
-                snprintf(str, PTL_NALFMT_SIZE - 1, "?%d? %llx,%lx",
-                         nal, (long long)id.nid, (long)id.pid );
-                break;
-        }
+        int   len;
+        
+        portals_nid2str(nal, id.nid, str);
+        len = strlen(str);
+        snprintf(str + len, PTL_NALFMT_SIZE - len, "-%u", id.pid);
         return str;
 }
 
-
 #ifdef __KERNEL__
 char stack_backtrace[LUSTRE_TRACE_SIZE];
 spinlock_t stack_backtrace_lock = SPIN_LOCK_UNLOCKED;
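
portals_id2str() is now just portals_nid2str() plus a "-pid" suffix appended in place. A minimal sketch of that composition, with NALFMT_SIZE and id2str() as illustrative stand-ins; the point to note is that the second snprintf() is only given the space remaining after the nid:

#include <stdio.h>
#include <string.h>

#define NALFMT_SIZE 32                  /* stand-in for PTL_NALFMT_SIZE */

static char *id2str(unsigned long long nid, unsigned int pid, char *str)
{
        int len;

        snprintf(str, NALFMT_SIZE, "%llx", nid);        /* the nid part */
        len = strlen(str);
        snprintf(str + len, NALFMT_SIZE - len, "-%u", pid);
        return str;
}

int main(void)
{
        char buf[NALFMT_SIZE];

        printf("%s\n", id2str(0x12345ULL, 42, buf));    /* "12345-42" */
        return 0;
}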
index 3703013..a2422e3 100644
@@ -327,6 +327,8 @@ libcfs_nal_cmd(struct portals_cfg *pcfg)
                 CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal, 
                        pcfg->pcfg_command);
                 rc = cmd->nch_handler(pcfg, cmd->nch_private);
+        } else {
+                CERROR("invalid nal: %d, cmd: %d\n", nal, pcfg->pcfg_command);
         }
         up(&nal_cmd_sem);
 
@@ -413,15 +415,15 @@ static int libcfs_ioctl(struct inode *inode, struct file *file,
                 portals_debug_mark_buffer(data->ioc_inlbuf1);
                 RETURN(0);
 #if LWT_SUPPORT
-        case IOC_PORTAL_LWT_CONTROL: 
+        case IOC_PORTAL_LWT_CONTROL:
                 err = lwt_control (data->ioc_flags, data->ioc_misc);
                 break;
-                
+
         case IOC_PORTAL_LWT_SNAPSHOT: {
                 cycles_t   now;
                 int        ncpu;
                 int        total_size;
-                
+
                 err = lwt_snapshot (&now, &ncpu, &total_size,
                                     data->ioc_pbuf1, data->ioc_plen1);
                 data->ioc_nid = now;
@@ -429,15 +431,15 @@ static int libcfs_ioctl(struct inode *inode, struct file *file,
                 data->ioc_misc = total_size;
 
                 /* Hedge against broken user/kernel typedefs (e.g. cycles_t) */
-                data->ioc_nid = sizeof(lwt_event_t);
-                data->ioc_nid2 = offsetof(lwt_event_t, lwte_where);
+                data->ioc_nid2 = sizeof(lwt_event_t);
+                data->ioc_nid3 = offsetof(lwt_event_t, lwte_where);
 
                 if (err == 0 &&
                     copy_to_user((char *)arg, data, sizeof (*data)))
                         err = -EFAULT;
                 break;
         }
-                
+
         case IOC_PORTAL_LWT_LOOKUP_STRING:
                 err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1,
                                          data->ioc_pbuf2, data->ioc_plen2);
@@ -456,7 +458,7 @@ static int libcfs_ioctl(struct inode *inode, struct file *file,
                         break;
                 }
 
-                if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1, 
+                if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1,
                                    sizeof(pcfg))) {
                         err = -EFAULT;
                         break;
@@ -467,7 +469,7 @@ static int libcfs_ioctl(struct inode *inode, struct file *file,
                 err = libcfs_nal_cmd(&pcfg);
 
                 if (err == 0 &&
-                    copy_to_user((char *)data->ioc_pbuf1, &pcfg, 
+                    copy_to_user((char *)data->ioc_pbuf1, &pcfg,
                                  sizeof (pcfg)))
                         err = -EFAULT;
                 break;
index 562abcf..5759316 100644
@@ -38,7 +38,6 @@
 
 #include <linux/kp30.h>
 #include <linux/portals_compat25.h>
-#include <linux/lustre_compat25.h>
 #include <linux/libcfs.h>
 
 #define TCD_MAX_PAGES 1280
@@ -190,7 +189,7 @@ static void print_to_console(struct ptldebug_header *hdr, int mask, char *buf,
                 prefix = "Lustre";
                 ptype = KERN_INFO;
         }
-        
+
         printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, hdr->ph_pid,
                hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf);
 }
@@ -455,7 +454,7 @@ int tracefile_dump_all_pages(char *filename)
         if (IS_ERR(filp)) {
                 rc = PTR_ERR(filp);
                 printk(KERN_ERR "LustreError: can't open %s for dump: rc %d\n",
-                      filename, rc);
+                       filename, rc);
                 goto out;
         }
 
@@ -773,6 +772,7 @@ int trace_write_debug_size(struct file *file, const char *buffer,
                        "(%lu).\n", max * smp_num_cpus, num_physpages / 5 * 4);
                 return count;
         }
+
         for (i = 0; i < NR_CPUS; i++) {
                 struct trace_cpu_data *tcd;
                 tcd = &trace_data[i].tcd;
index 13451d9..d584f1c 100644
@@ -83,7 +83,8 @@ lib_match_md(lib_nal_t *nal, int index, int op_mask,
                     me->match_id.nid != src_nid)
                         continue;
                 
-                CDEBUG(D_NET,"match_id.pid [%x], src_pid [%x]\n", me->match_id.pid, src_pid);
+                CDEBUG(D_NET, "match_id.pid [%x], src_pid [%x]\n",
+                       me->match_id.pid, src_pid);
 
                 if (me->match_id.pid != PTL_PID_ANY &&
                     me->match_id.pid != src_pid)
index eb41dfd..61ef372 100644
@@ -83,7 +83,8 @@ static int kportal_ioctl(struct portal_ioctl_data *data,
 
                 CDEBUG (D_IOCTL, "Getting nid for nal [%d]\n", data->ioc_nal);
 
-                err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, NULL, &nih);
+                err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL,
+                                NULL, &nih);
                 if (!(err == PTL_OK || err == PTL_IFACE_DUP))
                         RETURN (-EINVAL);
 
@@ -104,7 +105,8 @@ static int kportal_ioctl(struct portal_ioctl_data *data,
                 CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n",
                         data->ioc_nal, data->ioc_nid, data->ioc_count);
 
-                err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, NULL, &nih);
+                err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL,
+                                NULL, &nih);
                 if (!(err == PTL_OK || err == PTL_IFACE_DUP))
                         return (-EINVAL);
 
index 0fe3b90..a1397d2 100644
@@ -132,7 +132,7 @@ static int kpr_proc_routes_read(char *page, char **start, off_t off,
         *start = page + prd->skip;
         user_len = -prd->skip;
 
-        for (; prd->curr != &kpr_routes; prd->curr = prd->curr->next) {
+        while ((prd->curr != NULL) && (prd->curr != &kpr_routes)) {
                 re = list_entry(prd->curr, kpr_route_entry_t, kpre_list);
                 ge = re->kpre_gateway;
 
@@ -144,11 +144,20 @@ static int kpr_proc_routes_read(char *page, char **start, off_t off,
                 chunk_len += line_len;
                 user_len += line_len;
 
-                /* The route table will exceed one page */
-                if ((chunk_len > (PAGE_SIZE - 80)) || (user_len > count)) {
-                        prd->curr = prd->curr->next;
-                        break;
+                /* Abort if the route list changed */
+                if (prd->curr->next == NULL) {
+                        prd->curr = NULL;
+                        read_unlock(&kpr_rwlock);
+                        return sprintf(page, "\nError: Routes Changed\n");
                 }
+
+                prd->curr = prd->curr->next;
+
+                /* The route table would exceed one page; break out of the
+                 * while loop so the function can be re-called with a new page.
+                 */
+                if ((chunk_len > (PAGE_SIZE - 80)) || (user_len > count))
+                        break;
         }
 
         *eof = 0;
index ed8dc08..b399fcf 100644
@@ -331,10 +331,17 @@ connection force_tcp_connection(manager m,
 {
     connection conn;
     struct sockaddr_in addr;
+    struct sockaddr_in locaddr; 
     unsigned int id[2];
     struct timeval tv;
     __u64 incarnation;
 
+    int fd;
+    int option;
+    int rc;
+    int rport;
+    ptl_nid_t peernid = PTL_NID_ANY;
+
     port = tcpnal_acceptor_port;
 
     id[0] = ip;
@@ -343,49 +350,82 @@ connection force_tcp_connection(manager m,
     pthread_mutex_lock(&m->conn_lock);
 
     conn = hash_table_find(m->connections, id);
-    if (!conn) {
-        int fd;
-        int option;
-        ptl_nid_t peernid = PTL_NID_ANY;
-
-        bzero((char *) &addr, sizeof(addr));
-        addr.sin_family      = AF_INET;
-        addr.sin_addr.s_addr = htonl(ip);
-        addr.sin_port        = htons(port);
-
-        if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { 
-            perror("tcpnal socket failed");
-            exit(-1);
-        }
-        if (connect(fd, (struct sockaddr *)&addr,
-                    sizeof(struct sockaddr_in))) {
-            perror("tcpnal connect");
-            return(0);
-        }
+    if (conn)
+            goto out;
 
+    memset(&addr, 0, sizeof(addr));
+    addr.sin_family      = AF_INET;
+    addr.sin_addr.s_addr = htonl(ip);
+    addr.sin_port        = htons(port);
+
+    memset(&locaddr, 0, sizeof(locaddr)); 
+    locaddr.sin_family = AF_INET; 
+    locaddr.sin_addr.s_addr = INADDR_ANY;
+
+    for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) {
+            fd = socket(AF_INET, SOCK_STREAM, 0);
+            if (fd < 0) {
+                    perror("tcpnal socket failed");
+                    goto out;
+            } 
+            
+            option = 1;
+            rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, 
+                            &option, sizeof(option));
+            if (rc != 0) {
+                    perror ("Can't set SO_REUSEADDR for socket"); 
+                    close(fd);
+                    goto out;
+            } 
+
+            locaddr.sin_port = htons(rport);
+            rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr));
+            if (rc == 0 || errno == EACCES) {
+                    rc = connect(fd, (struct sockaddr *)&addr,
+                                 sizeof(struct sockaddr_in));
+                    if (rc == 0) {
+                            break;
+                    } else if (errno != EADDRINUSE && errno != EADDRNOTAVAIL) {
+                            perror("Error connecting to remote host");
+                            close(fd);
+                            goto out;
+                    }
+            } else if (errno != EADDRINUSE) {
+                    perror("Error binding to privileged port");
+                    close(fd);
+                    goto out;
+            }
+            close(fd);
+    }
+    
+    if (rport == IPPORT_RESERVED / 2) {
+            fprintf(stderr, "Out of ports trying to bind to a reserved port\n");
+            goto out;
+    }
+    
 #if 1
-        option = 1;
-        setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option));
-        option = 1<<20;
-        setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option));
-        option = 1<<20;
-        setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option));
+    option = 1;
+    setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option));
+    option = 1<<20;
+    setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option));
+    option = 1<<20;
+    setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option));
 #endif
    
-        gettimeofday(&tv, NULL);
-        incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+    gettimeofday(&tv, NULL);
+    incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
 
-        /* say hello */
-        if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation))
+    /* say hello */
+    if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation))
             exit(-1);
+    
+    conn = allocate_connection(m, ip, port, fd);
+    
+    /* let nal thread know this event right away */
+    if (conn)
+            procbridge_wakeup_nal(pb);
 
-        conn = allocate_connection(m, ip, port, fd);
-
-        /* let nal thread know this event right away */
-        if (conn)
-                procbridge_wakeup_nal(pb);
-    }
-
+out:
     pthread_mutex_unlock(&m->conn_lock);
     return (conn);
 }
index 34dd070..a8f916d 100644
@@ -37,3 +37,10 @@ void remove_io_handler (io_handler i);
 void init_unix_timer(void);
 void select_timer_block(when until);
 when now(void);
+
+/*
+ * hacking for CFS internal MPI testing
+ */ 
+#if !CRAY_PORTALS
+#define ENABLE_SELECT_DISPATCH
+#endif
index f3843d7..6b471c0 100644
@@ -107,6 +107,10 @@ nal_t procapi_nal = {
 
 ptl_nid_t tcpnal_mynid;
 
+#ifdef ENABLE_SELECT_DISPATCH
+procbridge __global_procbridge = NULL;
+#endif
+
 /* Function: procbridge_startup
  *
  * Arguments:  pid: requested process id (port offset)
@@ -163,6 +167,10 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid,
         return PTL_FAIL;
     }
 
+#ifdef ENABLE_SELECT_DISPATCH
+    __global_procbridge = p;
+#endif
+
     /* create nal thread */
     if (pthread_create(&p->t, NULL, nal_thread, &args)) {
         perror("nal_init: pthread_create");
index c4ccae1..09e1542 100644
 #include <sys/time.h>
 #include <sys/types.h>
 #include <stdlib.h>
+#include <syscall.h>
+#include <pthread.h>
+#include <errno.h>
 #include <pqtimer.h>
 #include <dispatch.h>
+#include <procbridge.h>
 
 
 static struct timeval beginning_of_epoch;
@@ -95,40 +99,22 @@ void remove_io_handler (io_handler i)
     i->disabled=1;
 }
 
-static void set_flag(io_handler n,fd_set *fds)
+static void set_flag(io_handler n,fd_set *r, fd_set *w, fd_set *e)
 {
-    if (n->type & READ_HANDLER) FD_SET(n->fd, &fds[0]);
-    if (n->type & WRITE_HANDLER) FD_SET(n->fd,&fds[1]);
-    if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, &fds[2]);
+    if (n->type & READ_HANDLER) FD_SET(n->fd, r);
+    if (n->type & WRITE_HANDLER) FD_SET(n->fd, w);
+    if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, e);
 }
 
-
-/* Function: select_timer_block
- * Arguments: until: an absolute time when the select should return
- * 
- *   This function dispatches the various file descriptors' handler
- *   functions, if the kernel indicates there is io available.
- */
-void select_timer_block(when until)
+static int prepare_fd_sets(fd_set *r, fd_set *w, fd_set *e)
 {
-    fd_set fds[3];
-    struct timeval timeout;
-    struct timeval *timeout_pointer;
-    int result;
     io_handler j;
     io_handler *k;
+    int max = 0;
 
-    /* TODO: loop until the entire interval is expired*/
-    if (until){
-       when interval=until-now();
-        timeout.tv_sec=(interval>>32);
-        timeout.tv_usec=((interval<<32)/1000000)>>32;
-        timeout_pointer=&timeout;
-    } else timeout_pointer=0;
-
-    FD_ZERO(&fds[0]);
-    FD_ZERO(&fds[1]);
-    FD_ZERO(&fds[2]);
+    FD_ZERO(r);
+    FD_ZERO(w);
+    FD_ZERO(e);
     for (k=&io_handlers;*k;){
         if ((*k)->disabled){
             j=*k;
@@ -136,24 +122,291 @@ void select_timer_block(when until)
             free(j);
         }
         if (*k) {
-           set_flag(*k,fds);
+           set_flag(*k,r,w,e);
+            if ((*k)->fd > max)
+                max = (*k)->fd;
            k=&(*k)->next;
        }
     }
+    return max + 1;
+}
+
+static int execute_callbacks(fd_set *r, fd_set *w, fd_set *e)
+{
+    io_handler j;
+    int n = 0, t;
+
+    for (j = io_handlers; j; j = j->next) {
+        if (j->disabled)
+            continue;
+
+        t = 0;
+        if (FD_ISSET(j->fd, r) && (j->type & READ_HANDLER)) {
+            FD_CLR(j->fd, r);
+            t++;
+        }
+        if (FD_ISSET(j->fd, w) && (j->type & WRITE_HANDLER)) {
+            FD_CLR(j->fd, w);
+            t++;
+        }
+        if (FD_ISSET(j->fd, e) && (j->type & EXCEPTION_HANDLER)) {
+            FD_CLR(j->fd, e);
+            t++;
+        }
+        if (t == 0)
+            continue;
+
+        if (!(*j->function)(j->argument))
+            j->disabled = 1;
+
+        n += t;
+    }
+
+    return n;
+}
 
-    result=select(FD_SETSIZE, &fds[0], &fds[1], &fds[2], timeout_pointer);
+#ifdef ENABLE_SELECT_DISPATCH
 
-    if (result > 0)
-        for (j=io_handlers;j;j=j->next){
-            if (!(j->disabled) && 
-                ((FD_ISSET(j->fd, &fds[0]) && (j->type & READ_HANDLER)) ||
-                 (FD_ISSET(j->fd, &fds[1]) && (j->type & WRITE_HANDLER)) ||
-                 (FD_ISSET(j->fd, &fds[2]) && (j->type & EXCEPTION_HANDLER)))){
-                if (!(*j->function)(j->argument))
-                    j->disabled=1;
+static struct {
+    pthread_mutex_t mutex;
+    pthread_cond_t  cond;
+    int             submitted;
+    int             nready;
+    int             maxfd;
+    fd_set         *rset;
+    fd_set         *wset;
+    fd_set         *eset;
+    struct timeval *timeout;
+    struct timeval  submit_time;
+} fd_extra = {
+    PTHREAD_MUTEX_INITIALIZER,
+    PTHREAD_COND_INITIALIZER,
+    0, 0, 0,
+    NULL, NULL, NULL, NULL,
+};
+
+extern int liblustre_wait_event(int timeout);
+extern procbridge __global_procbridge;
+
+/*
+ * this will intercept syscall select() of user apps
+ * such as MPI libs.
+ */
+int select(int n, fd_set *rset, fd_set *wset, fd_set *eset,
+           struct timeval *timeout)
+{
+    LASSERT(fd_extra.submitted == 0);
+
+    fd_extra.nready = 0;
+    fd_extra.maxfd = n;
+    fd_extra.rset = rset;
+    fd_extra.wset = wset;
+    fd_extra.eset = eset;
+    fd_extra.timeout = timeout;
+
+    liblustre_wait_event(0);
+    pthread_mutex_lock(&fd_extra.mutex);
+    gettimeofday(&fd_extra.submit_time, NULL);
+    fd_extra.submitted = 1;
+    LASSERT(__global_procbridge);
+    procbridge_wakeup_nal(__global_procbridge);
+
+again:
+    if (fd_extra.submitted)
+        pthread_cond_wait(&fd_extra.cond, &fd_extra.mutex);
+    pthread_mutex_unlock(&fd_extra.mutex);
+
+    liblustre_wait_event(0);
+
+    pthread_mutex_lock(&fd_extra.mutex);
+    if (fd_extra.submitted)
+        goto again;
+    pthread_mutex_unlock(&fd_extra.mutex);
+
+    LASSERT(fd_extra.nready >= 0);
+    LASSERT(fd_extra.submitted == 0);
+    return fd_extra.nready;
+}
+
+static int merge_fds(int max, fd_set *rset, fd_set *wset, fd_set *eset)
+{
+    int i;
+
+    LASSERT(rset);
+    LASSERT(wset);
+    LASSERT(eset);
+
+    for (i = 0; i < __FD_SETSIZE/__NFDBITS; i++) {
+        LASSERT(!fd_extra.rset ||
+                !(__FDS_BITS(rset)[i] & __FDS_BITS(fd_extra.rset)[i]));
+        LASSERT(!fd_extra.wset ||
+                !(__FDS_BITS(wset)[i] & __FDS_BITS(fd_extra.wset)[i]));
+        LASSERT(!fd_extra.eset ||
+                !(__FDS_BITS(eset)[i] & __FDS_BITS(fd_extra.eset)[i]));
+
+        if (fd_extra.rset && __FDS_BITS(fd_extra.rset)[i])
+            __FDS_BITS(rset)[i] |= __FDS_BITS(fd_extra.rset)[i];
+        if (fd_extra.wset && __FDS_BITS(fd_extra.wset)[i])
+            __FDS_BITS(wset)[i] |= __FDS_BITS(fd_extra.wset)[i];
+        if (fd_extra.eset && __FDS_BITS(fd_extra.eset)[i])
+            __FDS_BITS(eset)[i] |= __FDS_BITS(fd_extra.eset)[i];
+    }
+
+    return (fd_extra.maxfd > max ? fd_extra.maxfd : max);
+}
+
+static inline
+int timeval_ge(struct timeval *tv1, struct timeval *tv2)
+{
+    LASSERT(tv1 && tv2);
+    return ((tv1->tv_sec - tv2->tv_sec) * 1000000 +
+            (tv1->tv_usec - tv2->tv_usec) >= 0);
+}
+
+/*
+ * choose the most recent timeout value
+ */
+static struct timeval *choose_timeout(struct timeval *tv1,
+                                      struct timeval *tv2)
+{
+    if (!tv1)
+        return tv2;
+    else if (!tv2)
+        return tv1;
+
+    if (timeval_ge(tv1, tv2))
+        return tv2;
+    else
+        return tv1;
+}
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time when the select should return
+ * 
+ *   This function dispatches the various file descriptors' handler
+ *   functions, if the kernel indicates there is io available.
+ */
+void select_timer_block(when until)
+{
+    fd_set fds[3];
+    struct timeval timeout;
+    struct timeval *timeout_pointer, *select_timeout;
+    int max, nready, nexec;
+    int fd_handling;
+
+again:
+    if (until) {
+        when interval;
+
+        interval = until - now();
+        timeout.tv_sec = (interval >> 32);
+        timeout.tv_usec = ((interval << 32) / 1000000) >> 32;
+        timeout_pointer = &timeout;
+    } else
+        timeout_pointer = NULL;
+
+    fd_handling = 0;
+    max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]);
+    select_timeout = timeout_pointer;
+
+    pthread_mutex_lock(&fd_extra.mutex);
+    fd_handling = fd_extra.submitted;
+    pthread_mutex_unlock(&fd_extra.mutex);
+    if (fd_handling) {
+        max = merge_fds(max, &fds[0], &fds[1], &fds[2]);
+        select_timeout = choose_timeout(timeout_pointer, fd_extra.timeout);
+    }
+
+    /* XXX only compile for linux */
+#if __WORDSIZE == 64
+    nready = syscall(SYS_select, max, &fds[0], &fds[1], &fds[2],
+                     select_timeout);
+#else
+    nready = syscall(SYS__newselect, max, &fds[0], &fds[1], &fds[2],
+                     select_timeout);
+#endif
+    if (nready < 0) {
+        CERROR("select return err %d, errno %d\n", nready, errno);
+        return;
+    }
+
+    if (nready) {
+        nexec = execute_callbacks(&fds[0], &fds[1], &fds[2]);
+        nready -= nexec;
+    } else
+        nexec = 0;
+
+    /* even if both nready & nexec are 0, we still need to try to wake up
+     * the upper thread, since it may have timed out
+     */
+    if (fd_handling) {
+        LASSERT(nready >= 0);
+
+        pthread_mutex_lock(&fd_extra.mutex);
+        if (nready) {
+            if (fd_extra.rset)
+                *fd_extra.rset = fds[0];
+            if (fd_extra.wset)
+                *fd_extra.wset = fds[1];
+            if (fd_extra.eset)
+                *fd_extra.eset = fds[2];
+            fd_extra.nready = nready;
+            fd_extra.submitted = 0;
+        } else {
+            struct timeval t;
+
+            fd_extra.nready = 0;
+            if (fd_extra.timeout) {
+                gettimeofday(&t, NULL);
+                if (timeval_ge(&t, &fd_extra.submit_time))
+                    fd_extra.submitted = 0;
             }
         }
+
+        pthread_cond_signal(&fd_extra.cond);
+        pthread_mutex_unlock(&fd_extra.mutex);
+    }
+
+    /* no portals event found; go back and select again if the
+     * timeout has not expired */
+    if (!nexec) {
+        if (timeout_pointer == NULL || now() < until)
+            goto again;
+    }
+}
+
+#else /* !ENABLE_SELECT_DISPATCH */
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time when the select should return
+ * 
+ *   This function dispatches the various file descriptors' handler
+ *   functions, if the kernel indicates there is io available.
+ */
+void select_timer_block(when until)
+{
+    fd_set fds[3];
+    struct timeval timeout;
+    struct timeval *timeout_pointer;
+    int max, nready;
+
+again:
+    if (until) {
+        when interval;
+        interval = until - now();
+        timeout.tv_sec = (interval >> 32);
+        timeout.tv_usec = ((interval << 32) / 1000000) >> 32;
+        timeout_pointer = &timeout;
+    } else
+        timeout_pointer = NULL;
+
+    max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]);
+
+    nready = select(max, &fds[0], &fds[1], &fds[2], timeout_pointer);
+    if (nready > 0)
+        execute_callbacks(&fds[0], &fds[1], &fds[2]);
 }
+#endif /* ENABLE_SELECT_DISPATCH */
 
 /* Function: init_unix_timer()
  *   is called to initialize the library 
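
The ENABLE_SELECT_DISPATCH machinery above works by symbol interposition: because this library defines its own select(), application calls (an MPI library's, say) resolve here and get handed to the nal thread via fd_extra, while select_timer_block() itself escapes to the kernel through syscall(2), as in the SYS_select/SYS__newselect call above. A stripped-down sketch of just the interposition, with poke_nal_thread() as a hypothetical stand-in for the fd_extra handshake and procbridge_wakeup_nal():

#include <sys/select.h>
#include <sys/syscall.h>
#include <unistd.h>

static void poke_nal_thread(void)
{
        /* hypothetical: hand rset/wset/eset to the nal thread and park on
         * a condvar until it has run the merged kernel select() */
}

/* Interposes on the app's select(); the library itself must bypass this
 * and call syscall(2) directly, or it would deadlock on itself. */
int select(int n, fd_set *rset, fd_set *wset, fd_set *eset,
           struct timeval *timeout)
{
        poke_nal_thread();
#if __WORDSIZE == 64
        return syscall(SYS_select, n, rset, wset, eset, timeout);
#else
        return syscall(SYS__newselect, n, rset, wset, eset, timeout);
#endif
}

The real wrapper cannot simply forward like this, of course: the nal thread merges the caller's fd sets with its own (merge_fds()) so a single kernel select() watches both, then hands back whichever ready bits belong to the app.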
index 6e9cca9..abb6d01 100644
@@ -251,8 +251,6 @@ int tcpnal_init(bridge b)
            newly created junk */
         return(PTL_NAL_FAILED);
     }
-    /* XXX cfs hack */
-//    b->lib_nal->libnal_ni.ni_pid.pid=0;
     b->lower=m;
     return(PTL_OK);
 }
index 8aea457..524d128 100644
@@ -89,7 +89,11 @@ show_connection (int fd, __u32 net_ip)
 void
 usage (char *myname)
 {
-        fprintf (stderr, "Usage: %s [-N nal_id] port\n", myname);
+        fprintf (stderr, 
+                 "Usage: %s [-N nal_id] [-p] [-l] port\n\n"
+                 " -l\tKeep stdin/stdout open\n"
+                 " -p\tAllow connections from non-privileged ports\n",
+                 myname);
         exit (1);
 }
 
@@ -100,24 +104,27 @@ int main(int argc, char **argv)
         int c;
         int noclose = 0;
         int nal = SOCKNAL;
+        int rport;
+        int require_privports = 1;
         
-        while ((c = getopt (argc, argv, "N:l")) != -1)
-                switch (c)
-                {
-                case 'l':
-                        noclose = 1;
-                        break;
-
+        while ((c = getopt (argc, argv, "N:lp")) != -1) {
+                switch (c) {
                 case 'N':
                         if (sscanf(optarg, "%d", &nal) != 1 ||
                             nal < 0 || nal > NAL_MAX_NR)
                                 usage(argv[0]);
                         break;
-                        
+                case 'l':
+                        noclose = 1;
+                        break;
+                case 'p':
+                        require_privports = 0;
+                        break;
                 default:
                         usage (argv[0]);
                         break;
                 }
+        }
 
         if (optind >= argc)
                 usage (argv[0]);
@@ -162,7 +169,7 @@ int main(int argc, char **argv)
                 exit(1);
         }
 
-        rc = daemon(1, noclose);
+        rc = daemon(0, noclose);
         if (rc < 0) {
                 perror("daemon(): ");
                 exit(1);
@@ -180,8 +187,8 @@ int main(int argc, char **argv)
                 struct portals_cfg pcfg;
 #ifdef HAVE_LIBWRAP
                 struct request_info request;
-                char addrstr[INET_ADDRSTRLEN];
 #endif
+                char addrstr[INET_ADDRSTRLEN];
                
                 cfd = accept(fd, (struct sockaddr *)&clntaddr, &len);
                 if ( cfd < 0 ) {
@@ -203,6 +210,18 @@ int main(int argc, char **argv)
                         continue;
                 }
 #endif
+
+                if (require_privports && ntohs(clntaddr.sin_port) >= IPPORT_RESERVED) {
+                        inet_ntop(AF_INET, &clntaddr.sin_addr,
+                                  addrstr, INET_ADDRSTRLEN);
+                        syslog(LOG_ERR, "Closing non-privileged connection from %s:%d\n",
+                               addrstr, ntohs(clntaddr.sin_port));
+                        rc = close(cfd);
+                        if (rc)
+                                perror ("close un-privileged client failed");
+                        continue;
+                }
+
                 show_connection (cfd, clntaddr.sin_addr.s_addr);
 
                 PCFG_INIT(pcfg, NAL_CMD_REGISTER_PEER_FD);
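
The acceptor's new default is the server-side half of the reserved-port scheme: a connection is trusted only if its source port is privileged, since on a well-behaved peer only root can bind one (the -p switch turns the check off). A sketch of the check, with check_privileged() as an illustrative name:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <syslog.h>
#include <unistd.h>

/* Returns 0 if the client connected from a reserved (<1024) source port,
 * otherwise logs, closes the connection and returns -1. */
static int check_privileged(int cfd, struct sockaddr_in *clntaddr)
{
        char addrstr[INET_ADDRSTRLEN];

        if (ntohs(clntaddr->sin_port) < IPPORT_RESERVED)
                return 0;

        inet_ntop(AF_INET, &clntaddr->sin_addr, addrstr, INET_ADDRSTRLEN);
        syslog(LOG_ERR, "Closing non-privileged connection from %s:%d\n",
               addrstr, ntohs(clntaddr->sin_port));
        close(cfd);
        return -1;
}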
index 36d8a04..5b65f24 100644
 #include <portals/list.h>
 
 #include <stdio.h>
+#ifdef HAVE_NETDB_H
 #include <netdb.h>
+#endif
 #include <stdlib.h>
 #include <string.h>
+#include "ioctl.h"
 #include <fcntl.h>
 #include <errno.h>
 #include <unistd.h>
 #include <sys/stat.h>
 #include <sys/mman.h>
 
+#ifdef HAVE_LINUX_VERSION_H
 #include <linux/version.h>
 
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 #define BUG()                            /* workaround for module.h includes */
 #include <linux/module.h>
 #endif
+#endif /* !HAVE_LINUX_VERSION_H */
+
 #include <sys/utsname.h>
 
 #include <portals/api-support.h>
@@ -62,7 +68,7 @@
 static char rawbuf[8192];
 static char *buf = rawbuf;
 static int max = 8192;
-//static int g_pfd = -1;
+/*static int g_pfd = -1;*/
 static int subsystem_mask = ~0;
 static int debug_mask = ~0;
 
@@ -72,7 +78,7 @@ static const char *portal_debug_subsystems[] =
         {"undefined", "mdc", "mds", "osc", "ost", "class", "log", "llite",
          "rpc", "mgmt", "portals", "libcfs", "socknal", "qswnal", "pinger",
          "filter", "ptlbd", "echo", "ldlm", "lov", "gmnal", "router", "cobd",
-         "openibnal", "lmv", "smfs", "cmobd", NULL};
+         "ibnal", NULL};
 static const char *portal_debug_masks[] =
         {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl",
          "blocks", "net", "warning", "buffs", "other", "dentry", "portals",
@@ -371,15 +377,24 @@ int jt_dbg_debug_kernel(int argc, char **argv)
                 fprintf(stderr, "usage: %s [file] [raw]\n", argv[0]);
                 return 0;
         }
-        sprintf(filename, "%s.%lu.%u", argc > 1 ? argv[1] : "/tmp/lustre-log",
-                time(NULL), getpid());
 
-        if (argc > 2)
+        if (argc > 2) {
                 raw = atoi(argv[2]);
+        } else if (argc > 1 && (argv[1][0] == '0' || argv[1][0] == '1')) {
+                raw = atoi(argv[1]);
+                argc--;
+        }
+
+        /* build the dump file name in every case; argv[1] may have been
+         * consumed above as the raw flag */
+        sprintf(filename, "%s.%lu.%u", argc > 1 ? argv[1] :
+                "/tmp/lustre-log", time(NULL), getpid());
+
         unlink(filename);
 
         fd = open("/proc/sys/portals/dump_kernel", O_WRONLY);
         if (fd < 0) {
+                if (errno == ENOENT) /* no dump file created */
+                        return 0;
+
                 fprintf(stderr, "open(dump_kernel) failed: %s\n",
                         strerror(errno));
                 return 1;
@@ -477,25 +492,25 @@ const char debug_daemon_usage[]="usage: debug_daemon {start file [MB]|stop}\n";
 int jt_dbg_debug_daemon(int argc, char **argv)
 {
         int rc, fd;
-                                                                                                                                                                                                     
+
         if (argc <= 1) {
                 fprintf(stderr, debug_daemon_usage);
                 return 0;
         }
-                                                                                                                                                                                                     
+
         fd = open("/proc/sys/portals/daemon_file", O_WRONLY);
         if (fd < 0) {
                 fprintf(stderr, "open(daemon_file) failed: %s\n",
                         strerror(errno));
                 return 1;
         }
-                                                                                                                                                                                                     
+
         if (strcasecmp(argv[1], "start") == 0) {
                 if (argc != 3) {
                         fprintf(stderr, debug_daemon_usage);
                         return 1;
                 }
-                                                                                                                                                                                                     
+
                 rc = write(fd, argv[2], strlen(argv[2]));
                 if (rc != strlen(argv[2])) {
                         fprintf(stderr, "write(%s) failed: %s\n", argv[2],
@@ -515,7 +530,7 @@ int jt_dbg_debug_daemon(int argc, char **argv)
                 fprintf(stderr, debug_daemon_usage);
                 return 1;
         }
-                                                                                                                                                                                                     
+
         close(fd);
         return 0;
 }
@@ -611,7 +626,6 @@ static struct mod_paths {
         {"obdfilter", "lustre/obdfilter"},
         {"extN", "lustre/extN"},
         {"lov", "lustre/lov"},
-        {"lmv", "lustre/lmv"},
         {"fsfilt_ext3", "lustre/lvfs"},
         {"fsfilt_extN", "lustre/lvfs"},
         {"fsfilt_reiserfs", "lustre/lvfs"},
@@ -623,13 +637,13 @@ static struct mod_paths {
         {"ptlbd", "lustre/ptlbd"},
         {"mgmt_svc", "lustre/mgmt"},
         {"mgmt_cli", "lustre/mgmt"},
-        {"cobd", "lustre/cobd"},
-        {"cmobd", "lustre/cmobd"},
+        {"conf_obd", "lustre/obdclass"},
         {NULL, NULL}
 };
 
 static int jt_dbg_modules_2_4(int argc, char **argv)
 {
+#ifdef HAVE_LINUX_VERSION_H
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         struct mod_paths *mp;
         char *path = "..";
@@ -665,9 +679,9 @@ static int jt_dbg_modules_2_4(int argc, char **argv)
         }
 
         return 0;
-#else /* Headers are 2.6-only */
+#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) */
+#endif /* HAVE_LINUX_VERSION_H */
         return -EINVAL;
-#endif
 }
 
 static int jt_dbg_modules_2_5(int argc, char **argv)
index 1bde59f..d5d29dc 100644 (file)
 
 #include <stdio.h>
 #include <sys/types.h>
+#ifdef HAVE_NETDB_H
 #include <netdb.h>
+#endif
 #include <sys/socket.h>
+#ifdef HAVE_NETINET_TCP_H
 #include <netinet/tcp.h>
-#include <netdb.h>
+#endif
 #include <stdlib.h>
 #include <string.h>
 #include <fcntl.h>
+#include "ioctl.h"
 #include <sys/ioctl.h>
 #include <errno.h>
 #include <unistd.h>
@@ -54,10 +58,6 @@ unsigned int portal_printk;
 
 static unsigned int g_nal = 0;
 
-static int g_socket_txmem = 0;
-static int g_socket_rxmem = 0;
-static int g_socket_nonagle = 1;
-
 typedef struct
 {
         char *name;
@@ -70,6 +70,7 @@ static name2num_t nalnames[] = {
         {"elan",       QSWNAL},
         {"gm",         GMNAL},
         {"openib",      OPENIBNAL},
+        {"iib",         IIBNAL},
         {NULL,         -1}
 };
 
@@ -209,6 +210,7 @@ nal2name (int nal)
         return ((e == NULL) ? "???" : e->name);
 }
 
+#ifdef HAVE_GETHOSTBYNAME
 static struct hostent *
 ptl_gethostbyname(char * hname) {
         struct hostent *he;
@@ -229,6 +231,7 @@ ptl_gethostbyname(char * hname) {
         }
         return he;
 }
+#endif
 
 int
 ptl_parse_port (int *port, char *str)
@@ -295,7 +298,9 @@ ptl_parse_ipquad (__u32 *ipaddrp, char *str)
 int
 ptl_parse_ipaddr (__u32 *ipaddrp, char *str)
 {
+#ifdef HAVE_GETHOSTBYNAME
         struct hostent *he;
+#endif
 
         if (!strcmp (str, "_all_")) 
         {
@@ -305,7 +310,8 @@ ptl_parse_ipaddr (__u32 *ipaddrp, char *str)
 
         if (ptl_parse_ipquad(ipaddrp, str) == 0)
                 return (0);
-        
+
+#ifdef HAVE_GETHOSTBYNAME
         if ((('a' <= str[0] && str[0] <= 'z') ||
              ('A' <= str[0] && str[0] <= 'Z')) &&
              (he = ptl_gethostbyname (str)) != NULL)
@@ -315,6 +321,7 @@ ptl_parse_ipaddr (__u32 *ipaddrp, char *str)
                 *ipaddrp = ntohl(addr);         /* HOST byte order */
                 return (0);
         }
+#endif
 
         return (-1);
 }
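
The parse order above tries the numeric dotted-quad form first, so builds
configured without gethostbyname() still accept raw addresses and only
named hosts need the resolver.  A self-contained sketch of that fallback
(parse_ipaddr and its use of inet_aton() are illustrative, not the
patch's code):

    #include <string.h>
    #include <netdb.h>        /* gethostbyname() */
    #include <arpa/inet.h>    /* inet_aton(), ntohl() */

    static int parse_ipaddr(unsigned int *ipaddrp, const char *str)
    {
            struct in_addr a;

            if (inet_aton(str, &a)) {              /* numeric "a.b.c.d" */
                    *ipaddrp = ntohl(a.s_addr);    /* HOST byte order */
                    return 0;
            }
    #ifdef HAVE_GETHOSTBYNAME
            {
                    struct hostent *he = gethostbyname(str);

                    if (he != NULL) {
                            memcpy(&a, he->h_addr, sizeof(a));
                            *ipaddrp = ntohl(a.s_addr);
                            return 0;
                    }
            }
    #endif
            return -1;
    }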
@@ -322,6 +329,7 @@ ptl_parse_ipaddr (__u32 *ipaddrp, char *str)
 char *
 ptl_ipaddr_2_str (__u32 ipaddr, char *str, int lookup)
 {
+#ifdef HAVE_GETHOSTBYNAME
         __u32           net_ip;
         struct hostent *he;
 
@@ -333,7 +341,8 @@ ptl_ipaddr_2_str (__u32 ipaddr, char *str, int lookup)
                         return (str);
                 }
         }
-        
+#endif
+
         sprintf (str, "%d.%d.%d.%d",
                  (ipaddr >> 24) & 0xff, (ipaddr >> 16) & 0xff,
                  (ipaddr >> 8) & 0xff, ipaddr & 0xff);
@@ -386,6 +395,7 @@ char *
 ptl_nid2str (char *buffer, ptl_nid_t nid)
 {
         __u64           nid64 = ptl_nid2u64(nid);
+#ifdef HAVE_GETHOSTBYNAME
         struct hostent *he = 0;
 
         /* Don't try to resolve NIDs that are e.g. Elan host IDs.  Assume
@@ -400,6 +410,7 @@ ptl_nid2str (char *buffer, ptl_nid_t nid)
         if (he != NULL)
                 sprintf(buffer, "%#x:%s", (int)(nid64 >> 32), he->h_name);
         else
+#endif /* HAVE_GETHOSTBYNAME */
                 sprintf(buffer, LPX64, nid64);
 
         return (buffer);
@@ -524,7 +535,6 @@ int jt_ptl_network(int argc, char **argv)
         return (-1);
 }
 
-
 int
 jt_ptl_print_interfaces (int argc, char **argv)
 {
@@ -563,6 +573,9 @@ jt_ptl_add_interface (int argc, char **argv)
         __u32                    ipaddr;
         int                      rc;
         __u32                    netmask = 0xffffff00;
+        int                      i;
+        int                      count;
+        char                    *end;
 
         if (argc < 2 || argc > 3) {
                 fprintf (stderr, "usage: %s ipaddr [netmask]\n", argv[0]);
@@ -576,13 +589,19 @@ jt_ptl_add_interface (int argc, char **argv)
                 fprintf (stderr, "Can't parse ip: %s\n", argv[1]);
                 return -1;
         }
-        
-        if (argc > 2 &&
-            ptl_parse_ipquad(&netmask, argv[2]) != 0) {
-                fprintf (stderr, "Can't parse netmask: %s\n", argv[2]);
-                return -1;
+
+        if (argc > 2) {
+                count = strtol(argv[2], &end, 0);
+                if (count > 0 && count < 32 && *end == 0) {
+                        netmask = 0;
+                        for (i = count; i > 0; i--)
+                                netmask = netmask|(1<<(32-i));
+                } else if (ptl_parse_ipquad(&netmask, argv[2]) != 0) {
+                        fprintf (stderr, "Can't parse netmask: %s\n", argv[2]);
+                        return -1;
+                }
         }
-        
+
         PCFG_INIT(pcfg, NAL_CMD_ADD_INTERFACE);
         pcfg.pcfg_id     = ipaddr;
         pcfg.pcfg_misc   = netmask;
@@ -593,7 +612,7 @@ jt_ptl_add_interface (int argc, char **argv)
                          strerror (errno));
                 return -1;
         }
-        
+
         return 0;
 }
 
@@ -627,11 +646,11 @@ jt_ptl_del_interface (int argc, char **argv)
                          strerror (errno));
                 return -1;
         }
-        
+
         return 0;
 }
 
-int 
+int
 jt_ptl_print_peers (int argc, char **argv)
 {
         struct portals_cfg       pcfg;
@@ -639,7 +658,7 @@ jt_ptl_print_peers (int argc, char **argv)
         int                      index;
         int                      rc;
 
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0))
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0))
                 return -1;
 
         for (index = 0;;index++) {
@@ -675,7 +694,7 @@ jt_ptl_add_peer (int argc, char **argv)
         int                      port = 0;
         int                      rc;
 
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0))
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0))
                 return -1;
 
         if (g_nal_is_compatible(NULL, SOCKNAL, 0)) {
@@ -685,7 +704,7 @@ jt_ptl_add_peer (int argc, char **argv)
                         return 0;
                 }
         } else if (argc != 2) {
-                fprintf (stderr, "usage(openib): %s nid\n", argv[0]);
+                fprintf (stderr, "usage(openib,iib): %s nid\n", argv[0]);
                 return 0;
         }
 
@@ -732,7 +751,7 @@ jt_ptl_del_peer (int argc, char **argv)
         int                      argidx;
         int                      rc;
 
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0))
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0))
                 return -1;
 
         if (g_nal_is_compatible(NULL, SOCKNAL, 0)) {
@@ -764,7 +783,7 @@ jt_ptl_del_peer (int argc, char **argv)
         }
         
         if (argc > argidx) {
-                if (!strcmp (argv[3], "single_share")) {
+                if (!strcmp (argv[argidx], "single_share")) {
                         single_share = 1;
                 } else {
                         fprintf (stderr, "Unrecognised arg %s'\n", argv[3]);
@@ -795,7 +814,7 @@ jt_ptl_print_connections (int argc, char **argv)
         int                      index;
         int                      rc;
 
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0))
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0))
                 return -1;
 
         for (index = 0;;index++) {
@@ -832,13 +851,19 @@ jt_ptl_print_connections (int argc, char **argv)
 
 int jt_ptl_connect(int argc, char **argv)
 {
+#ifndef HAVE_CONNECT
+        /* no connect() support */
+        return -1;
+#else /* HAVE_CONNECT */
         struct portals_cfg pcfg;
         struct sockaddr_in srvaddr;
+        struct sockaddr_in locaddr;
         __u32 ipaddr;
         char *flag;
         int fd, rc;
         int type = SOCKNAL_CONN_ANY;
-        int port;
+        int port, rport;
+        int o;
 
         if (argc < 3) {
                 fprintf(stderr, "usage: %s ip port [type]\n", argv[0]);
@@ -893,20 +918,48 @@ int jt_ptl_connect(int argc, char **argv)
                                 return (-1);
                         }
 
+        memset(&locaddr, 0, sizeof(locaddr)); 
+        locaddr.sin_family = AF_INET; 
+        locaddr.sin_addr.s_addr = INADDR_ANY;
+
         memset(&srvaddr, 0, sizeof(srvaddr));
         srvaddr.sin_family = AF_INET;
         srvaddr.sin_port = htons(port);
         srvaddr.sin_addr.s_addr = htonl(ipaddr);
 
-        fd = socket(PF_INET, SOCK_STREAM, 0);
-        if ( fd < 0 ) {
-                fprintf(stderr, "socket() failed: %s\n", strerror(errno));
-                return -1;
+
+        for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) {
+                fd = socket(PF_INET, SOCK_STREAM, 0); 
+                if ( fd < 0 ) { 
+                        fprintf(stderr, "socket() failed: %s\n", strerror(errno)); 
+                        return -1; 
+                }
+
+                o = 1;
+                rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, 
+                                &o, sizeof(o));
+                
+                locaddr.sin_port = htons(rport);
+                rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr)); 
+                if (rc == 0 || errno == EACCES) {
+                        rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
+                        if (rc == 0) {
+                                break;
+                        } else if (errno != EADDRINUSE) {
+                                fprintf(stderr, "Error connecting to host: %s\n", strerror(errno));
+                                close(fd);
+                                return -1;
+                        }
+                } else if (errno != EADDRINUSE) {
+                        fprintf(stderr, "Error binding to port %d: %d: %s\n", rport, errno, strerror(errno));
+                        close(fd);
+                        return -1;
+                }
+
+                close(fd);              /* port busy; retry with the next one */
         }
 
-        rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
-        if ( rc == -1 ) { 
-                fprintf(stderr, "connect() failed: %s\n", strerror(errno));
+        if (rport == IPPORT_RESERVED / 2) {
+                fprintf(stderr,
+                        "Warning: all privileged ports are in use.\n"); 
                 return -1;
         }
 
@@ -937,6 +990,7 @@ int jt_ptl_connect(int argc, char **argv)
                 fprintf(stderr, "close failed: %d\n", rc);
 
         return 0;
+#endif /* HAVE_CONNECT */
 }
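
The rewritten connect path above scans down from IPPORT_RESERVED - 1 for
a free privileged port to bind before connecting, so the acceptor's new
require_privports check (earlier in this commit) will admit the
connection; it additionally tolerates EACCES from bind() so that non-root
users still connect from an ephemeral port.  The core of the scan,
reduced to a sketch (bind_privileged is a hypothetical name):

    #include <errno.h>
    #include <netinet/in.h>
    #include <sys/socket.h>

    /* Returns the reserved port bound on fd, or -1 if none was free. */
    static int bind_privileged(int fd, struct sockaddr_in *loc)
    {
            int rport;

            for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; rport--) {
                    loc->sin_port = htons(rport);
                    if (bind(fd, (struct sockaddr *)loc, sizeof(*loc)) == 0)
                            return rport;          /* got one */
                    if (errno != EADDRINUSE)
                            return -1;             /* hard failure */
            }
            return -1;                             /* 512..1023 all busy */
    }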
 
 int jt_ptl_disconnect(int argc, char **argv)
@@ -951,7 +1005,7 @@ int jt_ptl_disconnect(int argc, char **argv)
                 return 0;
         }
 
-        if (!g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, 0))
+        if (!g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, IIBNAL, 0))
                 return 0;
 
         if (argc >= 2 &&
@@ -1491,11 +1545,11 @@ lwt_snapshot(cycles_t *now, int *ncpu, int *totalsize,
         }
 
         /* crappy overloads */
-        if (data.ioc_nid != sizeof(lwt_event_t) ||
-            data.ioc_nid2 != offsetof(lwt_event_t, lwte_where)) {
+        if (data.ioc_nid2 != sizeof(lwt_event_t) ||
+            data.ioc_nid3 != offsetof(lwt_event_t, lwte_where)) {
                 fprintf(stderr,"kernel/user LWT event mismatch %d(%d),%d(%d)\n",
-                        (int)data.ioc_nid, sizeof(lwt_event_t),
-                        (int)data.ioc_nid2,
+                        (int)data.ioc_nid2, sizeof(lwt_event_t),
+                        (int)data.ioc_nid3,
                         (int)offsetof(lwt_event_t, lwte_where));
                 return (-1);
         }
@@ -1573,12 +1627,21 @@ lwt_put_string(char *ustr)
 static int
 lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t *e)
 {
+#ifndef __WORDSIZE
+# error "__WORDSIZE not defined"
+#elif __WORDSIZE == 32
+# define XFMT "%#010lx"
+#elif __WORDSIZE== 64
+# define XFMT "%#018lx"
+#else
+# error "Unexpected __WORDSIZE"
+#endif
         char           *where = lwt_get_string(e->lwte_where);
 
         if (where == NULL)
                 return (-1);
 
-        fprintf(f, "%#010lx %#010lx %#010lx %#010lx: %#010lx %1d %10.6f %10.2f %s\n",
+        fprintf(f, XFMT" "XFMT" "XFMT" "XFMT": "XFMT" %2d %10.6f %10.2f %s\n",
                 e->lwte_p1, e->lwte_p2, e->lwte_p3, e->lwte_p4,
                 (long)e->lwte_task, cpu, (e->lwte_when - t0) / (mhz * 1000000.0),
                 (t0 == e->lwte_when) ? 0.0 : (e->lwte_when - tlast) / mhz,
@@ -1587,6 +1650,7 @@ lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t
         lwt_put_string(where);
 
         return (0);
+#undef XFMT
 }
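
lwt_print previously hard-coded 32-bit field widths; the XFMT macro above
sizes the hex columns from __WORDSIZE instead.  A stand-alone sketch of
the same selection (on glibc, __WORDSIZE comes from <bits/wordsize.h>,
which <stdint.h> pulls in):

    #include <stdio.h>
    #include <stdint.h>

    #if __WORDSIZE == 64
    # define XFMT "%#018lx"   /* "0x" + 16 zero-padded hex digits */
    #else
    # define XFMT "%#010lx"   /* "0x" + 8 zero-padded hex digits */
    #endif

    int main(void)
    {
            unsigned long p = 0xdeadbeefUL;

            printf(XFMT "\n", p);
            return 0;
    }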
 
 double
index 828db61..6319775 100755 (executable)
@@ -23,7 +23,7 @@
 # lconf is the main driver script for starting and stopping
 # lustre filesystem services.
 #
-# Based in part on the XML obdctl modifications done by Brian Behlendorf
+# Based in part on the XML obdctl modifications done by Brian Behlendorf 
 
 import sys, getopt, types
 import string, os, stat, popen2, socket, time, random, fcntl, select
@@ -61,10 +61,10 @@ MAX_LOOP_DEVICES = 256
 PORTALS_DIR = 'portals'
 
 # Needed to call lconf --record
-CONFIG_FILE = ""
+CONFIG_FILE = "" 
 
 # Please keep these in sync with the values in portals/kp30.h
-ptldebug_names = {
+ptldebug_names = { 
     "trace" :     (1 << 0),
     "inode" :     (1 << 1),
     "super" :     (1 << 2),
@@ -88,8 +88,8 @@ ptldebug_names = {
     "rpctrace" :  (1 << 20),
     "vfstrace" :  (1 << 21),
     "reada" :     (1 << 22),
-    "config" :    (1 << 23),
-    "mmap" :      (1 << 24),    
+    "mmap" :      (1 << 23),
+    "config" :    (1 << 24),
 }
 
 subsystem_names = {
@@ -115,8 +115,11 @@ subsystem_names = {
     "gmnal" :        (1 << 19),
     "ptlrouter" :    (1 << 20),
     "cobd" :         (1 << 21),
-    "openibnal" :    (1 << 22),
-    "cmobd" :        (1 << 23),
+    "ibnal" :        (1 << 22),
+    "sm" :           (1 << 23),
+    "asobd" :        (1 << 24),
+    "lmv" :          (1 << 25),
+    "cmobd" :        (1 << 26),
     }
 
 
@@ -126,7 +129,7 @@ def cleanup_error(rc):
     if not first_cleanup_error:
         first_cleanup_error = rc
 
-# ============================================================
+# ============================================================ 
 # debugging and error funcs
 
 def fixme(msg = "this feature"):
@@ -243,7 +246,7 @@ class DaemonHandler:
             return pid
         except IOError:
             return 0
-
+        
     def clean_pidfile(self):
         """ Remove a stale pidfile """
         log("removing stale pidfile:", self.pidfile())
@@ -251,7 +254,7 @@ class DaemonHandler:
             os.unlink(self.pidfile())
         except OSError, e:
             log(self.pidfile(), e)
-
+            
 class AcceptorHandler(DaemonHandler):
     def __init__(self, port, net_type):
         DaemonHandler.__init__(self, "acceptor")
@@ -262,8 +265,8 @@ class AcceptorHandler(DaemonHandler):
         return "/var/run/%s-%d.pid" % (self.command, self.port)
 
     def command_line(self):
-       return string.join(map(str,(self.flags, self.port)))
-
+        return string.join(map(str,(self.flags, self.port)))
+    
 acceptors = {}
 
 # start the acceptors
@@ -283,14 +286,14 @@ def run_one_acceptor(port):
         if not daemon.running():
             daemon.start()
     else:
-         panic("run_one_acceptor: No acceptor defined for port:", port)
-
+         panic("run_one_acceptor: No acceptor defined for port:", port)   
+        
 def stop_acceptor(port):
     if acceptors.has_key(port):
         daemon = acceptors[port]
         if daemon.running():
             daemon.stop()
-
+        
 
 # ============================================================
 # handle lctl interface
@@ -315,7 +318,7 @@ class LCTLInterface:
 
     def use_save_file(self, file):
         self.save_file = file
-
+        
     def record(self, dev_name, logname):
         log("Recording log", logname, "on", dev_name)
         self.record_device = dev_name
@@ -347,7 +350,7 @@ class LCTLInterface:
     device $%s
     record %s
     %s""" % (self.record_device, self.record_log, cmds)
-
+            
         debug("+", cmd_line, cmds)
         if config.noexec: return (0, [])
 
@@ -399,7 +402,7 @@ class LCTLInterface:
             raise CommandError(self.lctl, out, rc)
         return rc, out
 
-
+            
     def clear_log(self, dev, log):
         """ clear an existing log """
         cmds =  """
@@ -409,6 +412,13 @@ class LCTLInterface:
   quit """ % (dev, log)
         self.run(cmds)
 
+    def root_squash(self, name, uid, nid):
+        cmds = """
+  device $%s
+  root_squash %s %s
+  quit""" % (name, uid, nid)
+        self.run(cmds)
+
     def network(self, net, nid):
         """ set mynid """
         cmds =  """
@@ -417,11 +427,22 @@ class LCTLInterface:
   quit """ % (net, nid)
         self.run(cmds)
 
-    def root_squash(self, name, uid, nid):
+    # add an interface
+    def add_interface(self, net, ip, netmask = ""):
+        """ add an interface """
         cmds = """
-  device $%s
-  root_squash %s %s
-  quit""" % (name, uid, nid)
+  network %s
+  add_interface %s %s
+  quit """ % (net, ip, netmask)
+        self.run(cmds)
+
+    # delete an interface
+    def del_interface(self, net, ip):
+        """ delete an interface """
+        cmds = """
+  network %s
+  del_interface %s
+  quit """ % (net, ip)
         self.run(cmds)
 
     # create a new connection
@@ -429,26 +450,28 @@ class LCTLInterface:
         cmds = "\n  add_uuid %s %s %s" %(uuid, nid, net_type)
         self.run(cmds)
 
-    def add_peer(self, net_type, nid, hostaddr, port):     
-       if net_type  in ('tcp',) and not config.lctl_dump:
+    def add_peer(self, net_type, nid, hostaddr, port):
+        if net_type  in ('tcp',) and not config.lctl_dump:
             cmds =  """
   network %s
   add_peer %s %s %d
   quit""" % (net_type,
              nid, hostaddr, port )
             self.run(cmds)
-        elif net_type in ('openib',) and not config.lctl_dump:
+        elif net_type in ('openib','iib',) and not config.lctl_dump:
             cmds =  """
   network %s
   add_peer %s
   quit""" % (net_type,
-             nid)
-           self.run(cmds)
-
+             nid )
+            self.run(cmds)
+    
     def connect(self, srv):
         self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
-        if srv.net_type  in ('tcp','openib',) and not config.lctl_dump:
-            self.add_peer(srv.net_type, srv.nid, srv.hostaddr, srv.port)
+        if srv.net_type  in ('tcp','openib','iib',) and not config.lctl_dump:
+            if srv.hostaddr[0]:
+                hostaddr = string.split(srv.hostaddr[0], '/')[0]
+            self.add_peer(srv.net_type, srv.nid, hostaddr, srv.port)
 
     # Recover a device
     def recover(self, dev_name, new_conn):
@@ -456,7 +479,7 @@ class LCTLInterface:
     device $%s
     recover %s""" %(dev_name, new_conn)
         self.run(cmds)
-
+                
     # add a route to a range
     def add_route(self, net, gw, lo, hi):
         cmds =  """
@@ -469,7 +492,7 @@ class LCTLInterface:
         except CommandError, e:
             log ("ignore: ")
             e.dump()
-
+                
     def del_route(self, net, gw, lo, hi):
         cmds =  """
   ignore_errors
@@ -502,6 +525,7 @@ class LCTLInterface:
   quit  """ % (net, gw, tgt)
         self.run(cmds)
 
+
     def del_peer(self, net_type, nid, hostaddr):
         if net_type  in ('tcp',) and not config.lctl_dump:
                 cmds =  """
@@ -511,7 +535,7 @@ class LCTLInterface:
   quit""" % (net_type,
              nid, hostaddr)
                 self.run(cmds)
-        elif net_type  in ('openib',) and not config.lctl_dump:
+        elif net_type  in ('openib','iib',) and not config.lctl_dump:
                 cmds =  """
   ignore_errors
   network %s
@@ -519,12 +543,14 @@ class LCTLInterface:
   quit""" % (net_type,
              nid)
                 self.run(cmds)
-
+        
     # disconnect one connection
     def disconnect(self, srv):
         self.del_uuid(srv.nid_uuid)
-        if srv.net_type  in ('tcp','openib',) and not config.lctl_dump:
-            self.del_peer(srv.net_type, srv.nid, srv.hostaddr)
+        if srv.net_type  in ('tcp','openib','iib',) and not config.lctl_dump:
+            if srv.hostaddr[0]:
+                hostaddr = string.split(srv.hostaddr[0], '/')[0]
+            self.del_peer(srv.net_type, srv.nid, hostaddr)
 
     def del_uuid(self, uuid):
         cmds =  """
@@ -554,7 +580,7 @@ class LCTLInterface:
   setup %s
   quit""" % (name, setup)
         self.run(cmds)
-
+        
     def add_conn(self, name, conn_uuid):
         cmds = """
   cfg_device %s
@@ -571,7 +597,7 @@ class LCTLInterface:
         except CommandError, e:
             self.cleanup(name, uuid, 0)
             raise e
-
+        
 
     # cleanup a device
     def cleanup(self, name, uuid, force, failover = 0):
@@ -591,8 +617,8 @@ class LCTLInterface:
         cmds = """
   attach lov %s %s
   lov_setup %s %d %d %d %s %s
-  quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off, 
-            pattern, devlist)
+  quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
+             pattern, devlist)
         self.run(cmds)
 
     # add an OBD to a LOV
@@ -741,7 +767,7 @@ def find_module(src_dir, dev_dir, modname):
     modbase = src_dir +'/'+ dev_dir +'/'+ modname
     for modext in '.ko', '.o':
         module = modbase + modext
-        try:
+        try: 
             if os.access(module, os.R_OK):
                 return module
         except OSError:
@@ -770,8 +796,6 @@ def jdev(opts):
         i=i+1
     return ''
 
-
-    
 # build fs according to type
 # fixme: dangerous
 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
@@ -789,7 +813,7 @@ def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
         # ext3 journal size is in megabytes
         # but don't set jsize if mkfsoptions indicates a separate journal device
         if jsize == 0 and jdev(mkfsoptions) == '':
-           if devsize == 0:
+            if devsize == 0:
                 if not is_block(dev):
                     ret, out = runcmd("ls -l %s" %dev)
                     devsize = int(string.split(out[0])[4]) / 1024
@@ -801,7 +825,7 @@ def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
                     else:
                         # sfdisk -s will fail for too large block device,
                         # then, read the size of partition from /proc/partitions
-                                                                                                               
+
                         # get the realpath of the device
                         # it may be the real device, such as /dev/hda7
                         # or the hardlink created via mknod for a device
@@ -819,14 +843,14 @@ def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
                                     real_dev = os.path.join(os.path.dirname(real_dev), dev_link)
                                 if link_count > 19:
                                     panic("Entountered too many symbolic links resolving block device:", dev)
-                                                                                                               
+
                         # get the major and minor number of the realpath via ls
-                        # it seems python(os.stat) does not return
+                        # it seems python(os.stat) does not return 
                         # the st_rdev member of the stat structure
                         ret, out = runcmd("ls -l %s" %real_dev)
                         major = string.split(string.split(out[0])[4], ",")[0]
                         minor = string.split(out[0])[5]
-                                                                                                               
+
                         # get the devsize from /proc/partitions with the major and minor number
                         ret, out = runcmd("cat /proc/partitions")
                         for line in out:
@@ -838,7 +862,7 @@ def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
             if devsize > 1024 * 1024:
                 jsize = ((devsize / 102400) * 4)
             if jsize > 400:
-                jsize = 400
+                jsize = 400        
         if jsize:  jopt = "-J size=%d" %(jsize,)
         if isize:  iopt = "-I %d" %(isize,)
         mkfs = 'mkfs.ext2 -j -b 4096 '
@@ -848,11 +872,10 @@ def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
             jmkfs = 'mkfs.ext2 -b 4096 -O journal_dev '
             if config.force:
                 jmkfs = jmkfs + '-F '
-            jmkfs = jmkfs + jdev(mkfsoptions)
+            jmkfs = jmkfs + jdev(mkfsoptions)                
             (ret, out) = run (jmkfs)
             if ret:
                 panic("Unable format journal device:", jdev(mkfsoptions), string.join(out))
-
     elif fstype == 'reiserfs':
         # reiserfs journal size is in blocks
         if jsize:  jopt = "--journal_size %d" %(jsize,)
@@ -954,7 +977,7 @@ def clean_loop(file):
 
 # determine if dev is formatted as a <fstype> filesystem
 def need_format(fstype, dev):
-    # FIXME don't know how to implement this
+    # FIXME don't know how to implement this    
     return 0
 
 # initialize a block device if needed
@@ -973,7 +996,7 @@ def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
 #        panic("device:", dev,
 #              "not prepared, and autoformat is not set.\n",
 #              "Rerun with --reformat option to format ALL filesystems")
-
+        
     return dev
 
 def if2addr(iface):
@@ -1011,11 +1034,11 @@ def sys_get_local_nid(net_type, wildcard, cluster_id):
     else:
         local = sys_get_local_address(net_type, wildcard, cluster_id)
     return local
-
+        
 def sys_get_local_address(net_type, wildcard, cluster_id):
     """Return the local address for the network type."""
     local = ""
-    if net_type in ('tcp','openib',):   
+    if net_type in ('tcp','openib','iib',):
         if  ':' in wildcard:
             iface, star = string.split(wildcard, ':')
             local = if2addr(iface)
@@ -1039,7 +1062,7 @@ def sys_get_local_address(net_type, wildcard, cluster_id):
                     elan_id = a[1]
                     break
             try:
-                nid = my_int(cluster_id) + my_int(elan_id)
+                nid = my_int(cluster_id) + my_int(elan_id) 
                 local = "%d" % (nid)
             except ValueError, e:
                 local = elan_id
@@ -1116,7 +1139,7 @@ def fs_is_mounted(path):
     except IOError, e:
         log(e)
     return 0
-
+        
 
 class kmod:
     """Manage kernel modules"""
@@ -1140,7 +1163,7 @@ class kmod:
                 continue
             log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
             if src_dir:
-                module = find_module(src_dir, dev_dir,  mod)
+                module = find_module(src_dir, dev_dir, mod)
                 if not module:
                     panic('module not found:', mod)
                 (rc, out)  = run('/sbin/insmod', module)
@@ -1182,7 +1205,7 @@ class Module:
         self._server = None
         self._connected = 0
         self.kmod = kmod(config.lustre, config.portals)
-
+        
     def info(self, *args):
         msg = string.join(map(str,args))
         print self.module_name + ":", self.name, self.uuid, msg
@@ -1196,7 +1219,7 @@ class Module:
             log(self.module_name, "cleanup failed: ", self.name)
             e.dump()
             cleanup_error(e.rc)
-
+            
     def add_portals_module(self, dev_dir, modname):
         """Append a module to list of modules to load."""
         self.kmod.add_portals_module(dev_dir, modname)
@@ -1208,7 +1231,7 @@ class Module:
     def load_module(self):
         """Load all the modules in the list in the order they appear."""
         self.kmod.load_module()
-
+            
     def cleanup_module(self):
         """Unload the modules in the list in reverse order."""
         if self.safe_to_clean():
@@ -1216,10 +1239,10 @@ class Module:
 
     def safe_to_clean(self):
         return 1
-
+        
     def safe_to_clean_modules(self):
         return self.safe_to_clean()
-
+        
 class Network(Module):
     def __init__(self,db):
         Module.__init__(self, 'NETWORK', db)
@@ -1239,12 +1262,14 @@ class Network(Module):
 
         self.nid_uuid = self.nid_to_uuid(self.nid)
 
-        self.hostaddr = self.db.get_val('hostaddr', self.nid)
-        if '*' in self.hostaddr:
-            self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
-            if not self.hostaddr:
-                panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
-            debug("hostaddr:", self.hostaddr)
+        self.hostaddr = self.db.get_hostaddr()
+        if len(self.hostaddr) == 0:
+            self.hostaddr.append(self.nid)
+        if '*' in self.hostaddr[0]:
+            self.hostaddr[0] = sys_get_local_address(self.net_type, self.hostaddr[0], self.cluster_id)
+            if not self.hostaddr[0]:
+                panic("unable to set hostaddr for", self.net_type, self.hostaddr[0], self.cluster_id)
+            debug("hostaddr:", self.hostaddr[0])
 
         self.add_portals_module("libcfs", 'libcfs')
         self.add_portals_module("portals", 'portals')
@@ -1258,6 +1283,8 @@ class Network(Module):
             self.add_portals_module("knals/gmnal", 'kgmnal')
         if self.net_type == 'openib':
             self.add_portals_module("knals/openibnal", 'kopenibnal')
+        if self.net_type == 'iib':
+            self.add_portals_module("knals/iibnal", 'kiibnal')
 
     def nid_to_uuid(self, nid):
         return "NID_%s_UUID" %(nid,)
@@ -1270,6 +1297,13 @@ class Network(Module):
             lctl.network(self.net_type, self.nid)
         if self.net_type == 'tcp':
             sys_tweak_socknal()
+            for hostaddr in self.db.get_hostaddr():
+                ip = string.split(hostaddr, '/')[0]
+                if len(string.split(hostaddr, '/')) == 2:
+                    netmask = string.split(hostaddr, '/')[1]
+                else:
+                    netmask = ""
+                lctl.add_interface(self.net_type, ip, netmask)
         if self.net_type == 'elan':
             sys_optimize_elan()
         if self.port and  node_is_router():
@@ -1312,6 +1346,10 @@ class Network(Module):
             stop_acceptor(self.port)
         if  node_is_router():
             self.disconnect_peer_gateways()
+        if self.net_type == 'tcp':
+            for hostaddr in self.db.get_hostaddr():
+                ip = string.split(hostaddr, '/')[0]
+                lctl.del_interface(self.net_type, ip)
 
     def correct_level(self, level, op=None):
         return level
@@ -1322,10 +1360,9 @@ class RouteTable(Module):
 
     def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
                          lo, hi):
-        # only setup connections for tcp and openib NALs
-       srvdb = None
-        
-       if not net_type in ('tcp','openib'):
+        # only setup connections for tcp, openib, and iib NALs
+        srvdb = None
+        if not net_type in ('tcp','openib','iib',):
             return None
 
         # connect to target if route is to single node and this node is the gw
@@ -1345,7 +1382,7 @@ class RouteTable(Module):
             return None
 
         return Network(srvdb)
-
+        
     def prepare(self):
         if not config.record and is_network_prepared():
             return
@@ -1412,7 +1449,7 @@ class LOV(Module):
         self.devlist = self.db.get_lov_tgts('lov_tgt')
         self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
         self.osclist = []
-       self.obdlist = [] 
+       self.obdlist = []
         self.desc_uuid = self.uuid
         self.uuid = generate_client_uuid(self.name)
         self.fs_name = fs_name
@@ -1506,7 +1543,7 @@ class LMV(Module):
                 self.mdclist.append(mdc)
             else:
                 panic('mdc not found:', mds_uuid)
-            
+
     def prepare(self):
         if is_prepared(self.name):
             return
@@ -1600,7 +1637,7 @@ class MDSDEV(Module):
 
                if not self.lmv:
                    panic("No LMV initialized and not lovconfig_uuid found")
-                   
+               
                 lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
                 lovconfig = self.lmv.lookup(lovconfig_uuid)
                 lov_uuid = lovconfig.get_first_ref('lov')
@@ -1624,8 +1661,7 @@ class MDSDEV(Module):
                 stripe_count = lov.stripe_cnt
             else:
                 stripe_count = len(lov.devlist)
-
-           if stripe_count > 77:
+            if stripe_count > 77:
                 self.inode_size = 4096
             elif stripe_count > 35:
                 self.inode_size = 2048
@@ -1655,14 +1691,14 @@ class MDSDEV(Module):
 
         if self.fstype == 'smfs':
             self.add_lustre_module('smfs', 'smfs')
-        
+
         if self.fstype == 'ldiskfs':
             self.add_lustre_module('ldiskfs', 'ldiskfs')
 
         if self.fstype:
             self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
-            
-        # if fstype is smfs, then we should also take care about backing 
+
+        # if fstype is smfs, then we should also take care about backing
         # store fs.
         if self.fstype == 'smfs':
             self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.backfstype))
@@ -1695,12 +1731,12 @@ class MDSDEV(Module):
         blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
                            self.format, self.journal_size, self.inode_size,
                            self.mkfsoptions, self.backfstype, self.backdevpath)
-        
+
         if not is_prepared('MDT'):
             lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
-        try: 
+        try:
             mountfsoptions = def_mount_options(self.fstype, 'mds')
-            
+
             if config.mountfsoptions:
                 if mountfsoptions:
                     mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
@@ -1714,28 +1750,28 @@ class MDSDEV(Module):
                         mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
                     else:
                         mountfsoptions = self.mountfsoptions
-            
+
             if self.fstype == 'smfs':
                 realdev = self.fstype
-                
+
                 if mountfsoptions:
-                    mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions, 
-                                                            self.backfstype, 
+                    mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
+                                                            self.backfstype,
                                                             blkdev)
                 else:
-                    mountfsoptions = "type=%s,dev=%s" % (self.backfstype, 
+                    mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
                                                          blkdev)
             else:
                 realdev = blkdev
-                
+
             print 'MDS mount options: ' + mountfsoptions
-            
+
            if not self.master_mds:
-                self.master_mds = 'dumb'           
+                self.master_mds = 'dumb'       
             if not self.cachetype:
                 self.cachetype = 'dumb'
            lctl.newdev("mds", self.name, self.uuid,
-                        setup ="%s %s %s %s %s %s" %(realdev, self.fstype, 
+                        setup ="%s %s %s %s %s %s" %(realdev, self.fstype,
                                                self.name, mountfsoptions,
                                                self.master_mds, self.cachetype))
 
@@ -1796,28 +1832,28 @@ class MDSDEV(Module):
 
             if self.fstype == 'smfs':
                 realdev = self.fstype
-                
+
                 if mountfsoptions:
-                    mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions, 
-                                                            self.backfstype, 
+                    mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
+                                                            self.backfstype,
                                                             blkdev)
                 else:
-                    mountfsoptions = "type=%s,dev=%s" % (self.backfstype, 
+                    mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
                                                          blkdev)
             else:
                 realdev = blkdev
        
                 print 'MDS mount options: ' + mountfsoptions
 
-            # As mount options are passed by 4th param to config tool, we need 
+            # As mount options are passed by 4th param to config tool, we need
             # to pass something in 3rd param. But we do not want this 3rd param
             # be counted as a profile name for reading log on MDS setup, thus,
-            # we pass there some predefined sign like 'dumb', which will be 
+            # we pass there some predefined sign like 'dumb', which will be
             # checked in MDS code and skipped. Probably there is more nice way
             # like pass empty string and check it in config tool and pass null
             # as 4th param.
             lctl.newdev("mds", self.name, self.uuid,
-                        setup ="%s %s %s %s" %(realdev, self.fstype, 
+                        setup ="%s %s %s %s" %(realdev, self.fstype,
                                                'dumb', mountfsoptions))
             do_cleanup = 1
 
@@ -1829,14 +1865,14 @@ class MDSDEV(Module):
             # this is ugly, should be organized nice later.
             target_uuid = self.db.get_first_ref('target')
             mds = self.db.lookup(target_uuid)
-            
+
             lovconfig_uuid = mds.get_first_ref('lovconfig')
             if lovconfig_uuid:
                 lovconfig = mds.lookup(lovconfig_uuid)
                 obd_uuid = lovconfig.get_first_ref('lov')
             else:
                 obd_uuid = fs.get_first_ref('obd')
-                
+
             client_uuid = generate_client_uuid(self.name)
             client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
                           self.name)
@@ -1947,7 +1983,7 @@ class MDSDEV(Module):
                 print "cleanup failed: ", self.name
                 e.dump()
                 cleanup_error(e.rc)
-        
+
         if self.fstype == 'smfs':
             clean_loop(self.backdevpath)
         else:
@@ -1990,7 +2026,7 @@ class OSD(Module):
             self.active = 0
         if self.active and config.group and config.group != ost.get_val('group'):
             self.active = 0
-
+            
         self.target_dev_uuid = self.uuid
         self.uuid = target_uuid
         # modules
@@ -2039,7 +2075,7 @@ class OSD(Module):
                                self.backdevpath)
 
         mountfsoptions = def_mount_options(self.fstype, 'ost')
-            
+
         if config.mountfsoptions:
             if mountfsoptions:
                 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
@@ -2053,25 +2089,25 @@ class OSD(Module):
                     mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
                 else:
                     mountfsoptions = self.mountfsoptions
-            
+
         if self.fstype == 'smfs':
             realdev = self.fstype
-                
+
             if mountfsoptions:
-                mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions, 
-                                                        self.backfstype, 
+                mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
+                                                        self.backfstype,
                                                         blkdev)
             else:
-                mountfsoptions = "type=%s,dev=%s" % (self.backfstype, 
+                mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
                                                      blkdev)
         else:
             realdev = blkdev
-                
+
         print 'OSD mount options: ' + mountfsoptions
-        
+
         lctl.newdev(self.osdtype, self.name, self.uuid,
                     setup ="%s %s %s %s" %(realdev, self.fstype,
-                                           self.failover_ost, 
+                                           self.failover_ost,
                                            mountfsoptions))
         if not is_prepared('OSS'):
             lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
@@ -2127,7 +2163,7 @@ class Client(Module):
         self.db = tgtdb
         self.active = 1
        self.backup_targets = []
-        
+
        self.tgt_dev_uuid = get_active_target(tgtdb)
         if not self.tgt_dev_uuid:
             panic("No target device found for target(1):", self.target_name)
@@ -2274,15 +2310,15 @@ class VLOV(Module):
         if name_override != None:
             self.name = "lov_%s" % name_override
         self.add_lustre_module('lov', 'lov')
-        self.stripe_sz = 65536 
-        self.stripe_off = 0 
+        self.stripe_sz = 65536
+        self.stripe_off = 0
         self.pattern =  0
-        self.stripe_cnt = 1 
+        self.stripe_cnt = 1
         self.desc_uuid = self.uuid
         self.uuid = generate_client_uuid(self.name)
         self.fs_name = fs_name
         self.osc = get_osc(db, self.uuid, fs_name)
-        if not self.osc:        
+        if not self.osc:
            panic('osc not found:', self.uuid)
        if config_only:
             self.config_only = 1
@@ -2299,7 +2335,7 @@ class VLOV(Module):
                        self.stripe_sz, self.stripe_off, self.pattern)
         target_uuid = self.osc.target_uuid
         try:
-           self.osc.active = 1 
+           self.osc.active = 1
             self.osc.prepare(ignore_connect_failure=0)
         except CommandError, e:
             print "Error preparing OSC %s\n" % osc.uuid
@@ -2332,7 +2368,7 @@ class VLOV(Module):
 class CMOBD(Module):
     def __init__(self,db):
        Module.__init__(self, 'CMOBD', db)
-       self.name = self.db.getName(); 
+       self.name = self.db.getName();
        self.uuid = generate_client_uuid(self.name)
        self.master_uuid = self.db.get_first_ref('masterobd')
        self.cache_uuid = self.db.get_first_ref('cacheobd')
@@ -2345,12 +2381,12 @@ class CMOBD(Module):
            panic('cache obd not found:', self.cache_uuid)
        
        if master_obd.get_class() == 'ost':
-           self.client_uuid = generate_client_uuid(self.name) 
-           self.master= VLOV(master_obd, self.client_uuid, self.name, 
+           self.client_uuid = generate_client_uuid(self.name)
+           self.master= VLOV(master_obd, self.client_uuid, self.name,
                            "%s_master" % (self.name))
            self.master_uuid = self.master.get_uuid()
        else:
-           self.master = get_mdc(db, self.name, self.master_uuid) 
+           self.master = get_mdc(db, self.name, self.master_uuid)
     # need to check /proc/mounts and /etc/mtab before
     # formatting anything.
     # FIXME: check if device is already formatted.
@@ -2375,14 +2411,14 @@ class CMOBD(Module):
     def cleanup_module(self):
         Module.cleanup_module(self)
         self.master.cleanup_module()
-                                                                                                                                                                                                     
+
     def correct_level(self, level, op=None):
         return level
 
 class COBD(Module):
     def __init__(self, db, uuid, name, type, name_override = None):
         Module.__init__(self, 'COBD', db)
-        self.name = self.db.getName(); 
+        self.name = self.db.getName();
         self.uuid = generate_client_uuid(self.name)
         self.real_uuid = self.db.get_first_ref('realobd')
         self.cache_uuid = self.db.get_first_ref('cacheobd')
@@ -2394,13 +2430,13 @@ class COBD(Module):
         if not cache_obd:
             panic('cache obd not found:', self.cache_uuid)
         if type == 'obd':
-            self.real = LOV(real_obd, self.real_uuid, name, 
+            self.real = LOV(real_obd, self.real_uuid, name,
                             "%s_real" % (self.name));
-            self.cache = LOV(cache_obd, self.cache_uuid, name, 
+            self.cache = LOV(cache_obd, self.cache_uuid, name,
                             "%s_cache" % (self.name));
         else:
-            self.real = get_mdc(db,  name, self.real_uuid) 
-            self.cache = get_mdc(db, name, self.cache_uuid) 
+            self.real = get_mdc(db,  name, self.real_uuid)
+            self.cache = get_mdc(db, name, self.cache_uuid)
     # need to check /proc/mounts and /etc/mtab before
     # formatting anything.
     # FIXME: check if device is already formatted.
@@ -2565,18 +2601,18 @@ class Mountpoint(Module):
         ost = self.db.lookup(self.obd_uuid)
         if not ost:
             panic("no ost: ", self.obd_uuid)
-            
+
         mds = self.db.lookup(self.mds_uuid)
        if not mds:
            panic("no mds: ", self.mds_uuid)
-       
+
         self.add_lustre_module('mdc', 'mdc')
         self.add_lustre_module('lmv', 'lmv')
         self.add_lustre_module('llite', 'llite')
-        
+
         self.vosc = VOSC(ost, client_uuid, self.name)
        self.vmdc = VMDC(mds, client_uuid, self.name)
-        
+
     def prepare(self):
         if not config.record and fs_is_mounted(self.path):
             log(self.path, "already mounted.")
@@ -2601,16 +2637,16 @@ class Mountpoint(Module):
             self.clientoptions = ',' + self.clientoptions
             # Linux kernel will deal with async and not pass it to ll_fill_super,
             # so replace it with Lustre async
-            self.clientoptions = string.replace(self.clientoptions, "async", 
+            self.clientoptions = string.replace(self.clientoptions, "async",
                                                "lasync")
 
         cmd = "mount -t lustre_lite -o osc=%s,mdc=%s%s %s %s" % \
-              (self.vosc.get_name(), vmdc_name, self.clientoptions, 
+              (self.vosc.get_name(), vmdc_name, self.clientoptions,
               config.config, self.path)
         run("mkdir", self.path)
         ret, val = run(cmd)
         if ret:
-            self.vmdc.cleanup()            
+            self.vmdc.cleanup()
             self.vosc.cleanup()
             panic("mount failed:", self.path, ":", string.join(val))
 
@@ -2664,7 +2700,7 @@ def get_ost_net(self, osd_uuid):
     return srv_list
 
 
-# the order of iniitailization is based on level.
+# the order of initialization is based on level. 
 def getServiceLevel(self):
     type = self.get_class()
     ret=0;
@@ -2681,7 +2717,7 @@ def getServiceLevel(self):
     elif type in ('lmv',):
         ret = 45
     elif type in ('cmobd',):
-        ret = 50 
+        ret = 50
     elif type in ('mountpoint', 'echoclient'):
         ret = 70
     else:
@@ -2728,6 +2764,7 @@ def get_mdc(db, fs_name, mds_uuid):
 
 ############################################################
 # routing ("rooting")
+
 # list of (nettype, cluster_id, nid)
 local_clusters = []
 
@@ -2741,7 +2778,7 @@ def find_local_clusters(node_db):
         if srv.port > 0:
             if acceptors.has_key(srv.port):
                 panic("duplicate port:", srv.port)
-           acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type)
+            acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type)
 
 # This node is a gateway.
 is_router = 0
@@ -2817,7 +2854,7 @@ def find_route(srv_list):
             if  (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
                 result.append((srv, r))
     return result
-
+           
 def get_active_target(db):
     target_uuid = db.getUUID()
     target_name = db.getName()
@@ -2833,7 +2870,7 @@ def get_server_by_nid_uuid(db,  nid_uuid):
         net = Network(n)
         if net.nid_uuid == nid_uuid:
             return net
-
+        
 
 ############################################################
 # lconf level logic
@@ -2870,7 +2907,7 @@ def newService(db):
 
 #
 # Prepare the system to run lustre using a particular profile
-# in a the configuration.
+# in the configuration. 
 #  * load & the modules
 #  * setup networking for the current node
 #  * make sure partitions are in place and prepared
@@ -2881,7 +2918,7 @@ def for_each_profile(db, prof_list, operation):
         prof_db = db.lookup(prof_uuid)
         if not prof_db:
             panic("profile:", prof_uuid, "not found.")
-       services = getServices(prof_db)
+        services = getServices(prof_db)
         operation(services)
 
 def magic_get_osc(db, rec, lov):
@@ -3077,7 +3114,7 @@ def doUnloadModules(services):
             n.cleanup_module()
 
 #
-# Load profile for
+# Load profile for 
 def doHost(lustreDB, hosts):
     global is_router, local_node_name
     node_db = None
@@ -3095,7 +3132,7 @@ def doHost(lustreDB, hosts):
     timeout = node_db.get_val_int('timeout', 0)
     ptldebug = node_db.get_val('ptldebug', '')
     subsystem = node_db.get_val('subsystem', '')
-
+    
     find_local_clusters(node_db)
     if not is_router:
         find_local_routes(lustreDB)
@@ -3202,7 +3239,7 @@ def setupModulePath(cmd, portals_dir = PORTALS_DIR):
     base = os.path.dirname(cmd)
     if development_mode():
         if not config.lustre:
-            debug('using objdir module paths')
+            debug('using objdir module paths')            
             config.lustre = (os.path.join(base, ".."))
         # normalize the portals dir, using command line arg if set
         if config.portals:
@@ -3212,7 +3249,7 @@ def setupModulePath(cmd, portals_dir = PORTALS_DIR):
         debug('config.portals', config.portals)
     elif config.lustre and config.portals:
         # production mode
-        # if --lustre and --portals, normalize portals
+        # if --lustre and --portals, normalize portals 
         # can ignore PORTALS_DIR here, since it is probably useless here
         config.portals = os.path.join(config.lustre, config.portals)
         debug('config.portals B', config.portals)
@@ -3320,8 +3357,8 @@ def sys_set_netmem_max(path, max):
         fp = open(path, 'w')
         fp.write('%d\n' %(max))
         fp.close()
-
-
+    
+    
 def sys_make_devices():
     if not os.access('/dev/portals', os.R_OK):
         run('mknod /dev/portals c 10 240')
@@ -3335,7 +3372,7 @@ def add_to_path(new_dir):
     if new_dir in syspath:
         return
     os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
-
+    
 def default_debug_path():
     path = '/tmp/lustre-log'
     if os.path.isdir('/r'):
@@ -3418,7 +3455,7 @@ lconf_options = [
               PARAM),
     ('minlevel', "Minimum level of services to configure/cleanup",
                  INTPARAM, 0),
-    ('maxlevel', """Maximum level of services to configure/cleanup
+    ('maxlevel', """Maximum level of services to configure/cleanup 
                     Levels are approximately like:
                             10 - network
                             20 - device, ldlm
@@ -3449,14 +3486,14 @@ lconf_options = [
     ('inactive', """The name of an inactive service, to be ignored during
                     mounting (currently OST-only). Can be repeated.""",
                 PARAMLIST),
-    ]
+    ]      
 
 def main():
     global lctl, config, toplustreDB, CONFIG_FILE
 
     # in the upcall this is set to SIG_IGN
     signal.signal(signal.SIGCHLD, signal.SIG_DFL)
-
+    
     cl = Lustre.Options("lconf", "config.xml", lconf_options)
     try:
         config, args = cl.parse(sys.argv[1:])
@@ -3479,7 +3516,7 @@ def main():
     random.seed(seed)
 
     sanitise_path()
-
+    
     init_select(config.select)
 
     if len(args) > 0:
index 3fea4e2..e8e5f10 100755 (executable)
@@ -73,9 +73,9 @@ Object creation command summary:
 --add net
   --node node_name
   --nid nid
-  --cluster_id
-  --nettype tcp|elan|gm|openib
-  --hostaddr addr
+  --cluster_id 
+  --nettype tcp|elan|gm|openib|iib
+  --hostaddr ip[/netmask]
   --port port
   --tcpbuf size
   --irq_affinity 0|1
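
With hostaddr now a PARAMLIST, --hostaddr may be repeated and each value
may carry an optional netmask that lconf splits on '/'.  A hypothetical
invocation (node name, config file, and addresses invented for
illustration):

    lmc -m config.xml --add net --node client1 --nettype tcp \
        --nid client1 --hostaddr 192.168.0.10/255.255.255.0 \
        --hostaddr 10.0.0.10/255.0.0.0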
@@ -176,6 +176,7 @@ Object creation command summary:
 """
 
 PARAM = Lustre.Options.PARAM
+PARAMLIST = Lustre.Options.PARAMLIST
 lmc_options = [
     # lmc input/output options
     ('reference', "Print short reference for commands."),
@@ -200,11 +201,11 @@ lmc_options = [
     ('ptldebug', "Set the portals debug level",  PARAM),
     ('subsystem', "Specify which Lustre subsystems have debug output recorded in the log",  PARAM),
 
-    # network
-    ('nettype', "Specify the network type. This can be tcp/elan/gm/openib.", PARAM),
+    # network 
+    ('nettype', "Specify the network type. This can be tcp/elan/gm/openib/iib.", PARAM),
     ('nid', "Give the network ID, e.g ElanID/IP Address as used by portals.", PARAM),
     ('port', "Optional argument to specify the TCP port number.", PARAM, DEFAULT_PORT),
-    ('hostaddr', "", PARAM,""),
+    ('hostaddr', "Optional argument to specify the host address.", PARAMLIST),
     ('cluster_id', "Specify the cluster ID", PARAM, "0"),
 
     # routes
@@ -392,8 +393,8 @@ class GenConfig:
         network.setAttribute("nettype", net);
         self.addElement(network, "nid", nid)
         self.addElement(network, "clusterid", cluster_id)
-        if hostaddr:
-            self.addElement(network, "hostaddr", hostaddr)
+        for host in  hostaddr:
+            self.addElement(network, "hostaddr", host)
         if port:
             self.addElement(network, "port", "%d" %(port))
         
@@ -922,7 +923,7 @@ def add_net(gen, lustre, options):
 
     if net_type in ('tcp',):
         port = get_option_int(options, 'port')
-    elif net_type in ('elan', 'gm', 'openib'):
+    elif net_type in ('elan', 'gm', 'openib','iib'):
         port = 0
     else:
         print "Unknown net_type: ", net_type
index 2cf74f9..765793b 100644 (file)
@@ -2308,6 +2308,7 @@ int jt_llog_cancel(int argc, char **argv)
         return rc;
 
 }
+
 int jt_llog_check(int argc, char **argv)
 {
         struct obd_ioctl_data data;
@@ -2375,6 +2376,7 @@ int jt_llog_remove(int argc, char **argv)
 
         return rc;
 }
+
 int jt_obd_reint_sync(int argc, char **argv)
 {
         struct obd_ioctl_data data;
@@ -2410,6 +2412,7 @@ int jt_obd_cache_on(int argc, char **argv)
         return rc;  
                
 }
+
 int jt_obd_cache_off(int argc, char **argv)
 {
         struct obd_ioctl_data data;
@@ -2425,8 +2428,13 @@ int jt_obd_cache_off(int argc, char **argv)
                         rc);
         return rc;  
 }
+
 int jt_obd_snap_add(int argc, char **argv)
 {
+#if 1
+        return -1;
+#else
+# error "FIX the missing #defines before committing"        
         struct obd_ioctl_data data;
         int rc = 0;
       
@@ -2452,7 +2460,9 @@ int jt_obd_snap_add(int argc, char **argv)
         if (rc)
                 fprintf(stderr, "OBD_IOC_SNAP_ADD failed: rc=%d\n", rc);
         return rc;
+#endif
 }
+
 static void signal_server(int sig)
 {
         if (sig == SIGINT) {