Whamcloud - gitweb
* First cut working vibnal
authoreeb <eeb>
Wed, 23 Feb 2005 21:43:25 +0000 (21:43 +0000)
committereeb <eeb>
Wed, 23 Feb 2005 21:43:25 +0000 (21:43 +0000)
lnet/autoconf/lustre-lnet.m4
lnet/klnds/viblnd/Makefile.in
lnet/klnds/viblnd/Makefile.mk
lnet/klnds/viblnd/viblnd.c
lnet/klnds/viblnd/viblnd.h
lnet/klnds/viblnd/viblnd_cb.c
lnet/klnds/viblnd/vibnal_sa.c [deleted file]
lnet/utils/portals.c

index 6780155..e9c5889 100644 (file)
@@ -1,4 +1,25 @@
 #
+# LP_CHECK_GCC_VERSION
+#
+# Check compiler version: abort configure when the compiler identifies
+# itself as gcc and its version is older than PTL_MIN_CC_VERSION.
+# If no version can be extracted (no output line starting with "gcc"),
+# PTL_CC_VERSION is empty and the check passes.
+#
+AC_DEFUN([LP_CHECK_GCC_VERSION],
+[AC_MSG_CHECKING([compiler version])
+# third whitespace-separated field of the line that starts with "gcc";
+# NOTE(review): the blank after the dollar sign presumably keeps m4 from
+# treating the awk field reference as a macro argument -- confirm
+PTL_CC_VERSION=`$CC --version | awk '/^gcc/{print $ 3}'`
+PTL_MIN_CC_VERSION="3.2.3"
+# v2n: read MAJOR.MINOR.PATCH on stdin, print (MAJOR*100+MINOR)*100+PATCH
+# so that versions can be compared as plain integers
+v2n() {
+       awk -F. '{printf "%d\n", (($ 1)*100+($ 2))*100+($ 3)}'
+}
+if test -z "$PTL_CC_VERSION" -o \
+        `echo $PTL_CC_VERSION | v2n` -ge `echo $PTL_MIN_CC_VERSION | v2n`; then
+       AC_MSG_RESULT([ok])
+else
+       AC_MSG_RESULT([Buggy compiler found])
+       AC_MSG_ERROR([Need gcc version >= $PTL_MIN_CC_VERSION])
+fi
+])
+
+#
 # LP_CONFIG_ZEROCOPY
 #
 # check if zerocopy is available/wanted
@@ -242,29 +263,66 @@ AC_SUBST(IIBNAL)
 # check for Voltaire infiniband support
 #
 AC_DEFUN([LP_CONFIG_VIB],
-[AC_MSG_CHECKING([if Voltaire IB kernel headers are present])
-VIBCPPFLAGS="-I/usr/local/include/ibhost-kdevel -DCPU_BE=0 -DCPU_LE=1 -DGSI_PASS_PORT_NUM"
-EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
-EXTRA_KCFLAGS="$EXTRA_KCFLAGS $VIBCPPFLAGS"
-LB_LINUX_TRY_COMPILE([
-        #include <linux/list.h>
-       #include <vverbs.h>
-],[
-        vv_hca_h_t     kib_hca;
-       vv_return_t    retval;
-
-       retval = vv_hca_open("ANY_HCA", NULL, &kib_hca);
-
-       return retval == vv_return_ok ? 0 : 1;
-],[
-       AC_MSG_RESULT([yes])
-       VIBNAL="vibnal"
-],[
-       AC_MSG_RESULT([no])
+[AC_MSG_CHECKING([whether to enable Voltaire IB support])
+VIBPATH=""
+# vibnal is opt-in: it is only considered when --with-vib=path is given
+# and path/src/nvigor/ib-code exists; --with-vib=no or no option at all
+# leaves VIBPATH empty.  A path without that directory is a fatal error.
+AC_ARG_WITH([vib],
+       AC_HELP_STRING([--with-vib=path],
+                      [build vibnal against path]),
+       [
+               case $with_vib in
+               no)     AC_MSG_RESULT([no]);;
+               *)      VIBPATH="${with_vib}/src/nvigor/ib-code"
+                       if test -d "$with_vib" -a -d "$VIBPATH"; then
+                               AC_MSG_RESULT([yes])
+                       else
+                               AC_MSG_RESULT([no])
+                               AC_MSG_ERROR([No directory $VIBPATH])
+                        fi;;
+               esac
+       ],[
+               AC_MSG_RESULT([no])
+       ])
+# NOTE(review): on the disabled path VIBCPPFLAGS is no longer assigned
+# (the old code reset it to empty), so AC_SUBST substitutes whatever the
+# variable already holds -- confirm that is intended.
+if test -z "$VIBPATH"; then
        VIBNAL=""
-       VIBCPPFLAGS=""
-])
-EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
+else
+       # Try-compile (via LB_LINUX_TRY_COMPILE) a test program that uses
+       # the vverbs, ib-cm and ibat headers with the include paths added
+       # temporarily to EXTRA_KCFLAGS; a compile failure aborts configure.
+       # The test program derives CPU_BE/CPU_LE from the kernel's
+       # byte-order macros before including the Voltaire headers.
+       VIBCPPFLAGS="-I${VIBPATH}/include -I${VIBPATH}/cm"
+       EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
+       EXTRA_KCFLAGS="$EXTRA_KCFLAGS $VIBCPPFLAGS"
+       LB_LINUX_TRY_COMPILE([
+               #include <linux/list.h>
+               #include <asm/byteorder.h>
+               #ifdef __BIG_ENDIAN
+               # define CPU_BE 1
+                # define CPU_LE 0
+               #endif
+               #ifdef __LITTLE_ENDIAN
+               # define CPU_BE 0
+               # define CPU_LE 1
+               #endif
+               #include <vverbs.h>
+               #include <ib-cm.h>
+               #include <ibat.h>
+       ],[
+               vv_hca_h_t       kib_hca;
+               vv_return_t      vvrc;
+               cm_cep_handle_t  cep;
+               ibat_arp_data_t  arp_data;
+               ibat_stat_t      ibatrc;
+
+               vvrc = vv_hca_open("ANY_HCA", NULL, &kib_hca);
+               cep = cm_create_cep(cm_cep_transp_rc);
+               ibatrc = ibat_get_ib_data((uint32_t)0, (uint32_t)0,
+                                          ibat_paths_primary, &arp_data,
+                                         (ibat_get_ib_data_reply_fn_t)NULL,
+                                          NULL, 0);
+               return 0;
+       ],[
+               VIBNAL="vibnal"
+       ],[
+               AC_MSG_ERROR([can't compile vibnal with given path])
+       ])
+       EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
+fi
 AC_SUBST(VIBCPPFLAGS)
 AC_SUBST(VIBNAL)
 ])
@@ -411,13 +469,15 @@ fi
 # Portals linux kernel checks
 #
 AC_DEFUN([LP_PROG_LINUX],
-[LP_CONFIG_ZEROCOPY
+[LP_CHECK_GCC_VERSION
+
+LP_CONFIG_ZEROCOPY
 LP_CONFIG_AFFINITY
 LP_CONFIG_QUADRICS
 LP_CONFIG_GM
 LP_CONFIG_OPENIB
-LP_CONFIG_IIB
 LP_CONFIG_VIB
+LP_CONFIG_IIB
 LP_CONFIG_RANAL
 
 LP_STRUCT_PAGE_LIST
index fd7bb05..5287e70 100644 (file)
@@ -1,5 +1,5 @@
 MODULES := kvibnal
-kvibnal-objs := vibnal.o vibnal_cb.o vibnal_sa.o
+kvibnal-objs := vibnal.o vibnal_cb.o
 
 EXTRA_POST_CFLAGS := @VIBCPPFLAGS@
 
index d08633a..ffc1510 100644 (file)
@@ -6,5 +6,5 @@
 include $(src)/../../Kernelenv
 
 obj-y += kvibnal.o
-kvibnal-objs := vibnal.o vibnal_cb.o vibnal_sa.o
+kvibnal-objs := vibnal.o vibnal_cb.o
 
index 0c0a0e7..2cb4b7d 100644 (file)
 
 nal_t                   kibnal_api;
 ptl_handle_ni_t         kibnal_ni;
+kib_data_t              kibnal_data;
 kib_tunables_t          kibnal_tunables;
 
-kib_data_t              kibnal_data = {
-        .kib_service_id = IBNAL_SERVICE_NUMBER,
-};
-
 #ifdef CONFIG_SYSCTL
 #define IBNAL_SYSCTL             202
 
@@ -50,268 +47,330 @@ static ctl_table kibnal_top_ctl_table[] = {
 };
 #endif
 
-#ifdef unused
+/* Put the calling thread to sleep in TASK_UNINTERRUPTIBLE state;
+ * 'ticks' is handed unchanged to schedule_timeout() (the caller in
+ * kibnal_set_mynid() passes HZ/10). */
 void
-print_service(IB_SERVICE_RECORD *service, char *tag, int rc)
+kibnal_pause(int ticks)
 {
-        char name[32];
+        set_current_state(TASK_UNINTERRUPTIBLE);
+        schedule_timeout(ticks);
+}
 
-        if (service == NULL) 
-        {
-                CWARN("tag       : %s\n"
-                      "status    : %d (NULL)\n", tag, rc);
-                return;
-        }
-        strncpy (name, service->ServiceName, sizeof(name)-1);
-        name[sizeof(name)-1] = 0;
-        
-        CWARN("tag       : %s\n"
-              "status    : %d\n"
-              "service id: "LPX64"\n"
-              "name      : %s\n"
-              "NID       : "LPX64"\n", tag, rc,
-              service->RID.ServiceID, name,
-              *kibnal_service_nid_field(service));
+/* Checksum 'nob' bytes starting at 'ptr': for every byte, rotate the
+ * 32-bit running sum left by one bit and add the byte.  A computed sum
+ * of 0 is returned as 1, because 0 is reserved to mean "no checksum".
+ * A non-positive 'nob' checksums nothing and therefore returns 1.
+ * NOTE(review): bytes are read through a plain 'char *', so whether
+ * bytes >= 0x80 are added as negative values depends on the platform's
+ * char signedness -- confirm peers built with different char signedness
+ * compute the same value. */
+__u32 
+kibnal_cksum (void *ptr, int nob)
+{
+        char  *c  = ptr;
+        __u32  sum = 0;
+
+        while (nob-- > 0)
+                sum = ((sum << 1) | (sum >> 31)) + *c++;
+
+        /* ensure I don't return 0 (== no checksum) */
+        return (sum == 0) ? 1 : sum;
 }
-#endif
 
-/* 
- * method is SUBN_ADM_SET, SUBN_ADM_GET, SUBN_ADM_DELETE. Tables not supported.
- * nid is the nid to advertize/query/unadvertize
- */
-static void fill_sa_request(struct sa_request *request, int method, ptl_nid_t nid)
+/* Start a message: record its type and its total size, i.e. the header
+ * (everything before ibm_u) plus 'body_nob' bytes of type-specific body.
+ * No other field of 'msg' is written here. */
+void
+kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
 {
-        gsi_dtgrm_t *dtgrm = request->dtgrm_req;
-        sa_mad_v2_t *mad = (sa_mad_v2_t *) dtgrm->mad;
-        ib_service_record_v2_t *sr = (ib_service_record_v2_t *) mad->payload;
-        
-        memset(mad, 0, MAD_BLOCK_SIZE);
-
-        request->mad = mad;
-
-        dtgrm->rlid = kibnal_data.kib_port_attr.port_sma_address_info.sm_lid;
-        dtgrm->sl = kibnal_data.kib_port_attr.port_sma_address_info.service_level;
-
-        mad->hdr.base_ver = MAD_IB_BASE_VERSION;
-        mad->hdr.class = MAD_CLASS_SUBN_ADM;
-        mad->hdr.class_ver = 2;
-        mad->hdr.m.ms.method = method;
-        mad->hdr.attrib_id = SA_SERVICE_RECORD; /* something(?) will swap that field */
-
-               /* Note: the transaction ID is set by the Voltaire stack if it is 0. */
-
-        /* TODO: change the 40 to sizeof(something) */
-        mad->payload_len = cpu_to_be32(0x40 /*header size */  +
-                                       sizeof (ib_service_record_v2_t));
-
-
-        mad->component_mask = cpu_to_be64(
-                                          (1ull << 0)  |       /* service_id       */
-                                          (1ull << 2)  |       /* service_pkey     */
-                                          (1ull << 6)  |       /* service_name     */
-                                          (1ull << 7)  |       /* service_data8[0] */
-                                          (1ull << 8)  |       /* service_data8[1] */
-                                          (1ull << 9)  |       /* service_data8[2] */
-                                          (1ull << 10) |       /* service_data8[3] */
-                                          (1ull << 11) |       /* service_data8[4] */
-                                          (1ull << 12) |       /* service_data8[5] */
-                                          (1ull << 13) |       /* service_data8[6] */
-                                          (1ull << 14)      /* service_data8[7] */
-                                          );
-
-        sr->service_id = cpu_to_be64(kibnal_data.kib_service_id);
-        sr->service_pkey = cpu_to_be16(kibnal_data.kib_port_pkey);
-
-        /* Set the service name and the data (bytes 0 to 7) in data8 */
-        kibnal_set_service_keys(sr, nid);
-
-        if (method == SUBN_ADM_SET) {
-                mad->component_mask |= cpu_to_be64(
-                                                   (1ull << 1) |       /* service_gid       */
-                                                   (1ull << 4)         /* service_lease     */
-                                                   );
-
-                sr->service_gid = kibnal_data.kib_port_gid;
-                gid_swap(&sr->service_gid);
-                sr->service_lease = cpu_to_be32(0xffffffff);
-        }
-
-        CDEBUG(D_NET, "SA request %02x for service id "LPX64" %s:"LPX64"\n",
-               mad->hdr.m.ms.method,
-               sr->service_id, 
-               sr->service_name,
-               *kibnal_service_nid_field(sr));
+        msg->ibm_type = type;
+        msg->ibm_nob  = offsetof(kib_msg_t, ibm_u) + body_nob;
 }
 
-/* Do an advertizement operation: 
- *   SUBN_ADM_GET = 0x01 (i.e. query),
- *   SUBN_ADM_SET = 0x02 (i.e. advertize),
- *   SUBN_ADM_DELETE = 0x15 (i.e. un-advertize).
- * If callback is NULL, the function is synchronous (and context is ignored).
- */
-int kibnal_advertize_op(ptl_nid_t nid, int op, sa_request_cb_t callback, void *context)
+/* Fill in the header fields of an outgoing message: magic, version,
+ * 'credits', our own NID and incarnation stamp as source, and the given
+ * 'dstnid'/'dststamp' as destination.  ibm_type and ibm_nob are NOT
+ * written here and must already be set.  When IBNAL_CKSUM is non-zero
+ * the checksum over the first ibm_nob bytes is stored in ibm_cksum;
+ * otherwise ibm_cksum is left at 0. */
+void
+kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid, __u64 dststamp)
 {
-        struct sa_request *request;
-        int ret;
+        /* CAVEAT EMPTOR! all message fields not set here should have been
+         * initialised previously. */
+        msg->ibm_magic    = IBNAL_MSG_MAGIC;
+        msg->ibm_version  = IBNAL_MSG_VERSION;
+        /*   ibm_type */
+        msg->ibm_credits  = credits;
+        /*   ibm_nob */
+        msg->ibm_cksum    = 0;
+        msg->ibm_srcnid   = kibnal_lib.libnal_ni.ni_pid.nid;
+        msg->ibm_srcstamp = kibnal_data.kib_incarnation;
+        msg->ibm_dstnid   = dstnid;
+        msg->ibm_dststamp = dststamp;
+#if IBNAL_CKSUM
+        /* NB ibm_cksum zero while computing cksum */
+        msg->ibm_cksum    = kibnal_cksum(msg, msg->ibm_nob);
+#endif
+}
 
-        LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+/*
+ * Validate a received message in place and convert it to host byte order.
+ *
+ * msg: buffer holding the received message
+ * nob: number of bytes actually received
+ *
+ * The peer's byte order is deduced from ibm_magic; if it is byte-swapped
+ * every multi-byte field that is used is swabbed (ibm_magic itself is left
+ * as received).  The version, the sizes and -- when the sender supplied a
+ * non-zero ibm_cksum -- the checksum are verified, then the type-specific
+ * body is size-checked and swabbed.
+ *
+ * Returns 0 on success, -EPROTO on any malformed message.
+ */
+int
+kibnal_unpack_msg(kib_msg_t *msg, int nob)
+{
+        const int hdr_size = offsetof(kib_msg_t, ibm_u);
+        __u32     msg_cksum;
+        int       flip;
+        int       msg_nob;
+        int       i;
+        int       n;
+
+        /* 6 bytes are enough to have received magic + version */
+        if (nob < 6) {
+                CERROR("Short message: %d\n", nob);
+                return -EPROTO;
+        }
+
+        if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
+                flip = 0;
+        } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
+                flip = 1;
+        } else {
+                CERROR("Bad magic: %08x\n", msg->ibm_magic);
+                return -EPROTO;
+        }
 
-        CDEBUG(D_NET, "kibnal_advertize_op: nid="LPX64", op=%d\n", nid, op);
+        if (msg->ibm_version != 
+            (flip ? __swab16(IBNAL_MSG_VERSION) : IBNAL_MSG_VERSION)) {
+                CERROR("Bad version: %d\n", msg->ibm_version);
+                return -EPROTO;
+        }
 
-        request = alloc_sa_request();
-        if (request == NULL) {
-                CERROR("Cannot allocate a SA request");
-                return -ENOMEM;
+        if (nob < hdr_size) {
+                CERROR("Short message: %d\n", nob);
+                return -EPROTO;
         }
-                
-        fill_sa_request(request, op, nid);
 
-        if (callback) {
-                request->callback = callback;
-                request->context = context;
-        } else {
-                init_completion(&request->signal);
+        msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
+        if (msg_nob > nob) {
+                CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
+                return -EPROTO;
         }
 
-        ret = vibnal_start_sa_request(request);
-        if (ret) {
-                CERROR("vibnal_send_sa failed: %d\n", ret);
-                free_sa_request(request);
-        } else {
-                if (callback) {
-                        /* Return. The callback will have to free the SA request. */
-                        ret = 0;
-                } else {
-                        wait_for_completion(&request->signal);
+        /* ibm_nob comes from the peer: it must at least cover the header.
+         * This is a signed comparison, so it also rejects a size that went
+         * negative in 'msg_nob'; the per-type checks below compare against
+         * size_t expressions, where a negative value would be converted to
+         * a huge unsigned number and slip through. */
+        if (msg_nob < hdr_size) {
+                CERROR("Short message: claimed %d, header is %d\n",
+                       msg_nob, hdr_size);
+                return -EPROTO;
+        }
+
+        /* checksum must be computed with ibm_cksum zero and BEFORE anything
+         * gets flipped */
+        msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
+        msg->ibm_cksum = 0;
+        if (msg_cksum != 0 &&
+            msg_cksum != kibnal_cksum(msg, msg_nob)) {
+                CERROR("Bad checksum\n");
+                return -EPROTO;
+        }
+        msg->ibm_cksum = msg_cksum;
+        
+        if (flip) {
+                /* leave magic unflipped as a clue to peer endianness */
+                __swab16s(&msg->ibm_version);
+                CLASSERT (sizeof(msg->ibm_type) == 1);
+                CLASSERT (sizeof(msg->ibm_credits) == 1);
+                msg->ibm_nob = msg_nob;
+                __swab64s(&msg->ibm_srcnid);
+                __swab64s(&msg->ibm_srcstamp);
+                __swab64s(&msg->ibm_dstnid);
+                __swab64s(&msg->ibm_dststamp);
+        }
+        
+        if (msg->ibm_srcnid == PTL_NID_ANY) {
+                CERROR("Bad src nid: "LPX64"\n", msg->ibm_srcnid);
+                return -EPROTO;
+        }
 
-                        ret = request->status;
+        switch (msg->ibm_type) {
+        default:
+                CERROR("Unknown message type %x\n", msg->ibm_type);
+                return -EPROTO;
+                
+        case IBNAL_MSG_NOOP:
+                break;
 
-                        if (ret != 0) {
-                                CERROR ("Error %d in advertising operation %d for NID "LPX64"\n",
-                                        ret, op, kibnal_data.kib_nid);
+        case IBNAL_MSG_IMMEDIATE:
+                if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
+                        CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
+                               (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
+                        return -EPROTO;
+                }
+                break;
+
+        case IBNAL_MSG_PUT_REQ:
+                /* CAVEAT EMPTOR!  We don't actually put ibprm_rd on the wire;
+                 * it's just there to remember the source buffers while we wait
+                 * for the PUT_ACK */
+                if (msg_nob < offsetof(kib_msg_t, ibm_u.putreq.ibprm_rd)) {
+                        /* report the size that was actually required */
+                        CERROR("Short PUT_REQ: %d(%d)\n", msg_nob,
+                               (int)offsetof(kib_msg_t, ibm_u.putreq.ibprm_rd));
+                        return -EPROTO;
+                }
+                break;
+
+        case IBNAL_MSG_PUT_ACK:
+                if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[0])) {
+                        CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
+                               (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[0]));
+                        return -EPROTO;
+                }
+
+                if (flip) {
+                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
+                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag);
+                }
+                
+                n = msg->ibm_u.putack.ibpam_rd.rd_nfrag;
+                if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
+                        CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n", 
+                               n, IBNAL_MAX_RDMA_FRAGS);
+                        return -EPROTO;
+                }
+                
+                if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) {
+                        CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
+                               (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n]));
+                        return -EPROTO;
+                }
+
+                if (flip)
+                        for (i = 0; i < n; i++) {
+                                __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob);
+                                __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_lo);
+                                __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_hi);
                         }
-                        
-                        free_sa_request(request);
+                break;
+
+        case IBNAL_MSG_GET_REQ:
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) {
+                        CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
+                               (int)(hdr_size + sizeof(msg->ibm_u.get)));
+                        return -EPROTO;
+                }
+                if (flip) {
+                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
+                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag);
                 }
-        }
 
-        return ret;
+                n = msg->ibm_u.get.ibgm_rd.rd_nfrag;
+                if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
+                        CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n", 
+                               n, IBNAL_MAX_RDMA_FRAGS);
+                        return -EPROTO;
+                }
+                
+                if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) {
+                        CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
+                               (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n]));
+                        return -EPROTO;
+                }
+                
+                if (flip)
+                        /* bound by the validated count, as for PUT_ACK */
+                        for (i = 0; i < n; i++) {
+                                __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob);
+                                __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_lo);
+                                __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_hi);
+                        }
+                break;
+
+        case IBNAL_MSG_PUT_NAK:
+        case IBNAL_MSG_PUT_DONE:
+        case IBNAL_MSG_GET_DONE:
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
+                        CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
+                               (int)(hdr_size + sizeof(msg->ibm_u.completion)));
+                        return -EPROTO;
+                }
+                if (flip)
+                        __swab32s(&msg->ibm_u.completion.ibcm_status);
+                break;
+
+        case IBNAL_MSG_CONNREQ:
+        case IBNAL_MSG_CONNACK:
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
+                        CERROR("Short connreq/ack: %d(%d)\n", msg_nob,
+                               (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
+                        return -EPROTO;
+                }
+                if (flip) {
+                        __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth);
+                        __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
+                        __swab32s(&msg->ibm_u.connparams.ibcp_max_frags);
+                }
+                break;
+        }
+        return 0;
 }
 
+/*
+ * Change this node's NID to 'nid' (PTL_NID_ANY means "no NID").
+ *
+ * Under kib_nid_mutex: any existing listener is cancelled and its CEP
+ * destroyed, the new NID is installed and kib_incarnation bumped, all
+ * peers are deleted, and -- unless the new NID is PTL_NID_ANY -- a fresh
+ * CEP is created and set listening on kib_svc_id.  Setting the NID that
+ * is already in force is a no-op.
+ *
+ * Returns 0 on success, -ENOMEM if the listen CEP can't be created or
+ * -EINVAL if cm_listen() fails; on failure the NID is reset to
+ * PTL_NID_ANY and all peers are deleted again.
+ */
-static int
+int
 kibnal_set_mynid(ptl_nid_t nid)
 {
-        struct timeval tv;
-        lib_ni_t      *ni = &kibnal_lib.libnal_ni;
-        int            rc;
-        vv_return_t    retval;
+        static cm_listen_data_t info;           /* protected by kib_nid_mutex */
+
+        lib_ni_t        *ni = &kibnal_lib.libnal_ni;
+        int              rc;
+        cm_return_t      cmrc;
 
         CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
                nid, ni->ni_pid.nid);
 
-        do_gettimeofday(&tv);
-
         down (&kibnal_data.kib_nid_mutex);
 
-        if (nid == kibnal_data.kib_nid) {
+        if (nid == ni->ni_pid.nid) {
                 /* no change of NID */
                 up (&kibnal_data.kib_nid_mutex);
                 return (0);
         }
 
-        CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
-               kibnal_data.kib_nid, nid);
-
-        /* Unsubscribes the current NID */
-        if (kibnal_data.kib_nid != PTL_NID_ANY) {
+        CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", ni->ni_pid.nid, nid);
 
-                rc = kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_DELETE, NULL, NULL);
+        if (kibnal_data.kib_listen_handle != NULL) {
+                cmrc = cm_cancel(kibnal_data.kib_listen_handle);
+                if (cmrc != cm_stat_success)
+                        CERROR ("Error %d stopping listener\n", cmrc);
 
-                if (rc) {
-                        CERROR("Error %d unadvertising NID "LPX64"\n",
-                               rc, kibnal_data.kib_nid);
-                }
-        }
+                kibnal_pause(HZ/10);            /* ensure no more callbacks */
         
-        kibnal_data.kib_nid = ni->ni_pid.nid = nid;
-        kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+                cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
+                /* cmrc is a cm_return_t: test it against the CM status
+                 * code, like every other CM call in this function */
+                if (cmrc != cm_stat_success)
+                        CERROR ("Error %d destroying CEP\n", cmrc);
 
-        /* Destroys the current endpoint, if any. */
-        if (kibnal_data.kib_cep) {
-                retval = cm_cancel(kibnal_data.kib_cep);
-                if (retval)
-                        CERROR ("Error %d stopping listener\n", retval);
-        
-                retval = cm_destroy_cep(kibnal_data.kib_cep);
-                if (retval)
-                        CERROR ("Error %d destroying CEP\n", retval);
-        
-                kibnal_data.kib_cep = NULL;
+                kibnal_data.kib_listen_handle = NULL;
         }
-        
+
+        /* Change NID.  NB queued passive connection requests (if any) will be
+         * rejected with an incorrect destination NID */
+        ni->ni_pid.nid = nid;
+        kibnal_data.kib_incarnation++;
+        mb();
+
         /* Delete all existing peers and their connections after new
          * NID/incarnation set to ensure no old connections in our brave
          * new world. */
         kibnal_del_peer (PTL_NID_ANY, 0);
 
-        if (kibnal_data.kib_nid == PTL_NID_ANY) {
-                /* No new NID to install. The driver is shuting down. */
-                up (&kibnal_data.kib_nid_mutex);
-                return (0);
-        }
-
-        /* remove any previous advert (crashed node etc) */
-        kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_DELETE, NULL, NULL);
-
-        kibnal_data.kib_cep = cm_create_cep(cm_cep_transp_rc);
-        if (kibnal_data.kib_cep == NULL) {
-                CERROR ("Can't create CEP\n");
-                rc = -ENOMEM;
-        } else {
-                cm_return_t cmret;
-                cm_listen_data_t info;
+        if (ni->ni_pid.nid != PTL_NID_ANY) {    /* got a new NID to install */
+                kibnal_data.kib_listen_handle = 
+                        cm_create_cep(cm_cep_transp_rc);
+                if (kibnal_data.kib_listen_handle == NULL) {
+                        CERROR ("Can't create listen CEP\n");
+                        rc = -ENOMEM;
+                        goto failed_0;
+                }
 
-                CDEBUG(D_NET, "Created CEP %p for listening\n", kibnal_data.kib_cep);
+                CDEBUG(D_NET, "Created CEP %p for listening\n", 
+                       kibnal_data.kib_listen_handle);
 
                 memset(&info, 0, sizeof(info));
-                info.listen_addr.end_pt.sid = kibnal_data.kib_service_id;
+                info.listen_addr.end_pt.sid = kibnal_data.kib_svc_id;
 
-                cmret = cm_listen(kibnal_data.kib_cep, &info,
-                                  kibnal_listen_callback, NULL);
-                if (cmret) {
-                        CERROR ("cm_listen error: %d\n", cmret);
+                cmrc = cm_listen(kibnal_data.kib_listen_handle, &info,
+                                 kibnal_listen_callback, NULL);
+                if (cmrc != cm_stat_success) {
+                        CERROR ("cm_listen error: %d\n", cmrc);
                         rc = -EINVAL;
-                } else {
-                        rc = 0;
+                        goto failed_1;
                 }
         }
-        
-        if (rc == 0) {
-                rc = kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_SET, NULL, NULL);
-                if (rc == 0) {
-#ifdef IBNAL_CHECK_ADVERT
-                        kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_GET, NULL, NULL);
-#endif
-                        up (&kibnal_data.kib_nid_mutex);
-                        return (0);
-                }
-                
-                retval = cm_cancel (kibnal_data.kib_cep);
-                if (retval)
-                        CERROR("cm_cancel failed: %d\n", retval);
 
-                retval = cm_destroy_cep (kibnal_data.kib_cep);
-                if (retval)
-                        CERROR("cm_destroy_cep failed: %d\n", retval);
-
-                /* remove any peers that sprung up while I failed to
-                 * advertise myself */
-                kibnal_del_peer (PTL_NID_ANY, 0);
-        }
+        up (&kibnal_data.kib_nid_mutex);
+        return (0);
 
-        kibnal_data.kib_nid = PTL_NID_ANY;
+ failed_1:
+        /* listen failed: release the CEP created above */
+        cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
+        LASSERT (cmrc == cm_stat_success);
+        kibnal_data.kib_listen_handle = NULL;
+ failed_0:
+        /* fall back to "no NID" and drop any peers created meanwhile */
+        ni->ni_pid.nid = PTL_NID_ANY;
+        kibnal_data.kib_incarnation++;
+        mb();
+        kibnal_del_peer (PTL_NID_ANY, 0);
         up (&kibnal_data.kib_nid_mutex);
-        return (rc);
+        return rc;
 }
 
 kib_peer_t *
@@ -340,7 +399,12 @@ kibnal_create_peer (ptl_nid_t nid)
         peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
 
         atomic_inc (&kibnal_data.kib_npeers);
-        return (peer);
+        if (atomic_read(&kibnal_data.kib_npeers) <= IBNAL_CONCURRENT_PEERS)
+                return peer;
+        
+        CERROR("Too many peers: CQ will overflow\n");
+        kibnal_peer_decref(peer);
+        return NULL;
 }
 
 void
@@ -390,21 +454,6 @@ kibnal_find_peer_locked (ptl_nid_t nid)
         return (NULL);
 }
 
-kib_peer_t *
-kibnal_get_peer (ptl_nid_t nid)
-{
-        kib_peer_t     *peer;
-        unsigned long   flags;
-
-        read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
-        peer = kibnal_find_peer_locked (nid);
-        if (peer != NULL)                       /* +1 ref for caller? */
-                kib_peer_addref(peer);
-        read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
-
-        return (peer);
-}
-
 void
 kibnal_unlink_peer_locked (kib_peer_t *peer)
 {
@@ -414,16 +463,17 @@ kibnal_unlink_peer_locked (kib_peer_t *peer)
         LASSERT (kibnal_peer_active(peer));
         list_del_init (&peer->ibp_list);
         /* lose peerlist's ref */
-        kib_peer_decref(peer);
+        kibnal_peer_decref(peer);
 }
 
-static int
-kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
+int
+kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp,
+                      int *persistencep)
 {
         kib_peer_t        *peer;
         struct list_head  *ptmp;
-        unsigned long      flags;
         int                i;
+        unsigned long      flags;
 
         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
 
@@ -440,6 +490,7 @@ kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
                                 continue;
 
                         *nidp = peer->ibp_nid;
+                        *ipp = peer->ibp_ip;
                         *persistencep = peer->ibp_persistence;
 
                         read_unlock_irqrestore(&kibnal_data.kib_global_lock,
@@ -452,12 +503,14 @@ kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
         return (-ENOENT);
 }
 
-static int
-kibnal_add_persistent_peer (ptl_nid_t nid)
+int
+kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip)
 {
-        unsigned long      flags;
         kib_peer_t        *peer;
         kib_peer_t        *peer2;
+        unsigned long      flags;
+
+        CDEBUG(D_NET, LPX64"@%08x\n", nid, ip);
         
         if (nid == PTL_NID_ANY)
                 return (-EINVAL);
@@ -466,11 +519,11 @@ kibnal_add_persistent_peer (ptl_nid_t nid)
         if (peer == NULL)
                 return (-ENOMEM);
 
-        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
 
         peer2 = kibnal_find_peer_locked (nid);
         if (peer2 != NULL) {
-                kib_peer_decref (peer);
+                kibnal_peer_decref (peer);
                 peer = peer2;
         } else {
                 /* peer table takes existing ref on peer */
@@ -478,13 +531,14 @@ kibnal_add_persistent_peer (ptl_nid_t nid)
                                kibnal_nid2peerlist (nid));
         }
 
+        peer->ibp_ip = ip;
         peer->ibp_persistence++;
         
-        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
         return (0);
 }
 
-static void
+void
 kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
 {
         struct list_head *ctmp;
@@ -517,16 +571,16 @@ kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
 int
 kibnal_del_peer (ptl_nid_t nid, int single_share)
 {
-        unsigned long      flags;
         struct list_head  *ptmp;
         struct list_head  *pnxt;
         kib_peer_t        *peer;
         int                lo;
         int                hi;
         int                i;
+        unsigned long      flags;
         int                rc = -ENOENT;
 
-        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
 
         if (nid != PTL_NID_ANY)
                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
@@ -553,20 +607,19 @@ kibnal_del_peer (ptl_nid_t nid, int single_share)
                 }
         }
  out:
-        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
-
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
         return (rc);
 }
 
-static kib_conn_t *
+kib_conn_t *
 kibnal_get_conn_by_idx (int index)
 {
         kib_peer_t        *peer;
         struct list_head  *ptmp;
         kib_conn_t        *conn;
         struct list_head  *ctmp;
-        unsigned long      flags;
         int                i;
+        unsigned long      flags;
 
         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
 
@@ -583,10 +636,7 @@ kibnal_get_conn_by_idx (int index)
                                         continue;
 
                                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
-                                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                                       atomic_read (&conn->ibc_refcount));
-                                atomic_inc (&conn->ibc_refcount);
+                                kibnal_conn_addref(conn);
                                 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                                        flags);
                                 return (conn);
@@ -598,19 +648,124 @@ kibnal_get_conn_by_idx (int index)
         return (NULL);
 }
 
+int     /* 0 on success; -EIO if vv_qp_modify() fails */
+kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state)
+{       /* Fill in the attributes needed for 'new_state' and apply them to conn->ibc_qp */
+        static vv_qp_attr_t attr;
+        
+        kib_connvars_t   *cv = conn->ibc_connvars; /* read only in the init/rtr/rts cases */
+        vv_return_t       vvrc;
+        
+        /* Only called by connd (asserted below), so the single static 'attr' is never shared => static OK */
+        LASSERT (!in_interrupt());
+        LASSERT (current == kibnal_data.kib_connd);
+
+        memset(&attr, 0, sizeof(attr)); /* every field not assigned below stays zero */
+        
+        switch (new_state) {
+        default:
+                LBUG(); /* any target state without a case below is a caller bug */
+                
+        case vv_qp_state_init: {
+                struct vv_qp_modify_init_st *init = &attr.modify.params.init;
+
+                init->p_key_indx     = cv->cv_pkey_index;
+                init->phy_port_num   = cv->cv_port;
+                init->q_key          = IBNAL_QKEY; /* XXX but VV_QP_AT_Q_KEY not set! */
+                init->access_control = vv_acc_r_mem_read |
+                                       vv_acc_r_mem_write; /* XXX vv_acc_l_mem_write ? */
+
+                attr.modify.vv_qp_attr_mask = VV_QP_AT_P_KEY_IX | 
+                                              VV_QP_AT_PHY_PORT_NUM |
+                                              VV_QP_AT_ACCESS_CON_F;
+                break;
+        }
+        case vv_qp_state_rtr: {
+                struct vv_qp_modify_rtr_st *rtr = &attr.modify.params.rtr;
+                vv_add_vec_t               *av  = &rtr->remote_add_vec;
+
+                av->dlid                      = cv->cv_path.dlid;
+                av->grh_flag                  = (!IBNAL_LOCAL_SUB); /* set only when IBNAL_LOCAL_SUB is 0 */
+                av->max_static_rate           = IBNAL_R_2_STATIC_RATE(cv->cv_path.rate);
+                av->service_level             = cv->cv_path.sl;
+                av->source_path_bit           = IBNAL_SOURCE_PATH_BIT;
+                av->pmtu                      = cv->cv_path.mtu;
+                av->rnr_retry_count           = cv->cv_rnr_count;
+                av->global_dest.traffic_class = cv->cv_path.traffic_class;
+                av->global_dest.hope_limit    = cv->cv_path.hop_limut;
+                av->global_dest.flow_lable    = cv->cv_path.flow_label;
+                av->global_dest.s_gid_index   = cv->cv_sgid_index;
+                // XXX other av fields zero?
+
+                rtr->destanation_qp            = cv->cv_remote_qpn;
+                rtr->receive_psn               = cv->cv_rxpsn;
+                rtr->responder_rdma_r_atom_num = IBNAL_OUS_DST_RD;
+
+                // XXX ? rtr->opt_min_rnr_nak_timer = 16;
+
+
+                // XXX sdp sets VV_QP_AT_OP_F but no actual optional options
+                attr.modify.vv_qp_attr_mask = VV_QP_AT_ADD_VEC | 
+                                              VV_QP_AT_DEST_QP |
+                                              VV_QP_AT_R_PSN | 
+                                              VV_QP_AT_MIN_RNR_NAK_T | /* NOTE(review): opt_min_rnr_nak_timer is never assigned here (stays 0 from the memset) - confirm intended */
+                                              VV_QP_AT_RESP_RDMA_ATOM_OUT_NUM |
+                                              VV_QP_AT_OP_F;
+                break;
+        }
+        case vv_qp_state_rts: {
+                struct vv_qp_modify_rts_st *rts = &attr.modify.params.rts;
+
+                rts->send_psn                 = cv->cv_txpsn;
+                rts->local_ack_timeout        = IBNAL_LOCAL_ACK_TIMEOUT;
+                rts->retry_num                = IBNAL_RETRY_CNT;
+                rts->rnr_num                  = IBNAL_RNR_CNT;
+                rts->dest_out_rdma_r_atom_num = IBNAL_OUS_DST_RD;
+                
+                attr.modify.vv_qp_attr_mask = VV_QP_AT_S_PSN |
+                                              VV_QP_AT_L_ACK_T |
+                                              VV_QP_AT_RETRY_NUM |
+                                              VV_QP_AT_RNR_NUM |
+                                              VV_QP_AT_DEST_RDMA_ATOM_OUT_NUM;
+                break;
+        }
+        case vv_qp_state_error:
+        case vv_qp_state_reset:
+                attr.modify.vv_qp_attr_mask = 0; /* nothing but the state itself; 'cv' is not touched */
+                break;
+        }
+                
+        attr.modify.qp_modify_into_state = new_state;
+        attr.modify.vv_qp_attr_mask |= VV_QP_AT_STATE; /* the state bit is added for every transition */
+        
+        vvrc = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &attr, NULL);
+        if (vvrc != vv_return_ok) {
+                CERROR("Can't modify qp -> "LPX64" state to %d: %d\n", 
+                       conn->ibc_peer->ibp_nid, new_state, vvrc);
+                return -EIO; /* vvrc is logged above but not returned to the caller */
+        }
+        
+        return 0;
+}
+
 kib_conn_t *
-kibnal_create_conn (void)
+kibnal_create_conn (cm_cep_handle_t cep)
 {
-        kib_conn_t  *conn;
-        int          i;
-        __u64        vaddr = 0;
-        __u64        vaddr_base;
-        int          page_offset;
-        int          ipage;
-        vv_qp_attr_t qp_attr;
-        vv_return_t  retval;
-        int          rc;
-        void        *qp_context;
+        kib_conn_t   *conn;
+        int           i;
+        __u64         vaddr = 0;
+        __u64         vaddr_base;
+        int           page_offset;
+        int           ipage;
+        vv_return_t   vvrc;
+        int           rc;
+
+        static vv_qp_attr_t  reqattr;
+        static vv_qp_attr_t  rspattr;
+
+        /* Only the connd creates conns => single threaded */
+        LASSERT(!in_interrupt());
+        LASSERT(current == kibnal_data.kib_connd);
         
         PORTAL_ALLOC(conn, sizeof (*conn));
         if (conn == NULL) {
@@ -621,6 +776,7 @@ kibnal_create_conn (void)
         /* zero flags, NULL pointers etc... */
         memset (conn, 0, sizeof (*conn));
 
+        INIT_LIST_HEAD (&conn->ibc_early_rxs);
         INIT_LIST_HEAD (&conn->ibc_tx_queue);
         INIT_LIST_HEAD (&conn->ibc_active_txs);
         spin_lock_init (&conn->ibc_lock);
@@ -628,6 +784,18 @@ kibnal_create_conn (void)
         atomic_inc (&kibnal_data.kib_nconns);
         /* well not really, but I call destroy() on failure, which decrements */
 
+        conn->ibc_cep = cep;
+
+        PORTAL_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+        if (conn->ibc_connvars == NULL) {
+                CERROR("Can't allocate in-progress connection state\n");
+                goto failed;
+        }
+        memset (conn->ibc_connvars, 0, sizeof(*conn->ibc_connvars));
+        /* Random seed for QP sequence number */
+        get_random_bytes(&conn->ibc_connvars->cv_rxpsn,
+                         sizeof(conn->ibc_connvars->cv_rxpsn));
+
         PORTAL_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
         if (conn->ibc_rxs == NULL) {
                 CERROR("Cannot allocate RX buffers\n");
@@ -649,26 +817,27 @@ kibnal_create_conn (void)
                 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
                              page_offset);
 
-                if (kibnal_whole_mem()) {
-                        void *newaddr;
-                        vv_mem_reg_h_t mem_h;
-                        vv_r_key_t r_key;
+#if IBNAL_WHOLE_MEM
+                {
+                        vv_mem_reg_h_t  mem_h;
+                        vv_r_key_t      r_key;
 
                         /* Voltaire stack already registers the whole
                          * memory, so use that API. */
-                        retval = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
-                                                      rx->rx_msg,
-                                                      IBNAL_MSG_SIZE,
-                                                      &mem_h,
-                                                      &rx->l_key,
-                                                      &r_key);
-                        if (retval) {
-                                CERROR("vv_get_gen_mr_attrib failed: %d", retval);
-                                /* TODO: free pages? */
-                                goto failed;
-                        }
+                        vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
+                                                    rx->rx_msg,
+                                                    IBNAL_MSG_SIZE,
+                                                    &mem_h,
+                                                    &rx->rx_lkey,
+                                                    &r_key);
+                        LASSERT (vvrc == vv_return_ok);
                 }
-                
+#else
+                rx->rx_vaddr = vaddr;
+#endif                
+                CDEBUG(D_NET, "Rx[%d] %p->%p[%x:"LPX64"]\n", i, rx, 
+                       rx->rx_msg, KIBNAL_RX_LKEY(rx), KIBNAL_RX_VADDR(rx));
+
                 vaddr += IBNAL_MSG_SIZE;
                 LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
                 
@@ -682,47 +851,40 @@ kibnal_create_conn (void)
                 }
         }
 
-        qp_attr = (vv_qp_attr_t) {
-                .create.qp_type          = vv_qp_type_r_conn,
-                .create.cq_send_h        = kibnal_data.kib_cq,
-                .create.cq_receive_h     = kibnal_data.kib_cq,
-                .create.send_max_outstand_wr = IBNAL_TX_MAX_SG * 
-                                           IBNAL_MSG_QUEUE_SIZE,
-                .create.receive_max_outstand_wr = IBNAL_MSG_QUEUE_SIZE,
-                .create.max_scatgat_per_send_wr = 1,
-                .create.max_scatgat_per_receive_wr = 1,
-                .create.signaling_type   = vv_selectable_signaling, /* TODO: correct? */
-                .create.pd_h             = kibnal_data.kib_pd,
-                .create.recv_solicited_events = vv_signal_all,
-        };
-        retval = vv_qp_create(kibnal_data.kib_hca, &qp_attr, NULL,
-                              &conn->ibc_qp, &conn->ibc_qp_attrs);
-        if (retval != 0) {
-                CERROR ("Failed to create queue pair: %d\n", retval);
+        memset(&reqattr, 0, sizeof(reqattr));
+
+        reqattr.create.qp_type                    = vv_qp_type_r_conn;
+        reqattr.create.cq_send_h                  = kibnal_data.kib_cq;
+        reqattr.create.cq_receive_h               = kibnal_data.kib_cq;
+        reqattr.create.send_max_outstand_wr       = (1 + IBNAL_MAX_RDMA_FRAGS) * 
+                                                    IBNAL_MSG_QUEUE_SIZE;
+        reqattr.create.receive_max_outstand_wr    = IBNAL_RX_MSGS;
+        reqattr.create.max_scatgat_per_send_wr    = 1;
+        reqattr.create.max_scatgat_per_receive_wr = 1;
+        reqattr.create.signaling_type             = vv_selectable_signaling;
+        reqattr.create.pd_h                       = kibnal_data.kib_pd;
+        reqattr.create.recv_solicited_events      = vv_selectable_signaling; // vv_signal_all;
+
+        vvrc = vv_qp_create(kibnal_data.kib_hca, &reqattr, NULL,
+                            &conn->ibc_qp, &rspattr);
+        if (vvrc != vv_return_ok) {
+                CERROR ("Failed to create queue pair: %d\n", vvrc);
                 goto failed;
         }
 
         /* Mark QP created */
-        conn->ibc_state = IBNAL_CONN_INIT_QP;
-
-        qp_attr = (vv_qp_attr_t) {
-                .modify.qp_modify_into_state = vv_qp_state_init,
-                .modify.vv_qp_attr_mask      = VV_QP_AT_STATE | VV_QP_AT_PHY_PORT_NUM | VV_QP_AT_P_KEY_IX | VV_QP_AT_ACCESS_CON_F,
-                .modify.qp_type              = vv_qp_type_r_conn,
-
-                .modify.params.init.p_key_indx      = 0,
-                .modify.params.init.phy_port_num    = kibnal_data.kib_port,
-                .modify.params.init.access_control  = vv_acc_r_mem_write | vv_acc_r_mem_read,
-        };
-        retval = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &qp_attr, &conn->ibc_qp_attrs);
-        if (retval != 0) {
-                CERROR ("Failed to modify queue pair: %d\n", retval);
-                goto failed;
-        }
-
-        retval = vv_qp_query(kibnal_data.kib_hca, conn->ibc_qp, &qp_context, &conn->ibc_qp_attrs);
-        if (retval) {
-                CERROR ("Failed to query queue pair: %d\n", retval);
+        conn->ibc_state = IBNAL_CONN_INIT;
+        conn->ibc_connvars->cv_local_qpn = rspattr.create_return.qp_num;
+
+        if (rspattr.create_return.receive_max_outstand_wr < 
+            IBNAL_MSG_QUEUE_SIZE ||
+            rspattr.create_return.send_max_outstand_wr < 
+            (1 + IBNAL_MAX_RDMA_FRAGS) * IBNAL_MSG_QUEUE_SIZE) {
+                CERROR("Insufficient rx/tx work items: wanted %d/%d got %d/%d\n",
+                       IBNAL_MSG_QUEUE_SIZE, 
+                       (1 + IBNAL_MAX_RDMA_FRAGS) * IBNAL_MSG_QUEUE_SIZE,
+                       rspattr.create_return.receive_max_outstand_wr,
+                       rspattr.create_return.send_max_outstand_wr);
                 goto failed;
         }
 
@@ -738,91 +900,63 @@ kibnal_create_conn (void)
 void
 kibnal_destroy_conn (kib_conn_t *conn)
 {
-        vv_return_t retval;
+        vv_return_t vvrc;
+
+        /* Frees conn and what it owns.  Only the connd does this (i.e. single threaded); both conditions are asserted below */
+        LASSERT (!in_interrupt());
+        LASSERT (current == kibnal_data.kib_connd);
         
         CDEBUG (D_NET, "connection %p\n", conn);
 
         LASSERT (atomic_read (&conn->ibc_refcount) == 0);
+        LASSERT (list_empty(&conn->ibc_early_rxs));
         LASSERT (list_empty(&conn->ibc_tx_queue));
         LASSERT (list_empty(&conn->ibc_active_txs));
         LASSERT (conn->ibc_nsends_posted == 0);
-        LASSERT (conn->ibc_connreq == NULL);
 
         switch (conn->ibc_state) {
+        default:
+                /* conn must be completely disengaged from the network, i.e. in one of the states listed below */
+                LBUG();
+
         case IBNAL_CONN_DISCONNECTED:
-                /* called after connection sequence initiated */
+                /* connvars should have been freed already */
+                LASSERT (conn->ibc_connvars == NULL);
                 /* fall through */
 
-        case IBNAL_CONN_INIT_QP:
-                /* _destroy includes an implicit Reset of the QP which 
-                 * discards posted work */
-                retval = vv_qp_destroy(kibnal_data.kib_hca, conn->ibc_qp);
-                if (retval)
-                        CERROR("Can't destroy QP: %d\n", retval);
+        case IBNAL_CONN_INIT:
+                kibnal_set_qp_state(conn, vv_qp_state_reset); /* return value is ignored */
+                vvrc = vv_qp_destroy(kibnal_data.kib_hca, conn->ibc_qp);
+                if (vvrc != vv_return_ok)
+                        CERROR("Can't destroy QP: %d\n", vvrc); /* logged only; teardown carries on */
                 /* fall through */
                 
         case IBNAL_CONN_INIT_NOTHING:
                 break;
-
-        default:
-                LASSERT (0);
-        }
-
-        if (conn->ibc_cep != NULL) {
-                retval = cm_destroy_cep(conn->ibc_cep);
-                if (retval)
-                        CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep, 
-                               retval);
         }
 
         if (conn->ibc_rx_pages != NULL) 
                 kibnal_free_pages(conn->ibc_rx_pages);
-        
+
         if (conn->ibc_rxs != NULL)
                 PORTAL_FREE(conn->ibc_rxs, 
                             IBNAL_RX_MSGS * sizeof(kib_rx_t));
 
+        if (conn->ibc_connvars != NULL) /* asserted NULL above in the IBNAL_CONN_DISCONNECTED case */
+                PORTAL_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+
         if (conn->ibc_peer != NULL)
-                kib_peer_decref(conn->ibc_peer);
+                kibnal_peer_decref(conn->ibc_peer);
+
+        vvrc = cm_destroy_cep(conn->ibc_cep); /* unconditional: ibc_cep is not NULL-checked here */
+        LASSERT (vvrc == vv_return_ok);
 
         PORTAL_FREE(conn, sizeof (*conn));
 
         atomic_dec(&kibnal_data.kib_nconns);
-        
-        if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
-            kibnal_data.kib_shutdown) {
-                /* I just nuked the last connection on shutdown; wake up
-                 * everyone so they can exit. */
-                wake_up_all(&kibnal_data.kib_sched_waitq);
-                wake_up_all(&kibnal_data.kib_connd_waitq);
-        }
 }
 
-void
-kibnal_put_conn (kib_conn_t *conn)
-{
-        unsigned long flags;
-
-        CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n",
-                conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                atomic_read (&conn->ibc_refcount));
-
-        LASSERT (atomic_read (&conn->ibc_refcount) > 0);
-        if (!atomic_dec_and_test (&conn->ibc_refcount))
-                return;
-
-        /* must disconnect before dropping the final ref */
-        LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED);
-
-        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
-
-        list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
-        wake_up (&kibnal_data.kib_connd_waitq);
-
-        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
-}
-
-static int
+int
 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
 {
         kib_conn_t         *conn;
@@ -864,19 +998,19 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
         return (count);
 }
 
-static int
+int
 kibnal_close_matching_conns (ptl_nid_t nid)
 {
-        unsigned long       flags;
         kib_peer_t         *peer;
         struct list_head   *ptmp;
         struct list_head   *pnxt;
         int                 lo;
         int                 hi;
         int                 i;
+        unsigned long       flags;
         int                 count = 0;
 
-        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
 
         if (nid != PTL_NID_ANY)
                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
@@ -900,7 +1034,7 @@ kibnal_close_matching_conns (ptl_nid_t nid)
                 }
         }
 
-        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
 
         /* wildcards always succeed */
         if (nid == PTL_NID_ANY)
@@ -909,31 +1043,32 @@ kibnal_close_matching_conns (ptl_nid_t nid)
         return (count == 0 ? -ENOENT : 0);
 }
 
-static int
+int
 kibnal_cmd(struct portals_cfg *pcfg, void * private)
 {
         int rc = -EINVAL;
-        ENTRY;
 
         LASSERT (pcfg != NULL);
 
         switch(pcfg->pcfg_command) {
         case NAL_CMD_GET_PEER: {
                 ptl_nid_t   nid = 0;
+                __u32       ip = 0;
                 int         share_count = 0;
 
                 rc = kibnal_get_peer_info(pcfg->pcfg_count,
-                                          &nid, &share_count);
+                                          &nid, &ip, &share_count);
                 pcfg->pcfg_nid   = nid;
                 pcfg->pcfg_size  = 0;
-                pcfg->pcfg_id    = 0;
-                pcfg->pcfg_misc  = 0;
+                pcfg->pcfg_id    = ip;
+                pcfg->pcfg_misc  = IBNAL_SERVICE_NUMBER; /* port */
                 pcfg->pcfg_count = 0;
                 pcfg->pcfg_wait  = share_count;
                 break;
         }
         case NAL_CMD_ADD_PEER: {
-                rc = kibnal_add_persistent_peer (pcfg->pcfg_nid);
+                rc = kibnal_add_persistent_peer (pcfg->pcfg_nid,
+                                                 pcfg->pcfg_id); /* IP */
                 break;
         }
         case NAL_CMD_DEL_PEER: {
@@ -953,7 +1088,7 @@ kibnal_cmd(struct portals_cfg *pcfg, void * private)
                         pcfg->pcfg_id    = 0;
                         pcfg->pcfg_misc  = 0;
                         pcfg->pcfg_flags = 0;
-                        kibnal_put_conn (conn);
+                        kibnal_conn_decref(conn);
                 }
                 break;
         }
@@ -970,20 +1105,21 @@ kibnal_cmd(struct portals_cfg *pcfg, void * private)
         }
         }
 
-        RETURN(rc);
+        return rc;
 }
 
 void
 kibnal_free_pages (kib_pages_t *p)
 {
-        int     npages = p->ibp_npages;
-        vv_return_t retval;
-        int     i;
+        int         npages = p->ibp_npages;
+        vv_return_t vvrc;
+        int         i;
         
         if (p->ibp_mapped) {
-                retval = vv_mem_region_destroy(kibnal_data.kib_hca, p->ibp_handle);
-                if (retval != 0)
-                        CERROR ("Deregister error: %d\n", retval);
+                vvrc = vv_mem_region_destroy(kibnal_data.kib_hca, 
+                                             p->ibp_handle);
+                if (vvrc != vv_return_ok)
+                        CERROR ("Deregister error: %d\n", vvrc);
         }
         
         for (i = 0; i < npages; i++)
@@ -997,10 +1133,13 @@ int
 kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
 {
         kib_pages_t   *p;
-        vv_phy_list_t  phys_pages;
-        vv_phy_buf_t  *phys_buf;
         int            i;
-        vv_return_t    retval;
+#if !IBNAL_WHOLE_MEM
+        vv_phy_list_t            vv_phys;
+        vv_phy_buf_t            *phys_pages;
+        vv_return_t              vvrc;
+        vv_access_con_bit_mask_t access;
+#endif
 
         PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
         if (p == NULL) {
@@ -1020,57 +1159,124 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
                 }
         }
 
-        if (kibnal_whole_mem())
-                goto out;
-
-        PORTAL_ALLOC(phys_buf, npages * sizeof(vv_phy_buf_t));
-        if (phys_buf == NULL) {
-                CERROR ("Can't allocate phys_buf for %d pages\n", npages);
-                /* XXX free ibp_pages? */
+#if !IBNAL_WHOLE_MEM
+        PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
+        if (phys_pages == NULL) {
+                CERROR ("Can't allocate physarray for %d pages\n", npages);
                 kibnal_free_pages(p);
                 return (-ENOMEM);
         }
 
-        phys_pages.number_of_buff = npages;
-        phys_pages.phy_list = phys_buf;
+        vv_phys.number_of_buff = npages;
+        vv_phys.phy_list = phys_pages;
 
-        /* if we were using the _contig_ registration variant we would have
-         * an array of PhysAddr/Length pairs, but the discontiguous variant
-         * just takes the PhysAddr */
         for (i = 0; i < npages; i++) {
-                phys_buf[i].start = kibnal_page2phys(p->ibp_pages[i]);
-                phys_buf[i].size = PAGE_SIZE;
-        }
-
-        retval = vv_phy_mem_region_register(kibnal_data.kib_hca,
-                                            &phys_pages,
-                                            0, /* requested vaddr */
-                                            npages * PAGE_SIZE,
-                                            0, /* offset */
-                                            kibnal_data.kib_pd,
-                                            vv_acc_l_mem_write | vv_acc_r_mem_write | vv_acc_r_mem_read | vv_acc_mem_bind, /* TODO: translated as-is, but seems incorrect or too much */
-                                            &p->ibp_handle, &p->ibp_vaddr,                                           
-                                            &p->ibp_lkey, &p->ibp_rkey);
+                phys_pages[i].size = PAGE_SIZE;
+                phys_pages[i].start = 
+                        kibnal_page2phys(p->ibp_pages[i]);
+        }
+
+        VV_ACCESS_CONTROL_MASK_SET_ALL(access);
+        
+        vvrc = vv_phy_mem_region_register(kibnal_data.kib_hca,
+                                          &vv_phys,
+                                          0, /* requested vaddr */
+                                          npages * PAGE_SIZE, 0, /* offset */
+                                          kibnal_data.kib_pd,
+                                          access,
+                                          &p->ibp_handle, 
+                                          &p->ibp_vaddr,                                           
+                                          &p->ibp_lkey, 
+                                          &p->ibp_rkey);
         
-        PORTAL_FREE(phys_buf, npages * sizeof(vv_phy_buf_t));
+        PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
         
-        if (retval) {
-                CERROR ("Error %d mapping %d pages\n", retval, npages);
+        if (vvrc != vv_return_ok) {
+                CERROR ("Error %d mapping %d pages\n", vvrc, npages);
                 kibnal_free_pages(p);
-                return (-ENOMEM);
+                return (-EFAULT);
         }
 
         CDEBUG(D_NET, "registered %d pages; handle: %x vaddr "LPX64" "
-                      "lkey %x rkey %x\n", npages, p->ibp_handle,
-                      p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
+               "lkey %x rkey %x\n", npages, p->ibp_handle,
+               p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
         
         p->ibp_mapped = 1;
-out:
+#endif
         *pp = p;
         return (0);
 }
 
-static int
+int     /* 0 on success; -ENOMEM as soon as any allocation fails */
+kibnal_alloc_tx_descs (void) 
+{       /* Allocate kibnal_data.kib_tx_descs[] plus each descriptor's tx_wrq, tx_gl and tx_rd */
+        int    i;
+        
+        PORTAL_ALLOC (kibnal_data.kib_tx_descs,
+                      IBNAL_TX_MSGS * sizeof(kib_tx_t));
+        if (kibnal_data.kib_tx_descs == NULL)
+                return -ENOMEM;
+        
+        memset(kibnal_data.kib_tx_descs, 0,
+               IBNAL_TX_MSGS * sizeof(kib_tx_t)); /* members not yet allocated below read as zero */
+
+        for (i = 0; i < IBNAL_TX_MSGS; i++) {
+                kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
+
+                PORTAL_ALLOC(tx->tx_wrq, 
+                             (1 + IBNAL_MAX_RDMA_FRAGS) * 
+                             sizeof(*tx->tx_wrq));
+                if (tx->tx_wrq == NULL)
+                        return -ENOMEM; /* nothing allocated so far is released here */
+                
+                PORTAL_ALLOC(tx->tx_gl, 
+                             (1 + IBNAL_MAX_RDMA_FRAGS) * 
+                             sizeof(*tx->tx_gl));
+                if (tx->tx_gl == NULL)
+                        return -ENOMEM; /* as above: partial allocations are left in place */
+                
+                PORTAL_ALLOC(tx->tx_rd, 
+                             offsetof(kib_rdma_desc_t, 
+                                      rd_frags[IBNAL_MAX_RDMA_FRAGS])); /* header plus IBNAL_MAX_RDMA_FRAGS rd_frags[] entries */
+                if (tx->tx_rd == NULL)
+                        return -ENOMEM; /* as above: partial allocations are left in place */
+        }
+
+        return 0;
+}
+
+void    /* Free each descriptor's tx_wrq, tx_gl and tx_rd, then the kib_tx_descs array itself */
+kibnal_free_tx_descs (void) 
+{
+        int    i;
+
+        if (kibnal_data.kib_tx_descs == NULL)
+                return; /* descriptor array was never allocated: nothing to free */
+
+        for (i = 0; i < IBNAL_TX_MSGS; i++) {
+                kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
+
+                if (tx->tx_wrq != NULL) /* NULL members are skipped, so a partly populated array is handled */
+                        PORTAL_FREE(tx->tx_wrq, 
+                                    (1 + IBNAL_MAX_RDMA_FRAGS) * 
+                                    sizeof(*tx->tx_wrq));
+
+                if (tx->tx_gl != NULL)
+                        PORTAL_FREE(tx->tx_gl, 
+                                    (1 + IBNAL_MAX_RDMA_FRAGS) * 
+                                    sizeof(*tx->tx_gl));
+
+                if (tx->tx_rd != NULL)
+                        PORTAL_FREE(tx->tx_rd, 
+                                    offsetof(kib_rdma_desc_t, 
+                                             rd_frags[IBNAL_MAX_RDMA_FRAGS])); /* same size expression as the allocation in kibnal_alloc_tx_descs() */
+        }
+
+        PORTAL_FREE(kibnal_data.kib_tx_descs,
+                    IBNAL_TX_MSGS * sizeof(kib_tx_t)); /* NOTE(review): kib_tx_descs is not reset to NULL in this function */
+}
+
+int
 kibnal_setup_tx_descs (void)
 {
         int           ipage = 0;
@@ -1083,10 +1289,10 @@ kibnal_setup_tx_descs (void)
         int           rc;
 
         /* pre-mapped messages are not bigger than 1 page */
-        LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
+        CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
 
         /* No fancy arithmetic when we do the buffer calculations */
-        LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
+        CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
 
         rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, 
                                 0);
@@ -1100,35 +1306,32 @@ kibnal_setup_tx_descs (void)
                 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
                 tx = &kibnal_data.kib_tx_descs[i];
 
-                memset (tx, 0, sizeof(*tx));    /* zero flags etc */
-                
                 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
                                            page_offset);
-
-                if (kibnal_whole_mem()) {
-                        void *newaddr;
-                        vv_mem_reg_h_t mem_h;
-                        vv_return_t  retval;
+#if IBNAL_WHOLE_MEM
+                {
+                        vv_mem_reg_h_t  mem_h;
+                        vv_r_key_t      rkey;
+                        vv_return_t     vvrc;
 
                         /* Voltaire stack already registers the whole
                          * memory, so use that API. */
-                        retval = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
-                                                      tx->tx_msg,
-                                                      IBNAL_MSG_SIZE,
-                                                      &mem_h,
-                                                      &tx->l_key,
-                                                      &tx->r_key);
-                        if (retval) {
-                                CERROR("vv_get_gen_mr_attrib failed: %d", retval);
-                                /* TODO: free pages? */
-                                /* TODO: return. */
-                        }
+                        vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
+                                                    tx->tx_msg,
+                                                    IBNAL_MSG_SIZE,
+                                                    &mem_h,
+                                                    &tx->tx_lkey,
+                                                    &rkey);
+                        LASSERT (vvrc == vv_return_ok);
                 }
-
+#else
+                tx->tx_vaddr = vaddr;
+#endif
                 tx->tx_isnblk = (i >= IBNAL_NTX);
                 tx->tx_mapped = KIB_TX_UNMAPPED;
 
-                CDEBUG(D_NET, "Tx[%d] %p->%p\n", i, tx, tx->tx_msg);
+                CDEBUG(D_NET, "Tx[%d] %p->%p[%x:"LPX64"]\n", i, tx, 
+                       tx->tx_msg, KIBNAL_TX_LKEY(tx), KIBNAL_TX_VADDR(tx));
 
                 if (tx->tx_isnblk)
                         list_add (&tx->tx_list, 
@@ -1153,12 +1356,11 @@ kibnal_setup_tx_descs (void)
         return (0);
 }
 
-static void
+void
 kibnal_api_shutdown (nal_t *nal)
 {
-        int   i;
-        int   rc;
-        vv_return_t retval;
+        int         i;
+        vv_return_t vvrc;
 
         if (nal->nal_refct != 0) {
                 /* This module got the first ref */
@@ -1178,16 +1380,16 @@ kibnal_api_shutdown (nal_t *nal)
                 libcfs_nal_cmd_unregister(VIBNAL);
                 /* No new peers */
 
-                /* resetting my NID to unadvertises me, removes my
-                 * listener and nukes all current peers */
+                /* resetting my NID removes my listener and nukes all current
+                 * peers and their connections */
                 kibnal_set_mynid (PTL_NID_ANY);
 
-                /* Wait for all peer state to clean up (crazy) */
+                /* Wait for all peer state to clean up */
                 i = 2;
                 while (atomic_read (&kibnal_data.kib_npeers) != 0) {
                         i++;
                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
-                               "waiting for %d peers to disconnect (can take a few seconds)\n",
+                               "waiting for %d peers to disconnect\n",
                                atomic_read (&kibnal_data.kib_npeers));
                         set_current_state (TASK_UNINTERRUPTIBLE);
                         schedule_timeout (HZ);
@@ -1195,56 +1397,36 @@ kibnal_api_shutdown (nal_t *nal)
                 /* fall through */
 
         case IBNAL_INIT_CQ:
-                retval = vv_cq_destroy(kibnal_data.kib_hca, kibnal_data.kib_cq);
-                if (retval)
-                        CERROR ("Destroy CQ error: %d\n", retval);
+                vvrc = vv_cq_destroy(kibnal_data.kib_hca, kibnal_data.kib_cq);
+                if (vvrc != vv_return_ok)
+                        CERROR ("Destroy CQ error: %d\n", vvrc);
                 /* fall through */
 
         case IBNAL_INIT_TXD:
                 kibnal_free_pages (kibnal_data.kib_tx_pages);
                 /* fall through */
 
-#if IBNAL_FMR
-        case IBNAL_INIT_FMR:
-                rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
-                if (rc != 0)
-                        CERROR ("Destroy FMR pool error: %d\n", rc);
-                /* fall through */
-#endif
         case IBNAL_INIT_PD:
-#if IBNAL_WHOLE_MEM==0
-                retval = vv_pd_deallocate(kibnal_data.kib_hca, kibnal_data.kib_pd);
-                if (retval != 0)
-                        CERROR ("Destroy PD error: %d\n", retval);
+#if !IBNAL_WHOLE_MEM
+                vvrc = vv_pd_deallocate(kibnal_data.kib_hca,
+                                        kibnal_data.kib_pd);
+                if (vvrc != vv_return_ok)
+                        CERROR ("Destroy PD error: %d\n", vvrc);
 #endif
                 /* fall through */
 
-        case IBNAL_INIT_GSI:
-                retval = gsi_deregister_class(kibnal_data.gsi_handle);
-                if (retval != 0)
-                        CERROR ("GSI deregister failed: %d\n", retval);
-                /* fall through */
-
-        case IBNAL_INIT_GSI_POOL:
-                gsi_dtgrm_pool_destroy(kibnal_data.gsi_pool_handle);
-                /* fall through */
-
-        case IBNAL_INIT_PORT:
-                /* XXX ??? */
-                /* fall through */
-
         case IBNAL_INIT_ASYNC:
-                retval = vv_dell_async_event_cb (kibnal_data.kib_hca,
-                                                 kibnal_ca_async_callback);
-                if (retval)
-                        CERROR("deregister asynchronous call back error: %d\n", retval);
+                vvrc = vv_dell_async_event_cb (kibnal_data.kib_hca,
+                                              kibnal_async_callback);
+                if (vvrc != vv_return_ok)
+                        CERROR("vv_dell_async_event_cb error: %d\n", vvrc);
                         
                 /* fall through */
 
         case IBNAL_INIT_HCA:
-                retval = vv_hca_close(kibnal_data.kib_hca);
-                if (retval != 0)
-                        CERROR ("Close HCA  error: %d\n", retval);
+                vvrc = vv_hca_close(kibnal_data.kib_hca);
+                if (vvrc != vv_return_ok)
+                        CERROR ("Close HCA  error: %d\n", vvrc);
                 /* fall through */
 
         case IBNAL_INIT_LIB:
@@ -1252,8 +1434,6 @@ kibnal_api_shutdown (nal_t *nal)
                 /* fall through */
 
         case IBNAL_INIT_DATA:
-                /* Module refcount only gets to zero when all peers
-                 * have been closed so all lists must be empty */
                 LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
                 LASSERT (kibnal_data.kib_peers != NULL);
                 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
@@ -1262,7 +1442,9 @@ kibnal_api_shutdown (nal_t *nal)
                 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
                 LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
                 LASSERT (list_empty (&kibnal_data.kib_sched_txq));
+                LASSERT (list_empty (&kibnal_data.kib_connd_zombies));
                 LASSERT (list_empty (&kibnal_data.kib_connd_conns));
+                LASSERT (list_empty (&kibnal_data.kib_connd_pcreqs));
                 LASSERT (list_empty (&kibnal_data.kib_connd_peers));
 
                 /* flag threads to terminate; wake and wait for them to die */
@@ -1285,9 +1467,7 @@ kibnal_api_shutdown (nal_t *nal)
                 break;
         }
 
-        if (kibnal_data.kib_tx_descs != NULL)
-                PORTAL_FREE (kibnal_data.kib_tx_descs,
-                             IBNAL_TX_MSGS * sizeof(kib_tx_t));
+        kibnal_free_tx_descs();
 
         if (kibnal_data.kib_peers != NULL)
                 PORTAL_FREE (kibnal_data.kib_peers,
@@ -1302,32 +1482,18 @@ kibnal_api_shutdown (nal_t *nal)
         kibnal_data.kib_init = IBNAL_INIT_NOTHING;
 }
 
-#define roundup_power(val, power) \
-        ( (val + (__u64)(power - 1)) & ~((__u64)(power - 1)) )
-
-/* this isn't very portable or sturdy in the face of funny mem/bus configs */
-static __u64 max_phys_mem(void)
-{
-        struct sysinfo si;
-        __u64 ret;
-
-        si_meminfo(&si);
-        ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit;
-        return roundup_power(ret, 128 * 1024 * 1024);
-} 
-#undef roundup_power
-
-static int
+int
 kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                      ptl_ni_limits_t *requested_limits,
                      ptl_ni_limits_t *actual_limits)
 {
-        ptl_process_id_t    process_id;
-        int                 pkmem = atomic_read(&portal_kmemory);
-        int                 rc;
-        int                 i;
+        struct timeval            tv;
+        ptl_process_id_t          process_id;
+        int                       pkmem = atomic_read(&portal_kmemory);
+        int                       rc;
+        int                       i;
         vv_request_event_record_t req_er;
-        vv_return_t         retval;
+        vv_return_t               vvrc;
 
         LASSERT (nal == &kibnal_api);
 
@@ -1340,9 +1506,13 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         }
 
         LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
+        memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */
+        
+        do_gettimeofday(&tv);
+        kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+        kibnal_data.kib_svc_id = IBNAL_SERVICE_NUMBER;
 
         init_MUTEX (&kibnal_data.kib_nid_mutex);
-        kibnal_data.kib_nid = PTL_NID_ANY;
 
         rwlock_init(&kibnal_data.kib_global_lock);
 
@@ -1357,7 +1527,9 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
 
         spin_lock_init (&kibnal_data.kib_connd_lock);
         INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
+        INIT_LIST_HEAD (&kibnal_data.kib_connd_pcreqs);
         INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
+        INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies);
         init_waitqueue_head (&kibnal_data.kib_connd_waitq);
 
         spin_lock_init (&kibnal_data.kib_sched_lock);
@@ -1370,22 +1542,18 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
         init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
 
-        INIT_LIST_HEAD (&kibnal_data.gsi_pending);
-        init_MUTEX (&kibnal_data.gsi_mutex);
-
-        PORTAL_ALLOC (kibnal_data.kib_tx_descs,
-                      IBNAL_TX_MSGS * sizeof(kib_tx_t));
-        if (kibnal_data.kib_tx_descs == NULL) {
-                CERROR ("Can't allocate tx descs\n");
+        rc = kibnal_alloc_tx_descs();
+        if (rc != 0) {
+                CERROR("Can't allocate tx descs\n");
                 goto failed;
         }
-
+        
         /* lists/ptrs/locks initialised */
         kibnal_data.kib_init = IBNAL_INIT_DATA;
         /*****************************************************/
 
         process_id.pid = requested_pid;
-        process_id.nid = kibnal_data.kib_nid;
+        process_id.nid = PTL_NID_ANY;
         
         rc = lib_init(&kibnal_lib, nal, process_id,
                       requested_limits, actual_limits);
@@ -1399,7 +1567,7 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         /*****************************************************/
 
         for (i = 0; i < IBNAL_N_SCHED; i++) {
-                rc = kibnal_thread_start (kibnal_scheduler, (void *)i);
+                rc = kibnal_thread_start (kibnal_scheduler, (void *)((long)i));
                 if (rc != 0) {
                         CERROR("Can't spawn vibnal scheduler[%d]: %d\n",
                                i, rc);
@@ -1414,9 +1582,9 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         }
 
         /* TODO: apparently only one adapter is supported */
-        retval = vv_hca_open("ANY_HCA", NULL, &kibnal_data.kib_hca);
-        if (retval) {
-                CERROR ("Can't open CA: %d\n", retval);
+        vvrc = vv_hca_open("ANY_HCA", NULL, &kibnal_data.kib_hca);
+        if (vvrc != vv_return_ok) {
+                CERROR ("Can't open CA: %d\n", vvrc);
                 goto failed;
         }
 
@@ -1425,12 +1593,10 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
 
         /* register to get HCA's asynchronous events. */
         req_er.req_event_type = VV_ASYNC_EVENT_ALL_MASK;
-        retval = vv_set_async_event_cb (kibnal_data.kib_hca,
-                                        req_er,
-                                        kibnal_ca_async_callback);
-
-        if (retval) {
-                CERROR ("Can't open CA: %d\n", retval);
+        vvrc = vv_set_async_event_cb (kibnal_data.kib_hca, req_er,
+                                     kibnal_async_callback);
+        if (vvrc != vv_return_ok) {
+                CERROR ("Can't set async event callback: %d\n", vvrc);
                 goto failed; 
         }
 
@@ -1438,10 +1604,9 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
 
         /*****************************************************/
 
-        retval = vv_hca_query(kibnal_data.kib_hca,
-                             &kibnal_data.kib_hca_attrs);
-        if (retval) {
-                CERROR ("Can't size port attrs: %d\n", retval);
+        vvrc = vv_hca_query(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs);
+        if (vvrc != vv_return_ok) {
+                CERROR ("Can't size port attrs: %d\n", vvrc);
                 goto failed;
         }
 
@@ -1453,9 +1618,10 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                 u_int32_t tbl_count;
                 vv_port_attrib_t *pattr = &kibnal_data.kib_port_attr;
 
-                retval = vv_port_query(kibnal_data.kib_hca, port_num, pattr);
-                if (retval) {
-                        CERROR("vv_port_query failed for port %d: %d\n", port_num, retval);
+                vvrc = vv_port_query(kibnal_data.kib_hca, port_num, pattr);
+                if (vvrc != vv_return_ok) {
+                        CERROR("vv_port_query failed for port %d: %d\n",
+                               port_num, vvrc);
                         continue;
                 }
 
@@ -1476,16 +1642,22 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                         kibnal_data.kib_port = port_num;
                         
                         tbl_count = 1;
-                        retval = vv_get_port_gid_tbl(kibnal_data.kib_hca, port_num, &tbl_count, &kibnal_data.kib_port_gid);
-                        if (retval) {
-                                CERROR("vv_get_port_gid_tbl failed for port %d: %d\n", port_num, retval);
+                        vvrc = vv_get_port_gid_tbl(kibnal_data.kib_hca, 
+                                                   port_num, &tbl_count,
+                                                   &kibnal_data.kib_port_gid);
+                        if (vvrc != vv_return_ok) {
+                                CERROR("vv_get_port_gid_tbl failed "
+                                       "for port %d: %d\n", port_num, vvrc);
                                 continue;
                         }
 
                         tbl_count = 1;
-                        retval = vv_get_port_partition_tbl (kibnal_data.kib_hca, port_num, &tbl_count, &kibnal_data.kib_port_pkey);
-                        if (retval) {
-                                CERROR("vv_get_port_partition_tbl failed for port %d: %d\n", port_num, retval);
+                        vvrc = vv_get_port_partition_tbl(kibnal_data.kib_hca, 
+                                                        port_num, &tbl_count,
+                                                        &kibnal_data.kib_port_pkey);
+                        if (vvrc != vv_return_ok) {
+                                CERROR("vv_get_port_partition_tbl failed "
+                                       "for port %d: %d\n", port_num, vvrc);
                                 continue;
                         }
 
@@ -1505,45 +1677,19 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         }
 
         CDEBUG(D_NET, "Using port %d - GID="LPX64":"LPX64"\n",
-               kibnal_data.kib_port, kibnal_data.kib_port_gid.scope.g.subnet, kibnal_data.kib_port_gid.scope.g.eui64);
-        CDEBUG(D_NET, "got guid "LPX64"\n", cpu_to_le64(kibnal_data.kib_port_gid.scope.g.eui64));
+               kibnal_data.kib_port, 
+               kibnal_data.kib_port_gid.scope.g.subnet, 
+               kibnal_data.kib_port_gid.scope.g.eui64);
         
-        /* Active port found */
-        kibnal_data.kib_init = IBNAL_INIT_PORT;
         /*****************************************************/
 
-        /* Prepare things to be able to send/receive MADS */
-        retval = gsi_dtgrm_pool_create(IBNAL_CONCURRENT_PEERS, &kibnal_data.gsi_pool_handle);
-        if (retval) {
-                CERROR("Could not create GSI pool: %d\n", retval);
-                goto failed;
-        }
-        kibnal_data.kib_init = IBNAL_INIT_GSI_POOL;
-
-        retval = gsi_register_class(MAD_CLASS_SUBN_ADM, /* TODO: correct? */
-                                2,     /* version */
-                                "ANY_HCA",
-#ifdef GSI_PASS_PORT_NUM
-                                kibnal_data.kib_port,
-#endif                   
-                                0, 0,
-                                vibnal_mad_sent_cb,    vibnal_mad_received_cb,
-                                NULL, &kibnal_data.gsi_handle);
-        if (retval) {
-                CERROR("Cannot register GSI class: %d\n", retval);
-                goto failed;
-        }
-
-        kibnal_data.kib_init = IBNAL_INIT_GSI;
-        /*****************************************************/
-
-#if IBNAL_WHOLE_MEM==0
-        retval = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd);
+#if !IBNAL_WHOLE_MEM
+        vvrc = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd);
 #else
-        retval = vv_get_gen_pd_h(kibnal_data.kib_hca, &kibnal_data.kib_pd);
+        vvrc = vv_get_gen_pd_h(kibnal_data.kib_hca, &kibnal_data.kib_pd);
 #endif
-        if (retval) {
-                CERROR ("Can't create PD: %d\n", retval);
+        if (vvrc != vv_return_ok) {
+                CERROR ("Can't create PD: %d\n", vvrc);
                 goto failed;
         }
         
@@ -1551,35 +1697,6 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         kibnal_data.kib_init = IBNAL_INIT_PD;
         /*****************************************************/
 
-#if IBNAL_FMR
-        {
-                const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
-                struct ib_fmr_pool_param params = {
-                        .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
-                        .access            = (IB_ACCESS_LOCAL_WRITE |
-                                              IB_ACCESS_REMOTE_WRITE |
-                                              IB_ACCESS_REMOTE_READ),
-                        .pool_size         = pool_size,
-                        .dirty_watermark   = (pool_size * 3)/4,
-                        .flush_function    = NULL,
-                        .flush_arg         = NULL,
-                        .cache             = 1,
-                };
-                rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params,
-                                        &kibnal_data.kib_fmr_pool);
-                if (rc != 0) {
-                        CERROR ("Can't create FMR pool size %d: %d\n", 
-                                pool_size, rc);
-                        goto failed;
-                }
-        }
-
-        /* flag FMR pool initialised */
-        kibnal_data.kib_init = IBNAL_INIT_FMR;
-#endif
-
-        /*****************************************************/
-
         rc = kibnal_setup_tx_descs();
         if (rc != 0) {
                 CERROR ("Can't register tx descs: %d\n", rc);
@@ -1592,12 +1709,12 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         {
                 uint32_t nentries;
 
-                retval = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
-                                      kibnal_ca_callback, 
-                                      NULL, /* context */
-                                      &kibnal_data.kib_cq, &nentries);
-                if (retval) {
-                        CERROR ("Can't create RX CQ: %d\n", retval);
+                vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
+                                    kibnal_cq_callback, 
+                                    NULL, /* context */
+                                    &kibnal_data.kib_cq, &nentries);
+                if (vvrc != vv_return_ok) {
+                        CERROR ("Can't create RX CQ: %d\n", vvrc);
                         goto failed;
                 }
 
@@ -1610,8 +1727,10 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                         goto failed;
                 }
 
-                retval = vv_request_completion_notification(kibnal_data.kib_hca, kibnal_data.kib_cq, vv_next_solicit_unsolicit_event);
-                if (retval != 0) {
+                vvrc = vv_request_completion_notification(kibnal_data.kib_hca, 
+                                                          kibnal_data.kib_cq, 
+                                                          vv_next_solicit_unsolicit_event);
+                if (vvrc != vv_return_ok) {
-                        CERROR ("Failed to re-arm completion queue: %d\n", rc);
+                        CERROR ("Failed to re-arm completion queue: %d\n", vvrc);
                         goto failed;
                 }
@@ -1657,16 +1776,17 @@ kibnal_module_init (void)
 {
         int    rc;
 
-        if (sizeof(kib_wire_connreq_t) > cm_REQ_priv_data_len) {
-                CERROR("sizeof(kib_wire_connreq_t) > cm_REQ_priv_data_len\n");
-                return -EINVAL;
-        }
-
+        CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) 
+                  <= cm_REQ_priv_data_len);
+        CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) 
+                  <= cm_REP_priv_data_len);
+        CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
+                  <= IBNAL_MSG_SIZE);
+        CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
+                  <= IBNAL_MSG_SIZE);
+        
         /* the following must be sizeof(int) for proc_dointvec() */
-        if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) {
-                CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n");
-                return -EINVAL;
-        }
+        CLASSERT (sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int));
 
         kibnal_api.nal_ni_init = kibnal_api_startup;
         kibnal_api.nal_ni_fini = kibnal_api_shutdown;
index cf90aed..785494a 100644 (file)
 #include <linux/list.h>
 #include <linux/kmod.h>
 #include <linux/sysctl.h>
+#include <linux/random.h>
 
-#define DEBUG_SUBSYSTEM S_IBNAL
+#include <net/sock.h>
+#include <linux/in.h>
 
-#define IBNAL_CHECK_ADVERT
+#define DEBUG_SUBSYSTEM S_NAL
 
 #include <libcfs/kp30.h>
 #include <portals/p30.h>
 #include <portals/lib-p30.h>
 #include <portals/nal.h>
 
-#include <vverbs.h>
-#include <sa-mads.h>
-#include <ib-cm.h>
-#include <gsi.h>
-
-#if 0
-#undef CDEBUG
-#define CDEBUG(mask, format, a...) printk(KERN_INFO "%s:%d - " format, __func__, __LINE__,##a)
+/* CPU_{L,B}E #defines needed by Voltaire headers */
+#include <asm/byteorder.h>
+#ifdef __BIG_ENDIAN__
+#define CPU_BE 1
+#define CPU_LE 0
 #endif
-
-#ifdef __CHECKER__
-#undef CDEBUG
-#undef CERROR
-#define CDEBUG(a...)
-#define CERROR(a...)
+#ifdef __LITTLE_ENDIAN__
+#define CPU_BE 0
+#define CPU_LE 1
 #endif
 
-#define GCC_VERSION (__GNUC__ * 10000 \
-                + __GNUC_MINOR__ * 100 \
-                + __GNUC_PATCHLEVEL__)
+#include <vverbs.h>
+#include <ib-cm.h>
+#include <ibat.h>
 
-/* Test for GCC > 3.2.2 */
-#if GCC_VERSION <= 30202
-/* GCC 3.2.2, and presumably several versions before it, will
- * miscompile this driver. See
- * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9853. */
+/* GCC 3.2.2 miscompiles this driver.
+ * See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9853. */
+#define GCC_VERSION ((__GNUC__*100 + __GNUC_MINOR__)*100 + __GNUC_PATCHLEVEL__)
+#if GCC_VERSION < 30203
 #error Invalid GCC version. Must use GCC >= 3.2.3
 #endif
 
-#define IBNAL_SERVICE_NAME   "vibnal"
-#define IBNAL_SERVICE_NUMBER 0x11b9a2 /* TODO */
-
 #if CONFIG_SMP
 # define IBNAL_N_SCHED      num_online_cpus()   /* # schedulers */
 #else
 # define IBNAL_N_SCHED      1                   /* # schedulers */
 #endif
 
+/* sdp-connection.c */
+#define IBNAL_QKEY               0
+#define IBNAL_PKEY               0xffff
+#define IBNAL_PKEY_IDX           0
+#define IBNAL_SGID_IDX           0
+#define IBNAL_SERVICE_LEVEL      0
+#define IBNAL_STATIC_RATE        0
+#define IBNAL_RETRY_CNT          7
+#define IBNAL_RNR_CNT            7 
+#define IBNAL_EE_FLOW_CNT        1
+#define IBNAL_LOCAL_SUB          1
+#define IBNAL_TRAFFIC_CLASS      0
+#define IBNAL_SOURCE_PATH_BIT    0
+#define IBNAL_OUS_DST_RD         32
+#define IBNAL_IB_MTU             vv_mtu_1024
+
+/* sdp-hca-params.h */
+#define PATH_RATE_2_5GB           2
+#define MLX_IPD_1x                1
+#define MLX_IPD_4x                0
+#define IBNAL_R_2_STATIC_RATE(r)  ((r) == PATH_RATE_2_5GB ? MLX_IPD_1x : MLX_IPD_4x)
+
+/* other low-level IB constants */
+#define IBNAL_LOCAL_ACK_TIMEOUT   0x12
+#define IBNAL_PKT_LIFETIME        5
+#define IBNAL_ARB_INITIATOR_DEPTH 0
+#define IBNAL_ARB_RESP_RES        0
+#define IBNAL_FAILOVER_ACCEPTED   0
+#define IBNAL_SERVICE_NUMBER      0x11b9a2      /* Fixed service number */
+
 #define IBNAL_MIN_RECONNECT_INTERVAL HZ         /* first failed connection retry... */
 #define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ)    /* ...exponentially increasing to this */
 
-#define IBNAL_MSG_SIZE       (4<<10)            /* max size of queued messages (inc hdr) */
-
-#define IBNAL_MSG_QUEUE_SIZE   8                /* # messages/RDMAs in-flight */
-#define IBNAL_CREDIT_HIGHWATER 7                /* when to eagerly return credits */
+#define IBNAL_MSG_SIZE           (4<<10)        /* max size of queued messages (inc hdr) */
 
-/* 7 indicates infinite retry attempts, Infinicon recommended 5 */
-#define IBNAL_RETRY            5                /* # times to retry */
-#define IBNAL_RNR_RETRY        5                /*  */
-#define IBNAL_CM_RETRY         5                /* # times to retry connection */
+#define IBNAL_MSG_QUEUE_SIZE      8             /* # messages/RDMAs in-flight */
+#define IBNAL_CREDIT_HIGHWATER    7             /* when to eagerly return credits */
 
-#define IBNAL_FLOW_CONTROL     1
-#define IBNAL_ACK_TIMEOUT       20              /* supposedly 4 secs */
+#define IBNAL_NTX                 64            /* # tx descs */
+#define IBNAL_NTX_NBLK            128           /* # reserved tx descs */
+/* IBNAL_NTX_NBLK was reduced from 256 to ensure we register < 255 pages per
+ * region.  This can change if we register all memory. */
 
-#define IBNAL_NTX             64                /* # tx descs */
-/* this had to be dropped down so that we only register < 255 pages per
- * region.  this will change if we register all memory. */
-#define IBNAL_NTX_NBLK        128               /* # reserved tx descs */
+#define IBNAL_PEER_HASH_SIZE      101           /* # peer lists */
 
-#define IBNAL_PEER_HASH_SIZE  101               /* # peer lists */
+#define IBNAL_RESCHED             100           /* # scheduler loops before reschedule */
 
-#define IBNAL_RESCHED         100               /* # scheduler loops before reschedule */
+#define IBNAL_CONCURRENT_PEERS    1000          /* # nodes all talking at once to me */
 
-#define IBNAL_CONCURRENT_PEERS 1000             /* # nodes all talking at once to me */
+#define IBNAL_RDMA_BASE  0x0eeb0000
+#define IBNAL_CKSUM      0
+#define IBNAL_WHOLE_MEM  1
+#if !IBNAL_WHOLE_MEM
+# error "incompatible with voltaire adaptor-tavor (REGISTER_RAM_IN_ONE_PHY_MR)"
+#endif
 
 /* default vals for runtime tunables */
-#define IBNAL_IO_TIMEOUT      50                /* default comms timeout (seconds) */
+#define IBNAL_IO_TIMEOUT          50            /* default comms timeout (seconds) */
 
 /************************/
 /* derived constants... */
 #define IBNAL_TX_MSG_BYTES  (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
 #define IBNAL_TX_MSG_PAGES  ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
 
-#define IBNAL_TX_MAX_SG (PTL_MD_MAX_IOV + 1)
+#if IBNAL_WHOLE_MEM
+# define IBNAL_MAX_RDMA_FRAGS PTL_MD_MAX_IOV
+#else
+# define IBNAL_MAX_RDMA_FRAGS 1
+#endif
 
 /* RX messages (per connection) */
 #define IBNAL_RX_MSGS       IBNAL_MSG_QUEUE_SIZE
 #define IBNAL_RX_MSG_BYTES  (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
 #define IBNAL_RX_MSG_PAGES  ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
 
-
-/* we may have up to 2 completions per transmit +
-   1 completion per receive, per connection */
-#define IBNAL_CQ_ENTRIES  ((2*IBNAL_TX_MSGS) +                          \
-                           (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS))
-
-#define IBNAL_RDMA_BASE  0x0eeb0000
-#define IBNAL_FMR        0
-#define IBNAL_WHOLE_MEM  1
-#define IBNAL_CKSUM      0
-
-/* Starting sequence number. */
-#define IBNAL_STARTING_PSN 0x465A
-
-/* Timeout for SA requests, in seconds */
-#define GSI_TIMEOUT 5
-#define GSI_RETRY 10
+#define IBNAL_CQ_ENTRIES  (IBNAL_TX_MSGS * (1 + IBNAL_MAX_RDMA_FRAGS) + \
+                           IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS)
 
 typedef struct
 {
@@ -165,8 +175,6 @@ typedef struct
         struct ctl_table_header *kib_sysctl;    /* sysctl interface */
 } kib_tunables_t;
 
-/* some of these have specific types in the stack that just map back
- * to the uFOO types, like IB_{L,R}_KEY. */
 typedef struct
 {
         int               ibp_npages;           /* # pages */
@@ -184,51 +192,38 @@ typedef struct
         __u32             md_lkey;
         __u32             md_rkey;
         __u64             md_addr;
-} kib_md_t __attribute__((packed));
+} kib_md_t;
 
 typedef struct
 {
-        /* initialisation state. These values are sorted by their initialization order. */
-        enum {
-                IBNAL_INIT_NOTHING,
-                IBNAL_INIT_DATA,
-                IBNAL_INIT_LIB,
-                IBNAL_INIT_HCA,
-                IBNAL_INIT_ASYNC,
-                IBNAL_INIT_PORT,
-                IBNAL_INIT_GSI_POOL,
-                IBNAL_INIT_GSI,
-                IBNAL_INIT_PD,
-#if IBNAL_FMR
-                IBNAL_INIT_FMR,
-#endif
-                IBNAL_INIT_TXD,
-                IBNAL_INIT_CQ,
-                IBNAL_INIT_ALL,
-        } kib_init;
-
+        int               kib_init;             /* initialisation state */
         __u64             kib_incarnation;      /* which one am I */
         int               kib_shutdown;         /* shut down? */
         atomic_t          kib_nthreads;         /* # live threads */
 
-        __u64             kib_service_id;       /* service number I listen on */
-        vv_gid_t          kib_port_gid;         /* port GID in HOST ORDER! */
-        vv_p_key_t        kib_port_pkey;        /* my pkey */
-        ptl_nid_t         kib_nid;              /* my NID */
+        __u64             kib_svc_id;           /* service number I listen on */
+        vv_gid_t          kib_port_gid;         /* device/port GID */
+        vv_p_key_t        kib_port_pkey;        /* device/port pkey */
+        
         struct semaphore  kib_nid_mutex;        /* serialise NID ops */
-        cm_cep_handle_t   kib_cep;              /* connection end point */
+        cm_cep_handle_t   kib_listen_handle;    /* IB listen handle */
 
         rwlock_t          kib_global_lock;      /* stabilize peer/conn ops */
-
+        spinlock_t        kib_vverbs_lock;      /* serialize vverbs calls */
+        int               kib_ready;            /* CQ callback fired */
+        int               kib_checking_cq;      /* a scheduler is checking the CQ */
+        
         struct list_head *kib_peers;            /* hash table of all my known peers */
         int               kib_peer_hash_size;   /* size of kib_peers */
         atomic_t          kib_npeers;           /* # peers extant */
         atomic_t          kib_nconns;           /* # connections extant */
 
-        struct list_head  kib_connd_conns;      /* connections to progress */
-        struct list_head  kib_connd_peers;      /* peers waiting for a connection */
-        wait_queue_head_t kib_connd_waitq;      /* connection daemons sleep here */
-        unsigned long     kib_connd_waketime;   /* when connd will wake */
+        void             *kib_connd;            /* the connd task (serialisation assertions) */
+        struct list_head  kib_connd_peers;      /* peers wanting to get connected */
+        struct list_head  kib_connd_pcreqs;     /* passive connection requests */
+        struct list_head  kib_connd_conns;      /* connections to setup/teardown */
+        struct list_head  kib_connd_zombies;    /* connections with zero refcount */
+        wait_queue_head_t kib_connd_waitq;      /* connection daemon sleeps here */
         spinlock_t        kib_connd_lock;       /* serialise */
 
         wait_queue_head_t kib_sched_waitq;      /* schedulers sleep here */
@@ -246,44 +241,36 @@ typedef struct
         spinlock_t        kib_tx_lock;          /* serialise */
 
         vv_hca_h_t        kib_hca;              /* The HCA */
-        vv_hca_attrib_t   kib_hca_attrs;      /* HCA attributes */
-
+        vv_hca_attrib_t   kib_hca_attrs;        /* its properties */
         int               kib_port;             /* port on the device */
-        vv_port_attrib_t  kib_port_attr;      /* port attributes */
+        vv_port_attrib_t  kib_port_attr;        /* its properties */
 
         vv_pd_h_t         kib_pd;               /* protection domain */
         vv_cq_h_t         kib_cq;               /* completion queue */
 
-        void             *kib_listen_handle;    /* where I listen for connections */
-
-        /* These fields are left untouched, so they can be shared. */
-        union {
-                cm_drequest_data_t dreq_data;
-                cm_dreply_data_t   drep_data;
-        } cm_data;
-
-        /* Send and receive MADs (service records, path records) */
-        gsi_class_handle_t      gsi_handle;
-        gsi_dtgrm_pool_handle_t gsi_pool_handle;
-        struct semaphore gsi_mutex; /* protect GSI list - TODO:spinlock instead? */
-        struct list_head gsi_pending; /* pending GSI datagrams */
-
 } kib_data_t;
 
+#define IBNAL_INIT_NOTHING         0
+#define IBNAL_INIT_DATA            1
+#define IBNAL_INIT_LIB             2
+#define IBNAL_INIT_HCA             3
+#define IBNAL_INIT_ASYNC           4
+#define IBNAL_INIT_PD              5
+#define IBNAL_INIT_TXD             6
+#define IBNAL_INIT_CQ              7
+#define IBNAL_INIT_ALL             8
+
 /************************************************************************
- * Wire message structs.
+ * IB Wire message format.
  * These are sent in sender's byte order (i.e. receiver flips).
- * CAVEAT EMPTOR: other structs communicated between nodes (e.g. MAD
- * private data and SM service info), is LE on the wire.
  */
 
-/* also kib_md_t above */
-
-typedef struct
+typedef struct kib_connparams
 {
-        __u32                 rd_nob;           /* # of bytes */
-        __u64                 rd_addr;          /* remote io vaddr */
-} kib_rdma_desc_t __attribute__((packed));
+        __u32             ibcp_queue_depth;
+        __u32             ibcp_max_msg_size;
+        __u32             ibcp_max_frags;
+} kib_connparams_t __attribute__((packed));
 
 typedef struct
 {
@@ -291,54 +278,91 @@ typedef struct
         char              ibim_payload[0];      /* piggy-backed payload */
 } kib_immediate_msg_t __attribute__((packed));
 
-/* these arrays serve two purposes during rdma.  they are built on the passive
- * side and sent to the active side as remote arguments.  On the active side
- * the descs are used as a data structure on the way to local gather items.
- * the different roles result in split local/remote meaning of desc->rd_key */
+/* One RDMA fragment.  YEUCH! the __u64 vaddr is split into 2 __u32 halves (see
+ * kibnal_rf_set() and kibnal_rf_addr()) to ensure proper packing.  Otherwise
+ * not enough frags fit in an IBNAL message (<= smallest page size on any arch). */
+typedef struct
+{
+        __u32             rf_nob;               /* # of bytes in this fragment */
+        __u32             rf_addr_lo;           /* lo 4 bytes of vaddr */
+        __u32             rf_addr_hi;           /* hi 4 bytes of vaddr */
+} kib_rdma_frag_t __attribute__((packed));
+
 typedef struct
 {
-        ptl_hdr_t         ibrm_hdr;             /* portals header */
-        __u64             ibrm_cookie;          /* opaque completion cookie */
-        __u32             ibrm_num_descs;       /* how many descs */
-        __u32             rd_key;               /* remote key */
-        kib_rdma_desc_t   ibrm_desc[0];         /* where to suck/blow */
-} kib_rdma_msg_t __attribute__((packed));
+        __u32             rd_key;               /* local/remote key */
+        __u32             rd_nfrag;             /* # fragments */
+        kib_rdma_frag_t   rd_frags[0];          /* buffer frags */
+} kib_rdma_desc_t __attribute__((packed));
+
+/* CAVEAT EMPTOR!  We don't actually put ibprm_rd on the wire; it's just there
+ * to remember the source buffers while we wait for the PUT_ACK */
 
-#define kib_rdma_msg_len(num_descs) \
-        offsetof(kib_msg_t, ibm_u.rdma.ibrm_desc[num_descs])
+typedef struct
+{
+        ptl_hdr_t         ibprm_hdr;            /* portals header */
+        __u64             ibprm_cookie;         /* opaque completion cookie */
+        kib_rdma_frag_t   ibprm_rd;             /* source buffer */
+} kib_putreq_msg_t __attribute__((packed));
+
+typedef struct
+{
+        __u64             ibpam_src_cookie;     /* reflected completion cookie */
+        __u64             ibpam_dst_cookie;     /* opaque completion cookie */
+        kib_rdma_desc_t   ibpam_rd;             /* sender's sink buffer */
+} kib_putack_msg_t __attribute__((packed));
+
+typedef struct
+{
+        ptl_hdr_t         ibgm_hdr;             /* portals header */
+        __u64             ibgm_cookie;          /* opaque completion cookie */
+        kib_rdma_desc_t   ibgm_rd;              /* rdma descriptor */
+} kib_get_msg_t __attribute__((packed));
 
 typedef struct
 {
         __u64             ibcm_cookie;          /* opaque completion cookie */
-        __u32             ibcm_status;          /* completion status */
+        __s32             ibcm_status;          /* < 0 failure: >= 0 length */
 } kib_completion_msg_t __attribute__((packed));
 
 typedef struct
 {
-        __u32              ibm_magic;           /* I'm an openibnal message */
-        __u16              ibm_version;         /* this is my version number */
-        __u8               ibm_type;            /* msg type */
-        __u8               ibm_credits;         /* returned credits */
-#if IBNAL_CKSUM
-        __u32              ibm_nob;
-        __u32              ibm_cksum;
-#endif
+        /* First 2 fields fixed FOR ALL TIME */
+        __u32             ibm_magic;            /* I'm an openibnal message */
+        __u16             ibm_version;          /* this is my version number */
+
+        __u8              ibm_type;             /* msg type */
+        __u8              ibm_credits;          /* returned credits */
+        __u32             ibm_nob;              /* # bytes in whole message */
+        __u32             ibm_cksum;            /* checksum (0 == no checksum) */
+        __u64             ibm_srcnid;           /* sender's NID */
+        __u64             ibm_srcstamp;         /* sender's incarnation */
+        __u64             ibm_dstnid;           /* destination's NID */
+        __u64             ibm_dststamp;         /* destination's incarnation */
+
         union {
+                kib_connparams_t      connparams;
                 kib_immediate_msg_t   immediate;
-                kib_rdma_msg_t        rdma;
+                kib_putreq_msg_t      putreq;
+                kib_putack_msg_t      putack;
+                kib_get_msg_t         get;
                 kib_completion_msg_t  completion;
         } ibm_u __attribute__((packed));
 } kib_msg_t __attribute__((packed));
 
 #define IBNAL_MSG_MAGIC       0x0be91b91        /* unique magic */
-#define IBNAL_MSG_VERSION              1        /* current protocol version */
+#define IBNAL_MSG_VERSION              4        /* current protocol version */
 
+#define IBNAL_MSG_CONNREQ           0xc0        /* connection request */
+#define IBNAL_MSG_CONNACK           0xc1        /* connection acknowledge */
 #define IBNAL_MSG_NOOP              0xd0        /* nothing (just credits) */
-#define IBNAL_MSG_IMMEDIATE         0xd1        /* portals hdr + payload */
-#define IBNAL_MSG_PUT_RDMA          0xd2        /* portals PUT hdr + source rdma desc */
-#define IBNAL_MSG_PUT_DONE          0xd3        /* signal PUT rdma completion */
-#define IBNAL_MSG_GET_RDMA          0xd4        /* portals GET hdr + sink rdma desc */
-#define IBNAL_MSG_GET_DONE          0xd5        /* signal GET rdma completion */
+#define IBNAL_MSG_IMMEDIATE         0xd1        /* immediate */
+#define IBNAL_MSG_PUT_REQ           0xd2        /* putreq (src->sink) */
+#define IBNAL_MSG_PUT_NAK           0xd3        /* completion (sink->src) */
+#define IBNAL_MSG_PUT_ACK           0xd4        /* putack (sink->src) */
+#define IBNAL_MSG_PUT_DONE          0xd5        /* completion (src->sink) */
+#define IBNAL_MSG_GET_REQ           0xd6        /* getreq (sink->src) */
+#define IBNAL_MSG_GET_DONE          0xd7        /* completion (src->sink: all OK) */
 
 /***********************************************************************/
 
@@ -346,14 +370,26 @@ typedef struct kib_rx                           /* receive message */
 {
         struct list_head          rx_list;      /* queue for attention */
         struct kib_conn          *rx_conn;      /* owning conn */
-        int                       rx_rdma;      /* RDMA completion posted? */
+        int                       rx_responded; /* responded to peer? */
         int                       rx_posted;    /* posted? */
-        kib_msg_t                *rx_msg;     /* pre-mapped buffer */
-        vv_l_key_t                l_key;
-        vv_wr_t                   rx_wrq;
+#if IBNAL_WHOLE_MEM
+        vv_l_key_t                rx_lkey;      /* local key */
+#else        
+        __u64                     rx_vaddr;     /* pre-mapped buffer (hca vaddr) */
+#endif
+        kib_msg_t                *rx_msg;       /* pre-mapped buffer (host vaddr) */
+        vv_wr_t                   rx_wrq;       /* receive work item */
         vv_scatgat_t              rx_gl;        /* and its memory */
 } kib_rx_t;
 
+#if IBNAL_WHOLE_MEM
+# define KIBNAL_RX_VADDR(rx) ((__u64)((unsigned long)((rx)->rx_msg)))
+# define KIBNAL_RX_LKEY(rx)  ((rx)->rx_lkey)
+#else
+# define KIBNAL_RX_VADDR(rx) ((rx)->rx_vaddr)
+# define KIBNAL_RX_LKEY(rx)  ((rx)->rx_conn->ibc_rx_pages->ibp_lkey)
+#endif
+
 typedef struct kib_tx                           /* transmit message */
 {
         struct list_head          tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
@@ -361,55 +397,59 @@ typedef struct kib_tx                           /* transmit message */
         struct kib_conn          *tx_conn;      /* owning conn */
         int                       tx_mapped;    /* mapped for RDMA? */
         int                       tx_sending;   /* # tx callbacks outstanding */
+        int                       tx_waiting;   /* waiting for peer */
         int                       tx_status;    /* completion status */
         unsigned long             tx_deadline;  /* completion deadline */
-        int                       tx_passive_rdma; /* peer sucks/blows */
-        int                       tx_passive_rdma_wait; /* waiting for peer to complete */
-        __u64                     tx_passive_rdma_cookie; /* completion cookie */
+        __u64                     tx_cookie;    /* completion cookie */
         lib_msg_t                *tx_libmsg[2]; /* lib msgs to finalize on completion */
+#if IBNAL_WHOLE_MEM
+        vv_l_key_t                tx_lkey;      /* local key for message buffer */
+#else
         kib_md_t                  tx_md;        /* RDMA mapping (active/passive) */
-        kib_msg_t                *tx_msg;       /* pre-mapped buffer */
-        vv_l_key_t                l_key;
-        vv_r_key_t                r_key;
-        int                       tx_nsp;       /* # send work items */
-        vv_wr_t                  tx_wrq[IBNAL_TX_MAX_SG];    /* send work items... */
-        vv_scatgat_t              tx_gl[IBNAL_TX_MAX_SG];     /* ...and their memory */
+        __u64                     tx_vaddr;     /* pre-mapped buffer (hca vaddr) */
+#endif
+        kib_msg_t                *tx_msg;       /* message buffer (host vaddr) */
+        int                       tx_nwrq;      /* # send work items */
+        vv_wr_t                  *tx_wrq;       /* send work items... */
+        vv_scatgat_t             *tx_gl;        /* ...and their memory */
+        kib_rdma_desc_t          *tx_rd;        /* rdma descriptor (src buffers) */
 } kib_tx_t;
 
+#if IBNAL_WHOLE_MEM
+# define KIBNAL_TX_VADDR(tx) ((__u64)((unsigned long)((tx)->tx_msg)))
+# define KIBNAL_TX_LKEY(tx)  ((tx)->tx_lkey)
+#else
+# define KIBNAL_TX_VADDR(tx) ((tx)->tx_vaddr)
+# define KIBNAL_TX_LKEY(tx)  (kibnal_data.kib_tx_pages->ibp_lkey)
+#endif
+
 #define KIB_TX_UNMAPPED       0
 #define KIB_TX_MAPPED         1
-#define KIB_TX_MAPPED_FMR     2
-
-typedef struct kib_wire_connreq
-{
-        __u32        wcr_magic;                 /* I'm an openibnal connreq */
-        __u16        wcr_version;               /* this is my version number */
-        __u16        wcr_queue_depth;           /* this is my receive queue size */
-        __u64        wcr_nid;                   /* peer's NID */
-        __u64        wcr_incarnation;           /* peer's incarnation */
-} kib_wire_connreq_t;
-
-typedef struct kib_gid
-{
-        __u64   hi, lo;
-} kib_gid_t;
 
-typedef struct kib_connreq
-{
-        /* connection-in-progress */
-        struct kib_conn                    *cr_conn;
-        kib_wire_connreq_t                  cr_wcr;
-        __u64                               cr_tid;
-        //ib_service_record_v2_t              cr_service;
-        kib_gid_t                           cr_gid;
-        ib_path_record_v2_t                 cr_path;
-
-        union {
-                cm_request_data_t                   cr_cm_req;
-                cm_rtu_data_t                       cr_cm_rtu;
-        } ;
-
-} kib_connreq_t;
+/* Passive connection request (listener callback) queued for handling by connd */
+typedef struct kib_pcreq
+{
+        struct list_head  pcr_list;             /* queue for handling by connd */
+        cm_cep_handle_t   pcr_cep;              /* listening handle */
+        cm_request_data_t pcr_cmreq;            /* request data */
+} kib_pcreq_t;
+
+typedef struct kib_connvars                     /* pointed to by kib_conn_t.ibc_connvars */
+{
+        /* connection-in-progress variables */
+        __u32               cv_port;
+        __u32               cv_pkey_index;
+        __u32               cv_rnr_count;
+        __u32               cv_sgid_index;
+        __u32               cv_remote_qpn;
+        __u32               cv_local_qpn;
+        __u32               cv_rxpsn;
+        __u32               cv_txpsn;
+        ib_path_record_v2_t cv_path;
+        ibat_arp_data_t     cv_arp;
+        ibat_stat_t         cv_arprc;           /* NOTE(review): presumably the status of the query filling cv_arp -- confirm */
+        cm_conn_data_t      cv_conndata;
+} kib_connvars_t;
 
 typedef struct kib_conn
 {
@@ -422,43 +462,39 @@ typedef struct kib_conn
         int                 ibc_nsends_posted;  /* # uncompleted sends */
         int                 ibc_credits;        /* # credits I have */
         int                 ibc_outstanding_credits; /* # credits to return */
-        int                 ibc_rcvd_disconnect;/* received discon request */
-        int                 ibc_sent_disconnect;/* sent discon request */
+        int                 ibc_disconnect;     /* some disconnect callback fired */
+        int                 ibc_comms_error;    /* set on comms error */
+        struct list_head    ibc_early_rxs;      /* rxs completed before ESTABLISHED */
         struct list_head    ibc_tx_queue;       /* send queue */
         struct list_head    ibc_active_txs;     /* active tx awaiting completion */
         spinlock_t          ibc_lock;           /* serialise */
         kib_rx_t           *ibc_rxs;            /* the rx descs */
         kib_pages_t        *ibc_rx_pages;       /* premapped rx msg pages */
         vv_qp_h_t           ibc_qp;             /* queue pair */
-        cm_cep_handle_t     ibc_cep;            /* connection ID? */
-        vv_qp_attr_t        ibc_qp_attrs;    /* QP attrs */
-        kib_connreq_t      *ibc_connreq;        /* connection request state */
+        cm_cep_handle_t     ibc_cep;            /* connection endpoint */
+        kib_connvars_t     *ibc_connvars;       /* in-progress connection state */
 } kib_conn_t;
 
-#define IBNAL_CONN_INIT_NOTHING      0          /* initial state */
-#define IBNAL_CONN_INIT_QP           1          /* ibc_qp set up */
-#define IBNAL_CONN_CONNECTING        2          /* started to connect */
-#define IBNAL_CONN_ESTABLISHED       3          /* connection established */
-#define IBNAL_CONN_SEND_DREQ         4          /* to send disconnect req */
-#define IBNAL_CONN_DREQ              5          /* sent disconnect req */
-#define IBNAL_CONN_DREP              6          /* sent disconnect rep */
-#define IBNAL_CONN_DISCONNECTED      7          /* no more QP or CM traffic */
-
-#define KIB_ASSERT_CONN_STATE(conn, state) do {                         \
-        LASSERTF((conn)->ibc_state == state, "%d\n", conn->ibc_state);  \
-} while (0)
-
-#define KIB_ASSERT_CONN_STATE_RANGE(conn, low, high) do {               \
-        LASSERTF(low <= high, "%d %d\n", low, high);                    \
-        LASSERTF((conn)->ibc_state >= low && (conn)->ibc_state <= high, \
-                 "%d\n", conn->ibc_state);                              \
-} while (0)
+#define IBNAL_CONN_INIT_NOTHING       0         /* incomplete init */
+#define IBNAL_CONN_INIT               1         /* completed init */
+#define IBNAL_CONN_ACTIVE_ARP         2         /* active arping */
+#define IBNAL_CONN_ACTIVE_CONNECT     3         /* active sending req */
+#define IBNAL_CONN_ACTIVE_CHECK_REPLY 4         /* active checking reply */
+#define IBNAL_CONN_ACTIVE_RTU         5         /* active sending rtu */
+#define IBNAL_CONN_PASSIVE_WAIT       6         /* passive waiting for rtu */
+#define IBNAL_CONN_ESTABLISHED        7         /* connection established */
+#define IBNAL_CONN_DISCONNECT1        8         /* disconnect phase 1 */
+#define IBNAL_CONN_DISCONNECT2        9         /* disconnect phase 2 */
+#define IBNAL_CONN_DISCONNECTED       10        /* disconnect complete */
 
 typedef struct kib_peer
 {
         struct list_head    ibp_list;           /* stash on global peer list */
         struct list_head    ibp_connd_list;     /* schedule on kib_connd_peers */
         ptl_nid_t           ibp_nid;            /* who's on the other end(s) */
+        __u32               ibp_ip;             /* IP to query for peer conn params */
+        int                 ibp_port;           /* port to query for peer conn params */
+        __u64               ibp_incarnation;    /* peer's incarnation */
         atomic_t            ibp_refcount;       /* # users */
         int                 ibp_persistence;    /* "known" peer refs */
         struct list_head    ibp_conns;          /* all active connections */
@@ -468,75 +504,95 @@ typedef struct kib_peer
         unsigned long       ibp_reconnect_interval; /* exponential backoff */
 } kib_peer_t;
 
-struct sa_request;
-typedef void (*sa_request_cb_t)(struct sa_request *request);
-
-struct sa_request {
-        /* Link all the pending GSI datagrams together. */
-        struct list_head list;
 
-        int retry;              /* number of retries left (after a timeout only) */
-        int status;             /* status of the request */
-        gsi_dtgrm_t *dtgrm_req; /* request */
-        gsi_dtgrm_t *dtgrm_resp; /* response */
-        sa_mad_v2_t *mad;       /* points inside the datagram */
+extern lib_nal_t       kibnal_lib;
+extern kib_data_t      kibnal_data;
+extern kib_tunables_t  kibnal_tunables;
 
-        void *context;
+extern void kibnal_init_msg(kib_msg_t *msg, int type, int body_nob);
+extern void kibnal_pack_msg(kib_msg_t *msg, int credits, 
+                            ptl_nid_t dstnid, __u64 dststamp);
+extern int kibnal_unpack_msg(kib_msg_t *msg, int nob);
+extern kib_peer_t *kibnal_create_peer(ptl_nid_t nid);
+extern void kibnal_destroy_peer(kib_peer_t *peer);
+extern int kibnal_del_peer(ptl_nid_t nid, int single_share);
+extern kib_peer_t *kibnal_find_peer_locked(ptl_nid_t nid);
+extern void kibnal_unlink_peer_locked(kib_peer_t *peer);
+extern int  kibnal_close_stale_conns_locked(kib_peer_t *peer,
+                                            __u64 incarnation);
+extern kib_conn_t *kibnal_create_conn(cm_cep_handle_t cep);
+extern void kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg);
 
-        struct timer_list timer;
+extern int kibnal_alloc_pages(kib_pages_t **pp, int npages, int access);
+extern void kibnal_free_pages(kib_pages_t *p);
 
-        /* When the requests is completed, we either call the callback
-         * or post a completion. They are mutually exclusive. */
-        struct completion signal;
-        sa_request_cb_t callback;
-};
+extern void kibnal_check_sends(kib_conn_t *conn);
+extern void kibnal_close_conn_locked(kib_conn_t *conn, int error);
+extern void kibnal_destroy_conn(kib_conn_t *conn);
+extern int  kibnal_thread_start(int (*fn)(void *arg), void *arg);
+extern int  kibnal_scheduler(void *arg);
+extern int  kibnal_connd(void *arg);
+extern void kibnal_init_tx_msg(kib_tx_t *tx, int type, int body_nob);
+extern void kibnal_close_conn(kib_conn_t *conn, int why);
+extern int  kibnal_set_qp_state(kib_conn_t *conn, vv_qp_state_t new_state);
+extern void kibnal_async_callback(vv_event_record_t ev);
+extern void kibnal_cq_callback(unsigned long context);
+extern void kibnal_passive_connreq(kib_pcreq_t *pcr, int reject);
+extern void kibnal_pause(int ticks);
+extern void kibnal_queue_tx(kib_tx_t *tx, kib_conn_t *conn);
+extern int  kibnal_init_rdma(kib_tx_t *tx, int type, int nob,
+                             kib_rdma_desc_t *dstrd, __u64 dstcookie);
 
-/* The CM callback are called on the interrupt level. However we
- * cannot do everything we want on that level, so we let keventd run
- * the callback. */
-struct cm_off_level {
-        struct tq_struct tq;
+static inline int
+wrq_signals_completion (vv_wr_t *wrq)           /* does this work request ask for a completion? */
+{
+        return wrq->completion_notification != 0;       /* 1 if the field is non-zero, else 0 */
+}
 
-        cm_cep_handle_t cep;
-        cm_conn_data_t *info;
-        kib_conn_t *conn;
-};
+static inline void
+kibnal_conn_addref (kib_conn_t *conn)           /* take one more reference on conn */
+{
+        CDEBUG(D_NET, "++conn[%p] (%d)\n",      /* logs the count as it is before the increment */
+               conn, atomic_read(&conn->ibc_refcount));
+        LASSERT(atomic_read(&conn->ibc_refcount) > 0);  /* conn must already be referenced */
+        atomic_inc(&conn->ibc_refcount);
+}
 
-extern lib_nal_t       kibnal_lib;
-extern kib_data_t      kibnal_data;
-extern kib_tunables_t  kibnal_tunables;
+static inline void
+kibnal_conn_decref (kib_conn_t *conn)           /* drop one reference on conn */
+{
+        unsigned long   flags;
+
+        CDEBUG(D_NET, "--conn[%p] (%d)\n",      /* logs the count as it is before the decrement */
+               conn, atomic_read(&conn->ibc_refcount));
+        LASSERT(atomic_read(&conn->ibc_refcount) > 0);
+        if (atomic_dec_and_test(&conn->ibc_refcount)) { /* that was the last reference */
+                spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
+                list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_zombies); /* not freed here: queued as a zombie */
+                wake_up(&kibnal_data.kib_connd_waitq);  /* NOTE(review): presumably kibnal_connd() then destroys it -- confirm */
+                spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
+        }
+}
 
-static inline int wrq_signals_completion(vv_wr_t *wrq)
+static inline void
+kibnal_peer_addref (kib_peer_t *peer)           /* take one more reference on peer */
 {
-        return wrq->completion_notification != 0;
+        CDEBUG(D_NET, "++peer[%p] -> "LPX64" (%d)\n",   /* logs the count as it is before the increment */
+               peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount));
+        LASSERT(atomic_read(&peer->ibp_refcount) > 0);  /* peer must already be referenced */
+        atomic_inc(&peer->ibp_refcount);
 }
 
-/******************************************************************************/
-
-/* these are purposely avoiding using local vars so they don't increase
- * stack consumption. */
-
-#define kib_peer_addref(peer) do {                                      \
-        LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n",          \
-                 atomic_read(&peer->ibp_refcount));                     \
-        CDEBUG(D_NET, "++peer[%p] -> "LPX64" (%d)\n",                   \
-               peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \
-        atomic_inc(&peer->ibp_refcount);                                \
-} while (0)
-
-#define kib_peer_decref(peer) do {                                      \
-        LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n",          \
-                 atomic_read(&peer->ibp_refcount));                     \
-        CDEBUG(D_NET, "--peer[%p] -> "LPX64" (%d)\n",                   \
-               peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \
-        if (atomic_dec_and_test (&peer->ibp_refcount)) {                \
-                CDEBUG (D_NET, "destroying peer "LPX64" %p\n",          \
-                        peer->ibp_nid, peer);                           \
-                kibnal_destroy_peer (peer);                             \
-        }                                                               \
-} while (0)
-
-/******************************************************************************/
+static inline void
+kibnal_peer_decref (kib_peer_t *peer)           /* drop one reference on peer */
+{
+        CDEBUG(D_NET, "--peer[%p] -> "LPX64" (%d)\n",   /* logs the count as it is before the decrement */
+               peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount));
+
+        LASSERT(atomic_read(&peer->ibp_refcount) > 0);
+        if (atomic_dec_and_test (&peer->ibp_refcount))  /* that was the last reference */
+                kibnal_destroy_peer (peer);     /* called directly here, unlike kibnal_conn_decref() which queues */
+}
 
 static inline struct list_head *
 kibnal_nid2peerlist (ptl_nid_t nid)
@@ -547,7 +603,7 @@ kibnal_nid2peerlist (ptl_nid_t nid)
 }
 
 static inline int
-kibnal_peer_active(kib_peer_t *peer)
+kibnal_peer_active (kib_peer_t *peer)
 {
         /* Am I in the peer hash table? */
         return (!list_empty(&peer->ibp_list));
@@ -558,43 +614,23 @@ kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
 {
         /* CAVEAT EMPTOR: tx takes caller's ref on conn */
 
-        LASSERT (tx->tx_nsp > 0);               /* work items set up */
-        LASSERT (tx->tx_conn == NULL);          /* only set here */
-
-        tx->tx_conn = conn;
+        LASSERT (tx->tx_nwrq > 0);              /* work items set up */
+        if (tx->tx_conn == NULL) {              /* tx not yet bound to a conn */
+                kibnal_conn_addref(conn);       /* tx takes its own ref; NOTE(review): CAVEAT EMPTOR above looks stale -- confirm */
+                tx->tx_conn = conn;
+        } else {                                /* tx already bound: must be this conn... */
+                LASSERT (tx->tx_conn == conn);
+                LASSERT (tx->tx_msg->ibm_type == IBNAL_MSG_PUT_DONE); /* ...and carrying a PUT_DONE */
+        }
         tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
         list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
 }
 
-static inline __u64*
-kibnal_service_nid_field(ib_service_record_v2_t *sr)
-{
-        /* The service key mask must have byte 0 to 7 set. */
-        return (__u64 *)sr->service_data8;
-}
-
-static inline void
-kibnal_set_service_keys(ib_service_record_v2_t *sr, ptl_nid_t nid)
-{
-        LASSERT (strlen(IBNAL_SERVICE_NAME) < sizeof(sr->service_name));
-
-        strcpy (sr->service_name, IBNAL_SERVICE_NAME);
-
-        *kibnal_service_nid_field(sr) = cpu_to_le64(nid);
-}
-
-#if CONFIG_X86
-/* TODO: use vv_va2adverize instead */
 static inline __u64
 kibnal_page2phys (struct page *p)
 {
-        __u64 page_number = p - mem_map;
-
-        return (page_number << PAGE_SHIFT);
+        return page_to_phys(p);                 /* physical address of page p via the kernel helper */
 }
-#else
-# error "no page->phys"
-#endif
 
 /* CAVEAT EMPTOR: We rely on tx/rx descriptor alignment to allow us to
  * use the lowest bit of the work request id as a flag to determine if
@@ -622,199 +658,35 @@ kibnal_wreqid_is_rx (vv_wr_id_t wreqid)
         return (wreqid & 1) != 0;
 }
 
-static inline int
-kibnal_whole_mem(void)
-{
-#if IBNAL_WHOLE_MEM
-        return true;
-#else
-        return false;
-#endif
-}
-
-/* Voltaire stores GIDs in host order. */
-static inline void gid_swap(vv_gid_t *gid)
-{
-        u_int64_t s;
-
-        s = gid->scope.g.subnet;
-        gid->scope.g.subnet = cpu_to_be64(gid->scope.g.eui64);
-        gid->scope.g.eui64 = cpu_to_be64(s);
-}
-
-#if 0
-static void dump_qp(kib_conn_t *conn)
+static inline void
+kibnal_set_conn_state (kib_conn_t *conn, int state)
 {
-        vv_qp_attr_t *qp_attrs;
-        void *qp_context;
-        vv_return_t retval;
-
-        CERROR("QP dumping %p\n", conn);
-
-        retval = vv_qp_query(kibnal_data.kib_hca, conn->ibc_qp, &qp_context, &conn->ibc_qp_attrs);
-        if (retval) {
-                CERROR ("Couldn't query qp attributes: %d\n", retval);
-                return;
-        }
-
-        qp_attrs = &conn->ibc_qp_attrs;
-
-        CERROR("QP %x dump\n", qp_attrs->query.qp_num);
-        CERROR("  vv_qp_attr_mask = %llx\n", qp_attrs->query.vv_qp_attr_mask);
-        CERROR("  qp_state = %d\n", qp_attrs->query.qp_state);
-        CERROR("  cq_send_h = %p\n", qp_attrs->query.cq_send_h);
-        CERROR("  cq_receive_h = %p \n", qp_attrs->query.cq_receive_h);
-        CERROR("  send_max_outstand_wr = %d\n", qp_attrs->query.send_max_outstand_wr);
-        CERROR("  receive_max_outstand_wr = %d\n", qp_attrs->query.receive_max_outstand_wr);
-        CERROR("  max_scatgat_per_send_wr = %d\n", qp_attrs->query.max_scatgat_per_send_wr);
-        CERROR("  max_scatgat_per_receive_wr = %d\n", qp_attrs->query.max_scatgat_per_receive_wr);
-        CERROR("  send_psn = %x\n", qp_attrs->query.send_psn);
-        CERROR("  receve_psn = %x\n", qp_attrs->query.receve_psn);
-        CERROR("  access_control = %x\n", qp_attrs->query.access_control);
-        CERROR("  phy_port_num = %d\n", qp_attrs->query.phy_port_num);
-        CERROR("  primary_p_key_indx = %x\n", qp_attrs->query.primary_p_key_indx);
-        CERROR("  q_key = %x\n", qp_attrs->query.q_key);
-        CERROR("  destanation_qp = %x\n", qp_attrs->query.destanation_qp);
-        CERROR("  rdma_r_atom_outstand_num = %d\n", qp_attrs->query.rdma_r_atom_outstand_num);
-        CERROR("  responder_rdma_r_atom_num = %d\n", qp_attrs->query.responder_rdma_r_atom_num);
-        CERROR("  min_rnr_nak_timer = %d\n", qp_attrs->query.min_rnr_nak_timer);
-        CERROR("  pd_h = %lx\n", qp_attrs->query.pd_h);
-        CERROR("  recv_solicited_events = %d\n", qp_attrs->query.recv_solicited_events);
-        CERROR("  send_signaled_comp = %d\n", qp_attrs->query.send_signaled_comp);
-        CERROR("  flow_control = %d\n", qp_attrs->query.flow_control);
+        conn->ibc_state = state;
+        mb();
 }
-#else
-#define dump_qp(a)
-#endif
 
-#if 0
-static void dump_wqe(vv_wr_t *wr)
+static inline __u64
+kibnal_rf_addr (kib_rdma_frag_t *rf)
 {
-        CERROR("Dumping send WR %p\n", wr);
-
-        CERROR("  wr_id = %llx\n", wr->wr_id);
-        CERROR("  completion_notification = %d\n", wr->completion_notification);
-        CERROR("  scatgat_list = %p\n", wr->scatgat_list);
-        CERROR("  num_of_data_segments = %d\n", wr->num_of_data_segments);
-
-        if (wr->scatgat_list && wr->num_of_data_segments) {
-                CERROR("    scatgat_list[0].v_address = %p\n", wr->scatgat_list[0].v_address);
-                CERROR("    scatgat_list[0].length = %d\n", wr->scatgat_list[0].length);
-                CERROR("    scatgat_list[0].l_key = %x\n", wr->scatgat_list[0].l_key);
-        }
-
-        CERROR("  wr_type = %d\n", wr->wr_type);
-
-        switch(wr->wr_type) {
-        case vv_wr_send:
-                CERROR("  send\n");
-
-                CERROR("  fance_indicator = %d\n", wr->type.send.send_qp_type.rc_type.fance_indicator);
-                break;
-
-        case vv_wr_receive:
-                break;
-
-        case vv_wr_rdma_write:
-        case vv_wr_rdma_read:
-                CERROR("  rdma\n");
-                CERROR("  fance_indicator = %d\n", wr->type.send.send_qp_type.rc_type.fance_indicator);
-                CERROR("  r_addr = %llx\n", wr->type.send.send_qp_type.rc_type.r_addr);
-                CERROR("  r_r_key = %x\n", wr->type.send.send_qp_type.rc_type.r_r_key);
-                break;
-
-        default:
-                break;
-        }
+        return  (((__u64)rf->rf_addr_hi)<<32) | ((__u64)rf->rf_addr_lo);
 }
 
-#else
-#define dump_wqe(a)
-#endif
-
-#if 0
-static void dump_wc(vv_wc_t *wc)
+static inline void
+kibnal_rf_set (kib_rdma_frag_t *rf, __u64 addr, int nob) /* fill in one frag; inverse of kibnal_rf_addr() */
 {
-        CERROR("Dumping WC\n");
-
-        CERROR("  wr_id = %llx\n", wc->wr_id);
-        CERROR("  operation_type = %d\n", wc->operation_type);
-        CERROR("  num_bytes_transfered = %lld\n", wc->num_bytes_transfered);
-        CERROR("  completion_status = %d\n", wc->completion_status);
+        rf->rf_addr_lo = addr & 0xffffffff;             /* low 32 bits of addr */
+        rf->rf_addr_hi = (addr >> 32) & 0xffffffff;     /* high 32 bits of addr */
+        rf->rf_nob = nob;                               /* int nob stored into a __u32 field */
 }
-#else
-#define dump_wc(a)
-#endif
-
-#if 0
-static void hexdump(char *string, void *ptr, int len)
-{
-        unsigned char *c = ptr;
-        int i;
-
-        if (len < 0 || len > 2048)  {
-                printk("XXX what the hell? %d\n",len);
-                return;
-        }
-
-        printk("%d bytes of '%s' from 0x%p\n", len, string, ptr);
 
-        for (i = 0; i < len;) {
-                printk("%02x",*(c++));
-                i++;
-                if (!(i & 15)) {
-                        printk("\n");
-                } else if (!(i&1)) {
-                        printk(" ");
-                }
-        }
-
-        if(len & 15) {
-                printk("\n");
-        }
+static inline int
+kibnal_rd_size (kib_rdma_desc_t *rd)            /* total bytes described by rd */
+{
+        int   i;
+        int   size;                             /* running sum of rf_nob */
+        
+        for (i = size = 0; i < rd->rd_nfrag; i++)       /* visit all rd_nfrag entries of rd_frags[] */
+                size += rd->rd_frags[i].rf_nob;
+        
+        return size;                            /* NOTE(review): int sum of __u32 values with no overflow check -- confirm callers bound rd */
 }
-#else
-#define hexdump(a,b,c)
-#endif
-
-/*--------------------------------------------------------------------------*/
-
-
-extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid);
-extern void kibnal_destroy_peer (kib_peer_t *peer);
-extern int kibnal_del_peer (ptl_nid_t nid, int single_share);
-extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid);
-extern void kibnal_unlink_peer_locked (kib_peer_t *peer);
-extern int  kibnal_close_stale_conns_locked (kib_peer_t *peer,
-                                              __u64 incarnation);
-extern kib_conn_t *kibnal_create_conn (void);
-extern void kibnal_put_conn (kib_conn_t *conn);
-extern void kibnal_destroy_conn (kib_conn_t *conn);
-extern void kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg);
-
-extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access);
-extern void kibnal_free_pages (kib_pages_t *p);
-
-extern void kibnal_check_sends (kib_conn_t *conn);
-extern void kibnal_close_conn_locked (kib_conn_t *conn, int error);
-extern void kibnal_destroy_conn (kib_conn_t *conn);
-extern int  kibnal_thread_start (int (*fn)(void *arg), void *arg);
-extern int  kibnal_scheduler(void *arg);
-extern int  kibnal_connd (void *arg);
-extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob);
-extern void kibnal_close_conn (kib_conn_t *conn, int why);
-extern void kibnal_start_active_rdma (int type, int status,
-                                      kib_rx_t *rx, lib_msg_t *libmsg,
-                                      unsigned int niov,
-                                      struct iovec *iov, ptl_kiov_t *kiov,
-                                      size_t offset, size_t nob);
-
-void kibnal_ca_async_callback(vv_event_record_t ev);
-void kibnal_ca_callback (unsigned long context);
-extern void vibnal_mad_received_cb(gsi_class_handle_t handle, void *context, gsi_dtgrm_t * dtgrm);
-extern void vibnal_mad_sent_cb(gsi_class_handle_t handle, void *context, gsi_dtgrm_t * dtgrm);
-extern int kibnal_advertize_op(ptl_nid_t nid, int op, sa_request_cb_t callback, void *context);
-extern int vibnal_start_sa_request(struct sa_request *request);
-extern struct sa_request *alloc_sa_request(void);
-extern void free_sa_request(struct sa_request *request);
-extern int kibnal_pathrecord_op(struct sa_request *request, vv_gid_t dgid, sa_request_cb_t callback, void *context);
index e21d62f..ee860f0 100644 (file)
 
 #include "vibnal.h"
 
-static void kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg);
-
-/*
- *  LIB functions follow
- *
- */
-static void
-kibnal_schedule_tx_done (kib_tx_t *tx)
-{
-        unsigned long flags;
-
-        spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);
-
-        list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
-        wake_up (&kibnal_data.kib_sched_waitq);
-
-        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
-}
-
-static void
+void
 kibnal_tx_done (kib_tx_t *tx)
 {
         ptl_err_t        ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
-        unsigned long    flags;
         int              i;
-        vv_return_t retval;
 
+        LASSERT (!in_interrupt());
         LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting callback */
-        LASSERT (!tx->tx_passive_rdma_wait);    /* mustn't be awaiting RDMA */
+        LASSERT (!tx->tx_waiting);              /* mustn't be awaiting peer response */
 
+#if !IBNAL_WHOLE_MEM
         switch (tx->tx_mapped) {
         default:
                 LBUG();
@@ -61,35 +42,17 @@ kibnal_tx_done (kib_tx_t *tx)
         case KIB_TX_UNMAPPED:
                 break;
 
-        case KIB_TX_MAPPED:
-                if (in_interrupt()) {
-                        /* can't deregister memory in IRQ context... */
-                        kibnal_schedule_tx_done(tx);
-                        return;
-                }
-                retval = vv_mem_region_destroy(kibnal_data.kib_hca, tx->tx_md.md_handle);
-                LASSERT (retval == vv_return_ok);
-                tx->tx_mapped = KIB_TX_UNMAPPED;
-                break;
-
-#if IBNAL_FMR
-        case KIB_TX_MAPPED_FMR:
-                if (in_interrupt() && tx->tx_status != 0) {
-                        /* can't flush FMRs in IRQ context... */
-                        kibnal_schedule_tx_done(tx);
-                        return;
-                }              
-
-                rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr);
-                LASSERT (rc == 0);
+        case KIB_TX_MAPPED: {
+                vv_return_t      vvrc;
 
-                if (tx->tx_status != 0)
-                        ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
+                vvrc = vv_mem_region_destroy(kibnal_data.kib_hca,
+                                             tx->tx_md.md_handle);
+                LASSERT (vvrc == vv_return_ok);
                 tx->tx_mapped = KIB_TX_UNMAPPED;
                 break;
-#endif
         }
-
+        }
+#endif
         for (i = 0; i < 2; i++) {
                 /* tx may have up to 2 libmsgs to finalise */
                 if (tx->tx_libmsg[i] == NULL)
@@ -100,15 +63,14 @@ kibnal_tx_done (kib_tx_t *tx)
         }
         
         if (tx->tx_conn != NULL) {
-                kibnal_put_conn (tx->tx_conn);
+                kibnal_conn_decref(tx->tx_conn);
                 tx->tx_conn = NULL;
         }
 
-        tx->tx_nsp = 0;
-        tx->tx_passive_rdma = 0;
+        tx->tx_nwrq = 0;
         tx->tx_status = 0;
 
-        spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+        spin_lock(&kibnal_data.kib_tx_lock);
 
         if (tx->tx_isnblk) {
                 list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
@@ -117,18 +79,17 @@ kibnal_tx_done (kib_tx_t *tx)
                 wake_up (&kibnal_data.kib_idle_tx_waitq);
         }
 
-        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+        spin_unlock(&kibnal_data.kib_tx_lock);
 }
 
-static kib_tx_t *
+kib_tx_t *
 kibnal_get_idle_tx (int may_block) 
 {
-        unsigned long  flags;
         kib_tx_t      *tx = NULL;
         ENTRY;
         
         for (;;) {
-                spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+                spin_lock(&kibnal_data.kib_tx_lock);
 
                 /* "normal" descriptor is free */
                 if (!list_empty (&kibnal_data.kib_idle_txs)) {
@@ -150,7 +111,7 @@ kibnal_get_idle_tx (int may_block)
                 }
 
                 /* block for idle tx */
-                spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+                spin_unlock(&kibnal_data.kib_tx_lock);
 
                 wait_event (kibnal_data.kib_idle_tx_waitq,
                             !list_empty (&kibnal_data.kib_idle_txs) ||
@@ -160,410 +121,437 @@ kibnal_get_idle_tx (int may_block)
         if (tx != NULL) {
                 list_del (&tx->tx_list);
 
-                /* Allocate a new passive RDMA completion cookie.  It might
-                 * not be needed, but we've got a lock right now and we're
-                 * unlikely to wrap... */
-                tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
-
+                /* Allocate a new completion cookie.  It might not be needed,
+                 * but we've got a lock right now and we're unlikely to
+                 * wrap... */
+                tx->tx_cookie = kibnal_data.kib_next_tx_cookie++;
+#if IBNAL_WHOLE_MEM
                 LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
-                LASSERT (tx->tx_nsp == 0);
+#endif
+                LASSERT (tx->tx_nwrq == 0);
                 LASSERT (tx->tx_sending == 0);
+                LASSERT (!tx->tx_waiting);
                 LASSERT (tx->tx_status == 0);
                 LASSERT (tx->tx_conn == NULL);
-                LASSERT (!tx->tx_passive_rdma);
-                LASSERT (!tx->tx_passive_rdma_wait);
                 LASSERT (tx->tx_libmsg[0] == NULL);
                 LASSERT (tx->tx_libmsg[1] == NULL);
         }
 
-        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+        spin_unlock(&kibnal_data.kib_tx_lock);
         
         RETURN(tx);
 }
 
-static int
-kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
-{
-        /* I would guess that if kibnal_get_peer (nid) == NULL,
-           and we're not routing, then 'nid' is very distant :) */
-        if ( nal->libnal_ni.ni_pid.nid == nid ) {
-                *dist = 0;
-        } else {
-                *dist = 1;
-        }
-
-        return 0;
-}
-
-static void
-kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
-{
-        struct list_head *ttmp;
-        unsigned long     flags;
-        int               idle;
-
-        spin_lock_irqsave (&conn->ibc_lock, flags);
-
-        list_for_each (ttmp, &conn->ibc_active_txs) {
-                kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);
-
-                LASSERT (tx->tx_passive_rdma ||
-                         !tx->tx_passive_rdma_wait);
-
-                LASSERT (tx->tx_passive_rdma_wait ||
-                         tx->tx_sending != 0);
-
-                if (!tx->tx_passive_rdma_wait ||
-                    tx->tx_passive_rdma_cookie != cookie)
-                        continue;
-
-                CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);
-
-                tx->tx_status = status;
-                tx->tx_passive_rdma_wait = 0;
-                idle = (tx->tx_sending == 0);
-
-                if (idle)
-                        list_del (&tx->tx_list);
-
-                spin_unlock_irqrestore (&conn->ibc_lock, flags);
-
-                /* I could be racing with tx callbacks.  It's whoever
-                 * _makes_ tx idle that frees it */
-                if (idle)
-                        kibnal_tx_done (tx);
-                return;
-        }
-                
-        spin_unlock_irqrestore (&conn->ibc_lock, flags);
-
-        CERROR ("Unmatched (late?) RDMA completion "LPX64" from "LPX64"\n",
-                cookie, conn->ibc_peer->ibp_nid);
-}
-
-static void
-kibnal_post_rx (kib_rx_t *rx, int do_credits)
+int
+kibnal_post_rx (kib_rx_t *rx, int credit)
 {
         kib_conn_t   *conn = rx->rx_conn;
         int           rc = 0;
-        unsigned long flags;
-        vv_return_t retval;
+        vv_return_t   vvrc;
 
-        ENTRY;
+        LASSERT (!in_interrupt());
         
         rx->rx_gl = (vv_scatgat_t) {
-                .v_address = (void *)rx->rx_msg,
+                .v_address = (void *)((unsigned long)KIBNAL_RX_VADDR(rx)),
+                .l_key     = KIBNAL_RX_LKEY(rx),
                 .length    = IBNAL_MSG_SIZE,
-                .l_key     = rx->l_key,
         };
 
         rx->rx_wrq = (vv_wr_t) {
-                .wr_id                   = kibnal_ptr2wreqid(rx, 1),
+                .wr_id                   = (unsigned long)rx,
                 .completion_notification = 1,
                 .scatgat_list            = &rx->rx_gl,
                 .num_of_data_segments    = 1,
                 .wr_type                 = vv_wr_receive,
         };
 
-        KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED,
-                                    IBNAL_CONN_DREP);
+        LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
         LASSERT (!rx->rx_posted);
-        rx->rx_posted = 1;
-        mb();
 
-        if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
-                rc = -ECONNABORTED;
-        else {
-                retval = vv_post_receive(kibnal_data.kib_hca, conn->ibc_qp, &rx->rx_wrq);
+        CDEBUG(D_NET, "posting rx [%d %x %p]\n", 
+               rx->rx_wrq.scatgat_list->length,
+               rx->rx_wrq.scatgat_list->l_key,
+               rx->rx_wrq.scatgat_list->v_address);
 
-                if (retval) {
-                        CDEBUG(D_NET, "post failed %d\n", retval);
-                        rc = -EINVAL;
-                }
-                CDEBUG(D_NET, "posted rx %p\n", &rx->rx_wrq);
+        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) {
+                /* No more posts for this rx; so lose its ref */
+                kibnal_conn_decref(conn);
+                return 0;
         }
+        
+        rx->rx_posted = 1;
+
+        spin_lock(&conn->ibc_lock);
+        /* Serialise vv_post_receive; it's not re-entrant on the same QP */
+        vvrc = vv_post_receive(kibnal_data.kib_hca,
+                               conn->ibc_qp, &rx->rx_wrq);
+        spin_unlock(&conn->ibc_lock);
 
-        if (rc == 0) {
-                if (do_credits) {
-                        spin_lock_irqsave(&conn->ibc_lock, flags);
+        if (vvrc == 0) {
+                if (credit) {
+                        spin_lock(&conn->ibc_lock);
                         conn->ibc_outstanding_credits++;
-                        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+                        spin_unlock(&conn->ibc_lock);
 
                         kibnal_check_sends(conn);
                 }
-                EXIT;
-                return;
+                return 0;
         }
+        
+        CERROR ("post rx -> "LPX64" failed %d\n", 
+                conn->ibc_peer->ibp_nid, vvrc);
+        rc = -EIO;
+        kibnal_close_conn(rx->rx_conn, rc);
+        /* No more posts for this rx; so lose its ref */
+        kibnal_conn_decref(conn);
+        return rc;
+}
 
-        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
-                CERROR ("Error posting receive -> "LPX64": %d\n",
-                        conn->ibc_peer->ibp_nid, rc);
-                kibnal_close_conn (rx->rx_conn, rc);
-        } else {
-                CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n",
-                        conn->ibc_peer->ibp_nid, rc);
+int
+kibnal_post_receives (kib_conn_t *conn)
+{
+        int    i;
+        int    rc;
+
+        LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
+        LASSERT (conn->ibc_comms_error == 0);
+
+        for (i = 0; i < IBNAL_RX_MSGS; i++) {
+                /* +1 ref for rx desc.  This ref remains until kibnal_post_rx
+                 * fails (i.e. actual failure or we're disconnecting) */
+                kibnal_conn_addref(conn);
+                rc = kibnal_post_rx (&conn->ibc_rxs[i], 0);
+                if (rc != 0)
+                        return rc;
         }
 
-        /* Drop rx's ref */
-        kibnal_put_conn (conn);
-        EXIT;
+        return 0;
 }
 
-#if IBNAL_CKSUM
-static inline __u32 kibnal_cksum (void *ptr, int nob)
+kib_tx_t *
+kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
 {
-        char  *c  = ptr;
-        __u32  sum = 0;
-
-        while (nob-- > 0)
-                sum = ((sum << 1) | (sum >> 31)) + *c++;
+        struct list_head   *tmp;
         
-        return (sum);
-}
-#endif
+        list_for_each(tmp, &conn->ibc_active_txs) {
+                kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
+                
+                LASSERT (tx->tx_sending != 0 || tx->tx_waiting);
 
-static void
-kibnal_rx_callback (vv_wc_t *wc)
-{
-        kib_rx_t     *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->wr_id);
-        kib_msg_t    *msg = rx->rx_msg;
-        kib_conn_t   *conn = rx->rx_conn;
-        int           nob = wc->num_bytes_transfered;
-        const int     base_nob = offsetof(kib_msg_t, ibm_u);
-        int           credits;
-        int           flipped;
-        unsigned long flags;
-        __u32         i;
-#if IBNAL_CKSUM
-        __u32         msg_cksum;
-        __u32         computed_cksum;
-#endif
+                if (tx->tx_cookie != cookie)
+                        continue;
 
-        /* we set the QP to erroring after we've finished disconnecting, 
-         * maybe we should do so sooner. */
-        KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, 
-                                    IBNAL_CONN_DISCONNECTED);
+                if (tx->tx_waiting &&
+                    tx->tx_msg->ibm_type == txtype)
+                        return tx;
 
-        CDEBUG(D_NET, "rx %p conn %p, nob=%d\n", rx, conn, nob);
+                CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
+                      tx->tx_waiting ? "" : "NOT ",
+                      tx->tx_msg->ibm_type, txtype);
+        }
+        return NULL;
+}
 
-        LASSERT (rx->rx_posted);
-        rx->rx_posted = 0;
-        mb();
+void
+kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
+{
+        kib_tx_t    *tx;
+        int          idle;
 
-        /* receives complete with error in any case after we've started
-         * disconnecting */
-        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
-                goto failed;
+        spin_lock(&conn->ibc_lock);
 
-        if (wc->completion_status != vv_comp_status_success) {
-                CERROR("Rx from "LPX64" failed: %d\n", 
-                       conn->ibc_peer->ibp_nid, wc->completion_status);
-                goto failed;
-        }
+        tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie);
+        if (tx == NULL) {
+                spin_unlock(&conn->ibc_lock);
 
-        if (nob < base_nob) {
-                CERROR ("Short rx from "LPX64": %d < expected %d\n",
-                        conn->ibc_peer->ibp_nid, nob, base_nob);
-                goto failed;
+                CWARN("Unmatched completion type %x cookie "LPX64
+                      " from "LPX64"\n",
+                      txtype, cookie, conn->ibc_peer->ibp_nid);
+                kibnal_close_conn (conn, -EPROTO);
+                return;
         }
 
-        /* Receiver does any byte flipping if necessary... */
+        if (tx->tx_status == 0) {               /* success so far */
+                if (status < 0) {               /* failed? */
+                        tx->tx_status = status;
+                } else if (txtype == IBNAL_MSG_GET_REQ) { 
+                        /* XXX layering violation: set REPLY data length */
+                        LASSERT (tx->tx_libmsg[1] != NULL);
+                        LASSERT (tx->tx_libmsg[1]->ev.type == 
+                                 PTL_EVENT_REPLY_END);
 
-        if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
-                flipped = 0;
-        } else {
-                if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
-                        CERROR ("Unrecognised magic: %08x from "LPX64"\n", 
-                                msg->ibm_magic, conn->ibc_peer->ibp_nid);
-                        goto failed;
+                        tx->tx_libmsg[1]->ev.mlength = status;
                 }
-                flipped = 1;
-                __swab16s (&msg->ibm_version);
-                LASSERT (sizeof(msg->ibm_type) == 1);
-                LASSERT (sizeof(msg->ibm_credits) == 1);
         }
+        
+        tx->tx_waiting = 0;
 
-        if (msg->ibm_version != IBNAL_MSG_VERSION) {
-                CERROR ("Incompatible msg version %d (%d expected)\n",
-                        msg->ibm_version, IBNAL_MSG_VERSION);
-                goto failed;
-        }
+        idle = tx->tx_sending == 0;
+        if (idle)
+                list_del(&tx->tx_list);
 
-#if IBNAL_CKSUM
-        if (nob != msg->ibm_nob) {
-                CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob);
-                goto failed;
-        }
+        spin_unlock(&conn->ibc_lock);
+        
+        if (idle)
+                kibnal_tx_done(tx);
+}
 
-        msg_cksum = le32_to_cpu(msg->ibm_cksum);
-        msg->ibm_cksum = 0;
-        computed_cksum = kibnal_cksum (msg, nob);
+void
+kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie) 
+{
+        kib_tx_t    *tx = kibnal_get_idle_tx(0);
         
-        if (msg_cksum != computed_cksum) {
-                CERROR ("Checksum failure %d: (%d expected)\n",
-                        computed_cksum, msg_cksum);
-//                goto failed;
+        if (tx == NULL) {
+                CERROR("Can't get tx for completion %x for "LPX64"\n",
+                       type, conn->ibc_peer->ibp_nid);
+                return;
         }
-        CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob);
-#endif
+        
+        tx->tx_msg->ibm_u.completion.ibcm_status = status;
+        tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
+        kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t));
+        
+        kibnal_queue_tx(tx, conn);
+}
+
+void
+kibnal_handle_rx (kib_rx_t *rx)
+{
+        kib_msg_t    *msg = rx->rx_msg;
+        kib_conn_t   *conn = rx->rx_conn;
+        int           credits = msg->ibm_credits;
+        kib_tx_t     *tx;
+        int           rc;
+
+        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
 
-        /* Have I received credits that will let me send? */
-        credits = msg->ibm_credits;
+        CDEBUG (D_NET, "Received %x[%d] from "LPX64"\n",
+                msg->ibm_type, credits, conn->ibc_peer->ibp_nid);
+        
         if (credits != 0) {
-                spin_lock_irqsave(&conn->ibc_lock, flags);
+                /* Have I received credits that will let me send? */
+                spin_lock(&conn->ibc_lock);
                 conn->ibc_credits += credits;
-                spin_unlock_irqrestore(&conn->ibc_lock, flags);
-                
+                spin_unlock(&conn->ibc_lock);
+
                 kibnal_check_sends(conn);
         }
 
         switch (msg->ibm_type) {
+        default:
+                CERROR("Bad IBNAL message type %x from "LPX64"\n",
+                       msg->ibm_type, conn->ibc_peer->ibp_nid);
+                break;
+
         case IBNAL_MSG_NOOP:
-                kibnal_post_rx (rx, 1);
-                return;
+                break;
 
         case IBNAL_MSG_IMMEDIATE:
-                if (nob < base_nob + sizeof (kib_immediate_msg_t)) {
-                        CERROR ("Short IMMEDIATE from "LPX64": %d\n",
-                                conn->ibc_peer->ibp_nid, nob);
-                        goto failed;
-                }
+                lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
                 break;
                 
-        case IBNAL_MSG_PUT_RDMA:
-        case IBNAL_MSG_GET_RDMA:
-                if (nob < base_nob + sizeof (kib_rdma_msg_t)) {
-                        CERROR ("Short RDMA msg from "LPX64": %d\n",
-                                conn->ibc_peer->ibp_nid, nob);
-                        goto failed;
-                }
-                if (flipped) 
-                        __swab32(msg->ibm_u.rdma.ibrm_num_descs);
-
-                CDEBUG(D_NET, "%d RDMA: cookie "LPX64":\n",
-                       msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie);
-
-                if ((msg->ibm_u.rdma.ibrm_num_descs > PTL_MD_MAX_IOV) ||
-                    (kib_rdma_msg_len(msg->ibm_u.rdma.ibrm_num_descs) > 
-                     min(nob, IBNAL_MSG_SIZE))) {
-                        CERROR ("num_descs %d too large\n", 
-                                msg->ibm_u.rdma.ibrm_num_descs);
-                        goto failed;
-                }
+        case IBNAL_MSG_PUT_REQ:
+                rx->rx_responded = 0;
+                lib_parse(&kibnal_lib, &msg->ibm_u.putreq.ibprm_hdr, rx);
+                if (rx->rx_responded)
+                        break;
 
-                if (flipped) {
-                        __swab32(msg->ibm_u.rdma.rd_key);
-                }
+                /* I wasn't asked to transfer any payload data.  This happens
+                 * if the PUT didn't match, or got truncated. */
+                kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0,
+                                       msg->ibm_u.putreq.ibprm_cookie);
+                break;
 
-                for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) {
-                        kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i];
+        case IBNAL_MSG_PUT_NAK:
+                CWARN ("PUT_NACK from "LPX64"\n", conn->ibc_peer->ibp_nid);
+                kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ, 
+                                         msg->ibm_u.completion.ibcm_status,
+                                         msg->ibm_u.completion.ibcm_cookie);
+                break;
 
-                        if (flipped) {
-                                __swab32(desc->rd_nob);
-                                __swab64(desc->rd_addr);
-                        }
+        case IBNAL_MSG_PUT_ACK:
+                spin_lock(&conn->ibc_lock);
+                tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ,
+                                                   msg->ibm_u.putack.ibpam_src_cookie);
+                if (tx != NULL)
+                        list_del(&tx->tx_list);
+                spin_unlock(&conn->ibc_lock);
 
-                        CDEBUG(D_NET, "  key %x, " "addr "LPX64", nob %u\n",
-                               msg->ibm_u.rdma.rd_key, desc->rd_addr, desc->rd_nob);
+                if (tx == NULL) {
+                        CERROR("Unmatched PUT_ACK from "LPX64"\n",
+                               conn->ibc_peer->ibp_nid);
+                        kibnal_close_conn(conn, -EPROTO);
+                        break;
                 }
+
+                LASSERT (tx->tx_waiting);
+                /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
+                 * (a) I can overwrite tx_msg since my peer has received it!
+                 * (b) while tx_waiting is set, tx_complete() won't touch it.
+                 */
+
+                tx->tx_nwrq = 0;                /* overwrite PUT_REQ */
+
+                rc = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE, 
+                                      kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd),
+                                      &msg->ibm_u.putack.ibpam_rd,
+                                      msg->ibm_u.putack.ibpam_dst_cookie);
+                if (rc < 0)
+                        CERROR("Can't setup rdma for PUT to "LPX64": %d\n",
+                               conn->ibc_peer->ibp_nid, rc);
+
+                spin_lock(&conn->ibc_lock);
+                if (tx->tx_status == 0 && rc < 0)
+                        tx->tx_status = rc;
+                tx->tx_waiting = 0;             /* clear waiting and queue atomically */
+                kibnal_queue_tx_locked(tx, conn);
+                spin_unlock(&conn->ibc_lock);
                 break;
-                        
+                
         case IBNAL_MSG_PUT_DONE:
+                kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK,
+                                         msg->ibm_u.completion.ibcm_status,
+                                         msg->ibm_u.completion.ibcm_cookie);
+                break;
+
+        case IBNAL_MSG_GET_REQ:
+                rx->rx_responded = 0;
+                lib_parse(&kibnal_lib, &msg->ibm_u.get.ibgm_hdr, rx);
+                if (rx->rx_responded)           /* I responded to the GET_REQ */
+                        break;
+                /* NB GET didn't match (I'd have responded even with no payload
+                 * data) */
+                kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE, -ENODATA,
+                                       msg->ibm_u.get.ibgm_cookie);
+                break;
+
         case IBNAL_MSG_GET_DONE:
-                if (nob < base_nob + sizeof (kib_completion_msg_t)) {
-                        CERROR ("Short COMPLETION msg from "LPX64": %d\n",
-                                conn->ibc_peer->ibp_nid, nob);
-                        goto failed;
-                }
-                if (flipped)
-                        __swab32s(&msg->ibm_u.completion.ibcm_status);
-                
-                CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
-                       msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
-                       msg->ibm_u.completion.ibcm_status);
-
-                kibnal_complete_passive_rdma (conn, 
-                                              msg->ibm_u.completion.ibcm_cookie,
-                                              msg->ibm_u.completion.ibcm_status);
-                kibnal_post_rx (rx, 1);
-                return;
-                        
-        default:
-                CERROR ("Can't parse type from "LPX64": %d\n",
-                        conn->ibc_peer->ibp_nid, msg->ibm_type);
+                kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ,
+                                         msg->ibm_u.completion.ibcm_status,
+                                         msg->ibm_u.completion.ibcm_cookie);
+                break;
+        }
+
+        kibnal_post_rx(rx, 1);
+}
+
+void
+kibnal_rx_complete (kib_rx_t *rx, int nob, vv_comp_status_t vvrc)
+{
+        kib_msg_t    *msg = rx->rx_msg;
+        kib_conn_t   *conn = rx->rx_conn;
+        unsigned long flags;
+        int           rc;
+
+        CDEBUG (D_NET, "rx %p conn %p\n", rx, conn);
+        LASSERT (rx->rx_posted);
+        rx->rx_posted = 0;
+
+        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
+                goto ignore;
+
+        if (vvrc != vv_comp_status_success) {
+                CERROR("Rx from "LPX64" failed: %d\n", 
+                       conn->ibc_peer->ibp_nid, vvrc);
                 goto failed;
         }
 
-        /* schedule for kibnal_rx() in thread context */
-        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
-        
-        list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
-        wake_up (&kibnal_data.kib_sched_waitq);
-        
-        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+        rc = kibnal_unpack_msg(msg, nob);
+        if (rc != 0) {
+                CERROR ("Error %d unpacking rx from "LPX64"\n",
+                        rc, conn->ibc_peer->ibp_nid);
+                goto failed;
+        }
+
+        if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
+            msg->ibm_srcstamp != conn->ibc_incarnation ||
+            msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
+            msg->ibm_dststamp != kibnal_data.kib_incarnation) {
+                CERROR ("Stale rx from "LPX64"\n",
+                        conn->ibc_peer->ibp_nid);
+                goto failed;
+        }
+
+        /* racing with connection establishment/teardown! */
 
+        if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
+                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+                /* must check holding global lock to eliminate race */
+                if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
+                        list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
+                        write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
+                                                flags);
+                        return;
+                }
+                write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
+                                        flags);
+        }
+        kibnal_handle_rx(rx);
         return;
         
  failed:
         CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
-        kibnal_close_conn(conn, -ECONNABORTED);
-
+        kibnal_close_conn(conn, -EIO);
+ ignore:
         /* Don't re-post rx & drop its ref on conn */
-        kibnal_put_conn(conn);
+        kibnal_conn_decref(conn);
 }
 
-static void
-kibnal_rx (kib_rx_t *rx)
+#if IBNAL_WHOLE_MEM
+int
+kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page, 
+                     unsigned long page_offset, unsigned long len)
 {
-        kib_msg_t   *msg = rx->rx_msg;
+        kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag];
+        vv_l_key_t       l_key;
+        vv_r_key_t       r_key;
+        void            *addr;
+        void            *vaddr;
+        vv_mem_reg_h_t   mem_h;
+        vv_return_t      vvrc;
 
-        /* Clear flag so I can detect if I've sent an RDMA completion */
-        rx->rx_rdma = 0;
+        if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) {
+                CERROR ("Too many RDMA fragments\n");
+                return -EMSGSIZE;
+        }
 
-        switch (msg->ibm_type) {
-        case IBNAL_MSG_GET_RDMA:
-                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
-                /* If the incoming get was matched, I'll have initiated the
-                 * RDMA and the completion message... */
-                if (rx->rx_rdma)
-                        break;
+        addr = (void *)(((unsigned long)kmap(page)) + page_offset);
 
-                /* Otherwise, I'll send a failed completion now to prevent
-                 * the peer's GET blocking for the full timeout. */
-                CERROR ("Completing unmatched RDMA GET from "LPX64"\n",
-                        rx->rx_conn->ibc_peer->ibp_nid);
-                kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO,
-                                          rx, NULL, 0, NULL, NULL, 0, 0);
-                break;
-                
-        case IBNAL_MSG_PUT_RDMA:
-                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
-                if (rx->rx_rdma)
-                        break;
-                /* This is most unusual, since even if lib_parse() didn't
-                 * match anything, it should have asked us to read (and
-                 * discard) the payload.  The portals header must be
-                 * inconsistent with this message type, so it's the
-                 * sender's fault for sending garbage and she can time
-                 * herself out... */
-                CERROR ("Uncompleted RMDA PUT from "LPX64"\n",
-                        rx->rx_conn->ibc_peer->ibp_nid);
-                break;
+        vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, addr,
+                                    len, &mem_h, &l_key, &r_key);
+        LASSERT (vvrc == vv_return_ok);
 
-        case IBNAL_MSG_IMMEDIATE:
-                lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
-                LASSERT (!rx->rx_rdma);
-                break;
-                
-        default:
-                LBUG();
-                break;
+        kunmap(page);
+
+        if (active) {
+                if (rd->rd_nfrag == 0) {
+                        rd->rd_key = l_key;
+                } else if (l_key != rd->rd_key) {
+                        CERROR ("> 1 key for single RDMA desc\n");
+                        return -EINVAL;
+                }
+                vaddr = addr;
+        } else {
+                if (rd->rd_nfrag == 0) {
+                        rd->rd_key = r_key;
+                } else if (r_key != rd->rd_key) {
+                        CERROR ("> 1 key for single RDMA desc\n");
+                        return -EINVAL;
+                }
+                vv_va2advertise_addr(kibnal_data.kib_hca, addr, &vaddr);
         }
 
-        kibnal_post_rx (rx, 1);
+        kibnal_rf_set(frag, (unsigned long)vaddr, len);
+
+        CDEBUG(D_NET,"map frag [%d][%d %x %08x%08x] %p\n", 
+               rd->rd_nfrag, frag->rf_nob, rd->rd_key, 
+               frag->rf_addr_hi, frag->rf_addr_lo, addr);
+
+        rd->rd_nfrag++;
+        return 0;
 }
 
-static struct page *
+struct page *
 kibnal_kvaddr_to_page (unsigned long vaddr)
 {
         struct page *page;
@@ -580,93 +568,26 @@ kibnal_kvaddr_to_page (unsigned long vaddr)
         else
                 page = virt_to_page (vaddr);
 
-        if (!VALID_PAGE (page))
-                page = NULL;
-
-        return page;
-}
-
-static void
-kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset,
-                 unsigned long len, int active)
-{
-        kib_rdma_msg_t *ibrm = &tx->tx_msg->ibm_u.rdma;
-        kib_rdma_desc_t *desc;
-        vv_l_key_t l_key;
-        vv_r_key_t r_key;
-        void *addr;
-        vv_mem_reg_h_t mem_h;
-        vv_return_t retval;
-
-        LASSERTF(ibrm->ibrm_num_descs < PTL_MD_MAX_IOV, "%u\n", 
-                 ibrm->ibrm_num_descs);
-
-        desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs];
-
-        addr = page_address(page) + page_offset;
-
-        /* TODO: This next step is only needed to get either the lkey
-         * or the rkey. However they should be the same than for the
-         * tx buffer, so we might as well use it. */
-        retval = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
-                                      addr,
-                                      len,
-                                      &mem_h,
-                                      &l_key,
-                                      &r_key);
-        if (retval) {
-                CERROR("vv_get_gen_mr_attrib failed: %d", retval);
-                /* TODO: this shouldn't really fail, but what if? */
-                return;
-        }
-
-        if (active) {
-                ibrm->rd_key = l_key;
-        } else {
-                ibrm->rd_key = r_key;
-
-                vv_va2advertise_addr(kibnal_data.kib_hca, addr, &addr);
-        }
-
-        desc->rd_addr = (__u64)(unsigned long)addr;
-        desc->rd_nob = len; /*PAGE_SIZE - kiov->kiov_offset; */
-
-        ibrm->ibrm_num_descs++;
+        return VALID_PAGE(page) ? page : NULL;
 }
 
-static int
-kibnal_map_rdma_iov(kib_tx_t *tx, unsigned long vaddr, int nob, int active)
-{
-        struct page *page;
-        int page_offset, len;
-
-        while (nob > 0) {
-                page = kibnal_kvaddr_to_page(vaddr);
-                if (page == NULL)
-                        return -EFAULT;
-
-                page_offset = vaddr & (PAGE_SIZE - 1);
-                len = min(nob, (int)PAGE_SIZE - page_offset);
-                
-                kibnal_fill_ibrm(tx, page, page_offset, len, active);
-                nob -= len;
-                vaddr += len;
-        }
-
-        return 0;
-}
-
-static int
-kibnal_map_iov (kib_tx_t *tx, vv_access_con_bit_mask_t access,
-                 int niov, struct iovec *iov, int offset, int nob, int active)
+int
+kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd, 
+                    vv_access_con_bit_mask_t access,
+                    int niov, struct iovec *iov, int offset, int nob)
                  
 {
-        void   *vaddr;
-        vv_return_t retval;
+        /* active if I'm sending */
+        int           active = ((access & vv_acc_r_mem_write) == 0);
+        int           fragnob;
+        int           rc;
+        unsigned long vaddr;
+        struct page  *page;
+        int           page_offset;
 
         LASSERT (nob > 0);
         LASSERT (niov > 0);
-        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+        LASSERT ((rd != tx->tx_rd) == !active);
 
         while (offset >= iov->iov_len) {
                 offset -= iov->iov_len;
@@ -675,60 +596,154 @@ kibnal_map_iov (kib_tx_t *tx, vv_access_con_bit_mask_t access,
                 LASSERT (niov > 0);
         }
 
-        if (nob > iov->iov_len - offset) {
-                CERROR ("Can't map multiple vaddr fragments\n");
-                return (-EMSGSIZE);
-        }
+        rd->rd_nfrag = 0;
+        do {
+                LASSERT (niov > 0);
 
-        /* our large contiguous iov could be backed by multiple physical
-         * pages. */
-        if (kibnal_whole_mem()) {
-                int rc;
-                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
-                rc = kibnal_map_rdma_iov(tx, (unsigned long)iov->iov_base + 
-                                         offset, nob, active);
-                if (rc != 0) {
-                        CERROR ("Can't map iov: %d\n", rc);
+                vaddr = ((unsigned long)iov->iov_base) + offset;
+                page_offset = vaddr & (PAGE_SIZE - 1);
+                page = kibnal_kvaddr_to_page(vaddr);
+                if (page == NULL) {
+                        CERROR ("Can't find page\n");
+                        return -EFAULT;
+                }
+
+                fragnob = min((int)(iov->iov_len - offset), nob);
+                fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
+
+                rc = kibnal_append_rdfrag(rd, active, page, 
+                                          page_offset, fragnob);
+                if (rc != 0)
                         return rc;
+
+                if (offset + fragnob < iov->iov_len) {
+                        offset += fragnob;
+                } else {
+                        offset = 0;
+                        iov++;
+                        niov--;
                 }
-                return 0;
+                nob -= fragnob;
+        } while (nob > 0);
+        
+        return 0;
+}
+
+int
+kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, 
+                      vv_access_con_bit_mask_t access,
+                      int nkiov, ptl_kiov_t *kiov, int offset, int nob)
+{
+        /* active if I'm sending */
+        int            active = ((access & vv_acc_r_mem_write) == 0);
+        int            fragnob;
+        int            rc;
+
+        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+
+        LASSERT (nob > 0);
+        LASSERT (nkiov > 0);
+        LASSERT ((rd != tx->tx_rd) == !active);
+
+        while (offset >= kiov->kiov_len) {
+                offset -= kiov->kiov_len;
+                nkiov--;
+                kiov++;
+                LASSERT (nkiov > 0);
+        }
+
+        rd->rd_nfrag = 0;
+        do {
+                LASSERT (nkiov > 0);
+                fragnob = min((int)(kiov->kiov_len - offset), nob);
+                
+                rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page,
+                                          kiov->kiov_offset + offset,
+                                          fragnob);
+                if (rc != 0)
+                        return rc;
+
+                offset = 0;
+                kiov++;
+                nkiov--;
+                nob -= fragnob;
+        } while (nob > 0);
+
+        return 0;
+}
+#else
+int
+kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
+                     vv_access_con_bit_mask_t access,
+                     int niov, struct iovec *iov, int offset, int nob)
+                 
+{
+        /* active if I'm sending */
+        int         active = ((access & vv_acc_r_mem_write) == 0);
+        void       *vaddr;
+        vv_return_t vvrc;
+
+        LASSERT (nob > 0);
+        LASSERT (niov > 0);
+        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+        LASSERT ((rd != tx->tx_rd) == !active);
+
+        while (offset >= iov->iov_len) {
+                offset -= iov->iov_len;
+                niov--;
+                iov++;
+                LASSERT (niov > 0);
+        }
+
+        if (nob > iov->iov_len - offset) {
+                CERROR ("Can't map multiple vaddr fragments\n");
+                return (-EMSGSIZE);
         }
 
         vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
         tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
 
-        retval = vv_mem_region_register(kibnal_data.kib_hca, vaddr, nob,
-                                   kibnal_data.kib_pd, access,
-                                   &tx->tx_md.md_handle, &tx->tx_md.md_lkey,
-                                   &tx->tx_md.md_rkey);
-        if (retval != 0) {
-                CERROR ("Can't map vaddr %p: %d\n", vaddr, retval);
-                return -EINVAL;
+        vvrc = vv_mem_region_register(kibnal_data.kib_hca, vaddr, nob,
+                                      kibnal_data.kib_pd, access,
+                                      &tx->tx_md.md_handle, 
+                                      &tx->tx_md.md_lkey,
+                                      &tx->tx_md.md_rkey);
+        if (vvrc != vv_return_ok) {
+                CERROR ("Can't map vaddr %p: %d\n", vaddr, vvrc);
+                return -EFAULT;
         }
 
         tx->tx_mapped = KIB_TX_MAPPED;
+
+        rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
+        rd->rd_nfrag = 1;
+        kibnal_rf_set(&rd->rd_frags[0], tx->tx_md.md_addr, nob);
+        
         return (0);
 }
 
-static int
-kibnal_map_kiov (kib_tx_t *tx, vv_access_con_bit_mask_t access,
-                  int nkiov, ptl_kiov_t *kiov,
-                  int offset, int nob, int active)
+int
+kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
+                      vv_access_con_bit_mask_t access,
+                      int nkiov, ptl_kiov_t *kiov, int offset, int nob)
 {
+        /* active if I'm sending */
+        int            active = ((access & vv_acc_r_mem_write) == 0);
+        vv_return_t    vvrc;
         vv_phy_list_t  phys_pages;
-        vv_phy_buf_t  *phys_buf = NULL;
+        vv_phy_buf_t  *phys;
         int            page_offset;
         int            nphys;
         int            resid;
-        int            phys_size = 0;
-        int            i, rc = 0;
-        vv_return_t    retval;
+        int            phys_size;
+        int            rc;
 
         CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
 
         LASSERT (nob > 0);
         LASSERT (nkiov > 0);
         LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+        LASSERT ((rd != tx->tx_rd) == !active);
 
         while (offset >= kiov->kiov_len) {
                 offset -= kiov->kiov_len;
@@ -737,27 +752,19 @@ kibnal_map_kiov (kib_tx_t *tx, vv_access_con_bit_mask_t access,
                 LASSERT (nkiov > 0);
         }
 
-        page_offset = kiov->kiov_offset + offset;
-        nphys = 1;
-
-        if (!kibnal_whole_mem()) {
-                phys_size = nkiov * sizeof(vv_phy_buf_t);
-                PORTAL_ALLOC(phys_buf, phys_size);
-
-                if (phys_buf == NULL) {
-                        CERROR ("Can't allocate phys_buf\n");
-                        return (-ENOMEM);
-                }
+        phys_size = nkiov * sizeof (*phys);
+        PORTAL_ALLOC(phys, phys_size);
+        if (phys == NULL) {
+                CERROR ("Can't allocate tmp phys\n");
+                return (-ENOMEM);
+        }
 
-                phys_buf[0].start = kibnal_page2phys(kiov->kiov_page);
-                phys_buf[0].size = PAGE_SIZE;
+        page_offset = kiov->kiov_offset + offset;
 
-        } else {
-                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
-                kibnal_fill_ibrm(tx, kiov->kiov_page, kiov->kiov_offset, 
-                                 kiov->kiov_len, active);
-        }
+        phys[0].start = kibnal_page2phys(kiov->kiov_page);
+        phys[0].size = PAGE_SIZE;
 
+        nphys = 1;
         resid = nob - (kiov->kiov_len - offset);
 
         while (resid > 0) {
@@ -768,99 +775,73 @@ kibnal_map_kiov (kib_tx_t *tx, vv_access_con_bit_mask_t access,
                 if (kiov->kiov_offset != 0 ||
                     ((resid > PAGE_SIZE) && 
                      kiov->kiov_len < PAGE_SIZE)) {
+                        int i;
                         /* Can't have gaps */
                         CERROR ("Can't make payload contiguous in I/O VM:"
                                 "page %d, offset %d, len %d \n", nphys, 
                                 kiov->kiov_offset, kiov->kiov_len);
 
-                        for (i = -nphys; i < nkiov; i++) 
-                        {
+                        for (i = -nphys; i < nkiov; i++)
                                 CERROR("kiov[%d] %p +%d for %d\n",
-                                       i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len);
-                        }
+                                       i, kiov[i].kiov_page, 
+                                       kiov[i].kiov_offset, 
+                                       kiov[i].kiov_len);
                         
                         rc = -EINVAL;
                         goto out;
                 }
 
-                if (nphys == PTL_MD_MAX_IOV) {
-                        CERROR ("payload too big (%d)\n", nphys);
-                        rc = -EMSGSIZE;
-                        goto out;
-                }
-
-                if (!kibnal_whole_mem()) {
-                        LASSERT (nphys * sizeof (vv_phy_buf_t) < phys_size);
-                        phys_buf[nphys].start = kibnal_page2phys(kiov->kiov_page);
-                        phys_buf[nphys].size = PAGE_SIZE;
-
-                } else {
-                        if (kib_rdma_msg_len(nphys) > IBNAL_MSG_SIZE) {
-                                CERROR ("payload too big (%d)\n", nphys);
-                                rc = -EMSGSIZE;
-                                goto out;
-                        }
-                        kibnal_fill_ibrm(tx, kiov->kiov_page, 
-                                         kiov->kiov_offset, kiov->kiov_len,
-                                         active);
-                }
+                LASSERT (nphys * sizeof (*phys) < phys_size);
+                phys[nphys].start = kibnal_page2phys(kiov->kiov_page);
+                phys[nphys].size = PAGE_SIZE;
 
-                nphys ++;
+                nphys++;
                 resid -= PAGE_SIZE;
         }
 
-        if (kibnal_whole_mem())
-                goto out;
-
 #if 0
         CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset);
         for (i = 0; i < nphys; i++)
                 CWARN ("   [%d] "LPX64"\n", i, phys[i]);
 #endif
 
-#if IBNAL_FMR
-#error "vibnal hasn't learned about FMR yet"
-        rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
-                                       phys_pages, nphys,
-                                       &tx->tx_md.md_addr,
-                                       page_offset,
-                                       &tx->tx_md.md_handle.fmr,
-                                       &tx->tx_md.md_lkey,
-                                       &tx->tx_md.md_rkey);
-#else
-        retval = vv_phy_mem_region_register(kibnal_data.kib_hca,
-                                            &phys_pages,
-                                            IBNAL_RDMA_BASE,
-                                            nphys,
-                                            0,          /* offset */
-                                            kibnal_data.kib_pd,
-                                            vv_acc_l_mem_write | vv_acc_r_mem_write | vv_acc_r_mem_read | vv_acc_mem_bind, /* TODO: translated as-is, but seems incorrect or too much */
-                                            &tx->tx_md.md_handle,
-                                            &tx->tx_md.md_addr,
-                                            &tx->tx_md.md_lkey,
-                                            &tx->tx_md.md_rkey);
-#endif
-        if (retval == vv_return_ok) {
-                CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n",
-                       nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey);
-#if IBNAL_FMR
-                tx->tx_mapped = KIB_TX_MAPPED_FMR;
-#else
-                tx->tx_mapped = KIB_TX_MAPPED;
-#endif
-        } else {
-                CERROR ("Can't map phys_pages: %d\n", retval);
+        vvrc = vv_phy_mem_region_register(kibnal_data.kib_hca,
+                                          &phys_pages,
+                                          IBNAL_RDMA_BASE,
+                                          nphys,
+                                          page_offset,
+                                          kibnal_data.kib_pd,
+                                          access,
+                                          &tx->tx_md.md_handle,
+                                          &tx->tx_md.md_addr,
+                                          &tx->tx_md.md_lkey,
+                                          &tx->tx_md.md_rkey);
+
+        if (vvrc != vv_return_ok) {
+                CERROR ("Can't map phys: %d\n", vvrc);
                 rc = -EFAULT;
+                goto out;
         }
 
- out:
-        if (phys_buf != NULL)
-                PORTAL_FREE(phys_buf, phys_size);
+        CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: "
+               "lkey %x, rkey %x, addr "LPX64"\n",
+               nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey,
+               tx->tx_md.md_addr);
+
+        tx->tx_mapped = KIB_TX_MAPPED;
+        rc = 0;
 
+        rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
+        rd->rd_nfrag = 1;
+        kibnal_rf_set(&rd->rd_frags[0], tx->tx_md.md_addr, nob);
+        
+ out:
+        PORTAL_FREE(phys, phys_size);
         return (rc);
 }
+#endif
 
-static kib_conn_t *
+kib_conn_t *
 kibnal_find_conn_locked (kib_peer_t *peer)
 {
         struct list_head *tmp;
@@ -876,108 +857,162 @@ kibnal_find_conn_locked (kib_peer_t *peer)
 void
 kibnal_check_sends (kib_conn_t *conn)
 {
-        unsigned long   flags;
         kib_tx_t       *tx;
+        vv_return_t     vvrc;                        
         int             rc;
         int             i;
         int             done;
-        int             nwork;
 
-        ENTRY;
-
-        spin_lock_irqsave (&conn->ibc_lock, flags);
+        /* Don't send anything until after the connection is established */
+        if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
+                CDEBUG(D_NET, LPX64"too soon\n", conn->ibc_peer->ibp_nid);
+                return;
+        }
+        
+        spin_lock(&conn->ibc_lock);
 
         LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
 
         if (list_empty(&conn->ibc_tx_queue) &&
             conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
-                spin_unlock_irqrestore(&conn->ibc_lock, flags);
+                spin_unlock(&conn->ibc_lock);
                 
                 tx = kibnal_get_idle_tx(0);     /* don't block */
                 if (tx != NULL)
                         kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
 
-                spin_lock_irqsave(&conn->ibc_lock, flags);
+                spin_lock(&conn->ibc_lock);
                 
-                if (tx != NULL) {
-                        atomic_inc(&conn->ibc_refcount);
+                if (tx != NULL)
                         kibnal_queue_tx_locked(tx, conn);
-                }
         }
 
         while (!list_empty (&conn->ibc_tx_queue)) {
                 tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
 
                 /* We rely on this for QP sizing */
-                LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= IBNAL_TX_MAX_SG);
+                LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS);
 
                 LASSERT (conn->ibc_outstanding_credits >= 0);
                 LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
                 LASSERT (conn->ibc_credits >= 0);
                 LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
 
-                /* Not on ibc_rdma_queue */
-                LASSERT (!tx->tx_passive_rdma_wait);
-
-                if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
-                        GOTO(out, 0);
-
-                if (conn->ibc_credits == 0)     /* no credits */
-                        GOTO(out, 1);
+                if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) {
+                        CDEBUG(D_NET, LPX64": posted enough\n",
+                               conn->ibc_peer->ibp_nid);
+                        break;
+                }
+                
+                if (conn->ibc_credits == 0) {   /* no credits */
+                        CDEBUG(D_NET, LPX64": no credits\n",
+                               conn->ibc_peer->ibp_nid);
+                        break;
+                }
                 
                 if (conn->ibc_credits == 1 &&   /* last credit reserved for */
-                    conn->ibc_outstanding_credits == 0) /* giving back credits */
-                        GOTO(out, 2);
-
+                    conn->ibc_outstanding_credits == 0) { /* giving back credits */
+                        CDEBUG(D_NET, LPX64": not using last credit\n",
+                               conn->ibc_peer->ibp_nid);
+                        break;
+                }
+                
                 list_del (&tx->tx_list);
 
                 if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
                     (!list_empty(&conn->ibc_tx_queue) ||
                      conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
                         /* redundant NOOP */
-                        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+                        spin_unlock(&conn->ibc_lock);
                         kibnal_tx_done(tx);
-                        spin_lock_irqsave(&conn->ibc_lock, flags);
+                        spin_lock(&conn->ibc_lock);
+                        CDEBUG(D_NET, LPX64": redundant noop\n",
+                               conn->ibc_peer->ibp_nid);
                         continue;
                 }
 
-                tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits;
-                conn->ibc_outstanding_credits = 0;
+                kibnal_pack_msg(tx->tx_msg, conn->ibc_outstanding_credits,
+                                conn->ibc_peer->ibp_nid, conn->ibc_incarnation);
 
+                conn->ibc_outstanding_credits = 0;
                 conn->ibc_nsends_posted++;
                 conn->ibc_credits--;
 
-                /* we only get a tx completion for the final rdma op */ 
-                tx->tx_sending = 0;
-                tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
+                /* CAVEAT EMPTOR!  This tx could be the PUT_DONE of an RDMA
+                 * PUT.  If so, it was first queued here as a PUT_REQ, sent and
+                 * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
+                 * and then re-queued here.  It's (just) possible that
+                 * tx_sending is non-zero if we've not done the tx_complete() from
+                 * the first send; hence the ++ rather than = below. */
+                tx->tx_sending++;
+
                 list_add (&tx->tx_list, &conn->ibc_active_txs);
-#if IBNAL_CKSUM
-                tx->tx_msg->ibm_cksum = 0;
-                tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob);
-                CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob);
-#endif
-                /* NB the gap between removing tx from the queue and sending it
-                 * allows message re-ordering to occur */
 
-                LASSERT (tx->tx_nsp > 0);
+                /* Keep holding ibc_lock while posting sends on this
+                 * connection; vv_post_send() isn't re-entrant on the same
+                 * QP!! */
+
+                LASSERT (tx->tx_nwrq > 0);
 
                 rc = -ECONNABORTED;
-                nwork = 0;
+                vvrc = vv_return_ok;
                 if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
-                        vv_return_t retval;                        
-
                         tx->tx_status = 0;
-                        rc = 0;
-
-                        retval = vv_post_send_list(kibnal_data.kib_hca, conn->ibc_qp, tx->tx_nsp, tx->tx_wrq, vv_operation_type_send_rc);
-
-                        if (retval != 0) {
-                                CERROR("post send failed with %d\n", retval);
-                                rc = -ECONNABORTED;
-                                break;
+#if 1
+                        vvrc = vv_post_send_list(kibnal_data.kib_hca,
+                                                 conn->ibc_qp,
+                                                 tx->tx_nwrq,
+                                                 tx->tx_wrq,
+                                                 vv_operation_type_send_rc);
+                        rc = (vvrc == vv_return_ok) ? 0 : -EIO;
+#else
+                        /* Only post 1 item at a time for now (so we know
+                         * exactly how many got posted successfully) */
+                        for (i = 0; i < tx->tx_nwrq; i++) {
+                                switch (tx->tx_wrq[i].wr_type) {
+                                case vv_wr_send:
+                                        CDEBUG(D_NET, "[%d]posting send [%d %x %p]%s: %x\n", 
+                                               i,
+                                               tx->tx_wrq[i].scatgat_list->length,
+                                               tx->tx_wrq[i].scatgat_list->l_key,
+                                               tx->tx_wrq[i].scatgat_list->v_address,
+                                               tx->tx_wrq[i].type.send.send_qp_type.rc_type.fance_indicator ?
+                                               "(fence)":"",
+                                               tx->tx_msg->ibm_type);
+                                        break;
+                                case vv_wr_rdma_write:
+                                        CDEBUG(D_NET, "[%d]posting PUT  [%d %x %p]->[%x "LPX64"]\n", 
+                                               i,
+                                               tx->tx_wrq[i].scatgat_list->length,
+                                               tx->tx_wrq[i].scatgat_list->l_key,
+                                               tx->tx_wrq[i].scatgat_list->v_address,
+                                               tx->tx_wrq[i].type.send.send_qp_type.rc_type.r_r_key,
+                                               tx->tx_wrq[i].type.send.send_qp_type.rc_type.r_addr);
+                                        break;
+                                case vv_wr_rdma_read:
+                                        CDEBUG(D_NET, "[%d]posting GET  [%d %x %p]->[%x "LPX64"]\n", 
+                                               i,
+                                               tx->tx_wrq[i].scatgat_list->length,
+                                               tx->tx_wrq[i].scatgat_list->l_key,
+                                               tx->tx_wrq[i].scatgat_list->v_address,
+                                               tx->tx_wrq[i].type.send.send_qp_type.rc_type.r_r_key,
+                                               tx->tx_wrq[i].type.send.send_qp_type.rc_type.r_addr);
+                                        break;
+                                default:
+                                        LBUG();
+                                }
+                                vvrc = vv_post_send(kibnal_data.kib_hca,
+                                                    conn->ibc_qp, 
+                                                    &tx->tx_wrq[i], 
+                                                    vv_operation_type_send_rc);
+                                CDEBUG(D_NET, LPX64": post %d/%d\n",
+                                       conn->ibc_peer->ibp_nid, i, tx->tx_nwrq);
+                                if (vvrc != vv_return_ok) {
+                                        rc = -EIO;
+                                        break;
+                                }
                         }
-                        
-                        tx->tx_sending = tx->tx_nsp;
+#endif
                 }
 
                 if (rc != 0) {
@@ -988,18 +1023,18 @@ kibnal_check_sends (kib_conn_t *conn)
                         conn->ibc_nsends_posted--;
 
                         tx->tx_status = rc;
-                        tx->tx_passive_rdma_wait = 0;
-
-                        /* TODO: I think this is buggy if vv_post_send_list failed. */
+                        tx->tx_waiting = 0;
+                        tx->tx_sending--;
+                        
                         done = (tx->tx_sending == 0);
                         if (done)
                                 list_del (&tx->tx_list);
                         
-                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        spin_unlock(&conn->ibc_lock);
                         
                         if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
                                 CERROR ("Error %d posting transmit to "LPX64"\n", 
-                                        rc, conn->ibc_peer->ibp_nid);
+                                        vvrc, conn->ibc_peer->ibp_nid);
                         else
                                 CDEBUG (D_NET, "Error %d posting transmit to "
                                         LPX64"\n", rc, conn->ibc_peer->ibp_nid);
@@ -1010,179 +1045,225 @@ kibnal_check_sends (kib_conn_t *conn)
                                 kibnal_tx_done (tx);
                         return;
                 }
-                
         }
 
-        EXIT;
-out:
-        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+        spin_unlock(&conn->ibc_lock);
 }
 
-static void
-kibnal_tx_callback (vv_wc_t *wc)
+void
+kibnal_tx_complete (kib_tx_t *tx, int final_send, vv_comp_status_t vvrc)
 {
-        kib_tx_t     *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->wr_id);
-        kib_conn_t   *conn;
-        unsigned long flags;
+        /* One of tx's work items completed with 'vvrc'; finish or fail the tx */
+        kib_conn_t   *conn = tx->tx_conn;
+        int           failed = (vvrc != vv_comp_status_success);
         int           idle;
 
-        conn = tx->tx_conn;
-        LASSERT (conn != NULL);
+        CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx,
+               tx->tx_sending, tx->tx_nwrq, vvrc);
         LASSERT (tx->tx_sending != 0);
 
-        CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx,
-               tx->tx_sending, tx->tx_nsp, wc->completion_status);
+        if (failed &&
+            conn->ibc_state == IBNAL_CONN_ESTABLISHED)
+                CERROR ("Tx completion to "LPX64" failed: %d\n", 
+                        conn->ibc_peer->ibp_nid, vvrc);
+
+        /* I should only get RDMA notifications of errors */
+        LASSERT (final_send || failed);
 
-        spin_lock_irqsave(&conn->ibc_lock, flags);
+        spin_lock(&conn->ibc_lock);
 
         /* I could be racing with rdma completion.  Whoever makes 'tx' idle
-         * gets to free it, which also drops its ref on 'conn'.  If it's
-         * not me, then I take an extra ref on conn so it can't disappear
-         * under me. */
+         * gets to free it, which also drops its ref on 'conn'. */
 
-        tx->tx_sending--;
+        if (final_send)                         /* this is the last work item */
+                tx->tx_sending--;
+
+        if (failed) {
+                tx->tx_waiting = 0;             /* give up waiting for peer */
+                tx->tx_status = -EIO;
+        }
+        
         idle = (tx->tx_sending == 0) &&         /* This is the final callback */
-                (!tx->tx_passive_rdma_wait);     /* Not waiting for RDMA completion */
+               !tx->tx_waiting;                 /* Not waiting for peer */
         if (idle)
                 list_del(&tx->tx_list);
 
-        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-               atomic_read (&conn->ibc_refcount));
-        atomic_inc (&conn->ibc_refcount);
+        kibnal_conn_addref(conn);               /* 1 ref for me.... */
 
         if (tx->tx_sending == 0)
                 conn->ibc_nsends_posted--;
 
-        if (wc->completion_status != vv_comp_status_success &&
-            tx->tx_status == 0)
-                tx->tx_status = -ECONNABORTED;
-
-        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+        spin_unlock(&conn->ibc_lock);
 
         if (idle)
                 kibnal_tx_done (tx);
 
-        if (wc->completion_status != vv_comp_status_success) {
-                CERROR ("Tx completion to "LPX64" failed: %d\n", 
-                        conn->ibc_peer->ibp_nid, wc->completion_status);
-                kibnal_close_conn (conn, -ENETDOWN);
-        } else {
-                /* can I shovel some more sends out the door? */
+        if (failed)
+                kibnal_close_conn (conn, -EIO);
+        else                                    /* success: post more sends */
                 kibnal_check_sends(conn);
-        }
 
-        kibnal_put_conn (conn);
+        kibnal_conn_decref(conn);               /* ...until here */
 }
 
-void 
-kibnal_ca_async_callback(vv_event_record_t ev)
+void
+kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
 {
-        /* XXX flesh out.  this seems largely for async errors */
-        CERROR("type: %d, port: %d, data: "LPX64"\n", ev.event_type, ev.port_num, ev.type.data);
+        /* Make tx's next work item the SEND of tx->tx_msg ('type', body_nob) */
+        int           nob = offsetof (kib_msg_t, ibm_u) + body_nob;
+        vv_scatgat_t *sge = &tx->tx_gl[tx->tx_nwrq];
+        vv_wr_t      *wr = &tx->tx_wrq[tx->tx_nwrq];
+
+        LASSERT (tx->tx_nwrq >= 0 && 
+                 tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS));
+        LASSERT (nob <= IBNAL_MSG_SIZE);
+
+        kibnal_init_msg(tx->tx_msg, type, body_nob);
+
+        /* one gather entry covering the message header and its body */
+        memset(sge, 0, sizeof(*sge));
+        sge->v_address = (void *)((unsigned long)KIBNAL_TX_VADDR(tx));
+        sge->l_key     = KIBNAL_TX_LKEY(tx);
+        sge->length    = nob;
+
+        memset(wr, 0, sizeof(*wr));
+        wr->wr_type = vv_wr_send;
+        wr->wr_id = (unsigned long)tx;
+        wr->scatgat_list = sge;
+        wr->num_of_data_segments = 1;
+        wr->completion_notification = 1;
+        wr->type.send.solicited_event = 1;
+        wr->type.send.immidiate_data_indicator = 0;
+        wr->type.send.send_qp_type.rc_type.fance_indicator = 0;
+
+        tx->tx_nwrq++;
 }
 
-void
-kibnal_ca_callback (unsigned long unused_context)
+int
+kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
+                  kib_rdma_desc_t *dstrd, __u64 dstcookie)
 {
-        vv_wc_t wc;
-        int armed = 0;
-        vv_return_t retval;
+        /* CAVEAT EMPTOR: 'consumes' the frags in both 'dstrd' and tx->tx_rd */
+        int              resid = nob;           /* bytes not yet in a work item */
+        kib_msg_t       *ibmsg = tx->tx_msg;
+        kib_rdma_desc_t *srcrd = tx->tx_rd;
+        kib_rdma_frag_t *srcfrag;
+        int              srcidx;
+        kib_rdma_frag_t *dstfrag;
+        int              dstidx;
+        vv_scatgat_t    *gl;
+        vv_wr_t         *wrq;
+        int              wrknob;                /* bytes moved by this work item */
+        int              rc;
 
-        for(;;) {
+        /* Called by scheduler */
+        LASSERT (!in_interrupt());
 
-                while (vv_poll_for_completion(kibnal_data.kib_hca, kibnal_data.kib_cq, &wc) == vv_return_ok) {
+        LASSERT (type == IBNAL_MSG_GET_DONE ||
+                 type == IBNAL_MSG_PUT_DONE);
 
-                        /* We will need to rearm the CQ to avoid a potential race. */
-                        armed = 0;
+        srcidx = dstidx = 0;
+        srcfrag = &srcrd->rd_frags[0];
+        dstfrag = &dstrd->rd_frags[0];
+        rc = resid;                             /* success returns 'nob' */
 
-                        if (kibnal_wreqid_is_rx(wc.wr_id))
-                                kibnal_rx_callback(&wc);
-                        else
-                                kibnal_tx_callback(&wc);
+        while (resid > 0) {
+                if (srcidx >= srcrd->rd_nfrag) {
+                        CERROR("Src buffer exhausted: %d frags\n", srcidx);
+                        rc = -EPROTO;
+                        break;
                 }
-
-                if (armed)
-                        return;
                 
-                retval = vv_request_completion_notification(kibnal_data.kib_hca, kibnal_data.kib_cq, vv_next_solicit_unsolicit_event);
-                if (retval != 0) {
-                        CERROR ("Failed to re-arm completion queue: %d\n", retval);
-                        return;
+                if (dstidx == dstrd->rd_nfrag) {
+                        CERROR("Dst buffer exhausted: %d frags\n", dstidx);
+                        rc = -EPROTO;
+                        break;
                 }
 
-                armed = 1;
-        }
-}
-
-void
-kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
-{
-        vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nsp];
-        vv_wr_t      *wrq = &tx->tx_wrq[tx->tx_nsp];
-        int           fence;
-        int           nob = offsetof (kib_msg_t, ibm_u) + body_nob;
-
-        LASSERT (tx->tx_nsp >= 0 && 
-                 tx->tx_nsp < sizeof(tx->tx_wrq)/sizeof(tx->tx_wrq[0]));
-        LASSERT (nob <= IBNAL_MSG_SIZE);
-        
-        tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC;
-        tx->tx_msg->ibm_version = IBNAL_MSG_VERSION;
-        tx->tx_msg->ibm_type = type;
-#if IBNAL_CKSUM
-        tx->tx_msg->ibm_nob = nob;
-#endif
-        /* Fence the message if it's bundled with an RDMA read */
-        fence = (tx->tx_nsp > 0) &&
-                (type == IBNAL_MSG_PUT_DONE);
+                if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) {
+                        CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n",
+                               srcidx, srcrd->rd_nfrag,
+                               dstidx, dstrd->rd_nfrag);
+                        rc = -EMSGSIZE;
+                        break;
+                }
 
-        *gl = (vv_scatgat_t) {
-                .v_address = (void *)tx->tx_msg,
-                .length    = nob,
-                .l_key     = tx->l_key,
-        };
+                wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid);
 
-        wrq->wr_id =  kibnal_ptr2wreqid(tx, 0);
-        wrq->completion_notification = 1;
-        wrq->scatgat_list = gl;
-        wrq->num_of_data_segments = 1;
-        wrq->wr_type = vv_wr_send;
+                gl = &tx->tx_gl[tx->tx_nwrq];
+                gl->v_address = (void *)((unsigned long)kibnal_rf_addr(srcfrag));
+                gl->length    = wrknob;
+                gl->l_key     = srcrd->rd_key;
 
-        wrq->type.send.solicited_event = 1;
+                wrq = &tx->tx_wrq[tx->tx_nwrq];
+                wrq->wr_id = (unsigned long)tx;
+                /* RDMA frags do not request completion notification; only
+                 * the completion message set up after the loop (via
+                 * kibnal_init_tx_msg) asks for one */
+                wrq->completion_notification = 0;
+                wrq->scatgat_list = gl;
+                wrq->num_of_data_segments = 1;
+                wrq->wr_type = vv_wr_rdma_write;
+                wrq->type.send.solicited_event = 0;
+                wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
+                wrq->type.send.send_qp_type.rc_type.r_addr = kibnal_rf_addr(dstfrag);
+                wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;
+
+                resid -= wrknob;
+                if (wrknob < srcfrag->rf_nob) { /* src frag partly used: trim it */
+                        kibnal_rf_set(srcfrag, 
+                                      kibnal_rf_addr(srcfrag) + wrknob,
+                                      srcfrag->rf_nob - wrknob);
+                } else {                        /* src frag fully used */
+                        srcfrag++;
+                        srcidx++;
+                }
+                
+                if (wrknob < dstfrag->rf_nob) { /* dst frag partly used: trim it */
+                        kibnal_rf_set(dstfrag,
+                                      kibnal_rf_addr(dstfrag) + wrknob,
+                                      dstfrag->rf_nob - wrknob);
+                } else {                        /* dst frag fully used */
+                        dstfrag++;
+                        dstidx++;
+                }
+                
+                tx->tx_nwrq++;
+        }
 
-        wrq->type.send.send_qp_type.rc_type.fance_indicator = fence;
+        if (rc < 0)                             /* no RDMA if completing with failure */
+                tx->tx_nwrq = 0;
+        
+        ibmsg->ibm_u.completion.ibcm_status = rc;
+        ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
+        kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
 
-        tx->tx_nsp++;
+        return rc;                              /* 'nob', or -ve errno on failure */
 }
 
-static void
+void
 kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
 {
-        unsigned long         flags;
-
-        spin_lock_irqsave(&conn->ibc_lock, flags);
-
+        spin_lock(&conn->ibc_lock);     /* held only around the _locked helper */
         kibnal_queue_tx_locked (tx, conn);
-        
-        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+        spin_unlock(&conn->ibc_lock);   /* kibnal_check_sends() unlocks ibc_lock itself */
         
         kibnal_check_sends(conn);
 }
 
-static void
+void
 kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
 {
-        unsigned long    flags;
         kib_peer_t      *peer;
         kib_conn_t      *conn;
+        unsigned long    flags;
         rwlock_t        *g_lock = &kibnal_data.kib_global_lock;
 
         /* If I get here, I've committed to send, so I complete the tx with
          * failure on any problems */
         
         LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
-        LASSERT (tx->tx_nsp > 0);               /* work items have been set up */
+        LASSERT (tx->tx_nwrq > 0);              /* work items have been set up */
 
         read_lock_irqsave(g_lock, flags);
         
@@ -1196,13 +1277,11 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
 
         conn = kibnal_find_conn_locked (peer);
         if (conn != NULL) {
-                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                       atomic_read (&conn->ibc_refcount));
-                atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
+                kibnal_conn_addref(conn);       /* 1 ref for me... */
                 read_unlock_irqrestore(g_lock, flags);
                 
                 kibnal_queue_tx (tx, conn);
+                kibnal_conn_decref(conn);       /* ...to here */
                 return;
         }
         
@@ -1212,7 +1291,7 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
 
         peer = kibnal_find_peer_locked (nid);
         if (peer == NULL) {
-                write_unlock_irqrestore (g_lock, flags);
+                write_unlock_irqrestore(g_lock, flags);
                 tx->tx_status = -EHOSTUNREACH;
                 kibnal_tx_done (tx);
                 return;
@@ -1221,328 +1300,84 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
         conn = kibnal_find_conn_locked (peer);
         if (conn != NULL) {
                 /* Connection exists; queue message on it */
-                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                       atomic_read (&conn->ibc_refcount));
-                atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
-                write_unlock_irqrestore (g_lock, flags);
+                kibnal_conn_addref(conn);       /* 1 ref for me... */
+                write_unlock_irqrestore(g_lock, flags);
                 
                 kibnal_queue_tx (tx, conn);
+                kibnal_conn_decref(conn);       /* ...until here */
                 return;
         }
 
         if (peer->ibp_connecting == 0) {
                 if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
-                        write_unlock_irqrestore (g_lock, flags);
+                        write_unlock_irqrestore(g_lock, flags);
                         tx->tx_status = -EHOSTUNREACH;
                         kibnal_tx_done (tx);
                         return;
                 }
         
                 peer->ibp_connecting = 1;
-
-                kib_peer_addref(peer); /* extra ref for connd */
+                kibnal_peer_addref(peer); /* extra ref for connd */
         
-                spin_lock (&kibnal_data.kib_connd_lock);
+                spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
         
                 list_add_tail (&peer->ibp_connd_list,
                                &kibnal_data.kib_connd_peers);
                 wake_up (&kibnal_data.kib_connd_waitq);
         
-                spin_unlock (&kibnal_data.kib_connd_lock);
+                spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
         }
         
         /* A connection is being established; queue the message... */
         list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
 
-        write_unlock_irqrestore (g_lock, flags);
+        write_unlock_irqrestore(g_lock, flags);
 }
 
-static ptl_err_t
-kibnal_start_passive_rdma (int type, ptl_nid_t nid,
-                            lib_msg_t *libmsg, ptl_hdr_t *hdr)
+int
+kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* Report the distance to 'nid' in '*dist': 0 when 'nid' is this
+         * NAL's own NID, 1 for every other NID.  Always returns 0.
+         *
+         * NB I would guess that a NID with kibnal_get_peer (nid) == NULL,
+         * when we're not routing, is really very distant; that case is not
+         * distinguished here. */
+        int self = (nal->libnal_ni.ni_pid.nid == nid);
+        *dist = self ? 0 : 1;
+        return 0;
+}
+
+ptl_err_t
+kibnal_sendmsg(lib_nal_t    *nal, 
+               void         *private,
+               lib_msg_t    *libmsg,
+               ptl_hdr_t    *hdr, 
+               int           type, 
+               ptl_nid_t     nid, 
+               ptl_pid_t     pid,
+               unsigned int  payload_niov, 
+               struct iovec *payload_iov, 
+               ptl_kiov_t   *payload_kiov,
+               int           payload_offset,
+               int           payload_nob)
 {
-        int         nob = libmsg->md->length;
-        kib_tx_t   *tx;
         kib_msg_t  *ibmsg;
+        kib_tx_t   *tx;
+        int         nob;
         int         rc;
-        vv_access_con_bit_mask_t access;
-        
-        LASSERT (type == IBNAL_MSG_PUT_RDMA || type == IBNAL_MSG_GET_RDMA);
-        LASSERT (nob > 0);
-        LASSERT (!in_interrupt());              /* Mapping could block */
+        int         n;
 
-        access = vv_acc_l_mem_write | vv_acc_r_mem_write | vv_acc_r_mem_read | vv_acc_mem_bind;
+        /* NB 'private' is different depending on what we're sending.... */
 
-        tx = kibnal_get_idle_tx (1);           /* May block; caller is an app thread */
-        LASSERT (tx != NULL);
+        CDEBUG(D_NET, "sending %d bytes in %d frags to nid:"LPX64
+               " pid %d\n", payload_nob, payload_niov, nid , pid);
 
-        if ((libmsg->md->options & PTL_MD_KIOV) == 0) 
-                rc = kibnal_map_iov (tx, access,
-                                     libmsg->md->md_niov,
-                                     libmsg->md->md_iov.iov,
-                                     0, nob, 0);
-        else
-                rc = kibnal_map_kiov (tx, access,
-                                      libmsg->md->md_niov, 
-                                      libmsg->md->md_iov.kiov,
-                                      0, nob, 0);
+        LASSERT (payload_nob == 0 || payload_niov > 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
 
-        if (rc != 0) {
-                CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc);
-                goto failed;
-        }
-        
-        if (type == IBNAL_MSG_GET_RDMA) {
-                /* reply gets finalized when tx completes */
-                tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, 
-                                                        nid, libmsg);
-                if (tx->tx_libmsg[1] == NULL) {
-                        CERROR ("Can't create reply for GET -> "LPX64"\n",
-                                nid);
-                        rc = -ENOMEM;
-                        goto failed;
-                }
-        }
-        
-        tx->tx_passive_rdma = 1;
-
-        ibmsg = tx->tx_msg;
-
-        ibmsg->ibm_u.rdma.ibrm_hdr = *hdr;
-        ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
-        /* map_kiov alrady filled the rdma descs for the whole_mem case */
-        if (!kibnal_whole_mem()) {
-                ibmsg->ibm_u.rdma.rd_key = tx->tx_md.md_rkey;
-                ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
-                ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
-                ibmsg->ibm_u.rdma.ibrm_num_descs = 1;
-        }
-
-        kibnal_init_tx_msg (tx, type, 
-                            kib_rdma_msg_len(ibmsg->ibm_u.rdma.ibrm_num_descs));
-
-        CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
-               LPX64", nob %d\n",
-               tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey,
-               tx->tx_md.md_addr, nob);
-        
-        /* libmsg gets finalized when tx completes. */
-        tx->tx_libmsg[0] = libmsg;
-
-        kibnal_launch_tx(tx, nid);
-        return (PTL_OK);
-
- failed:
-        tx->tx_status = rc;
-        kibnal_tx_done (tx);
-        return (PTL_FAIL);
-}
-
-void
-kibnal_start_active_rdma (int type, int status,
-                           kib_rx_t *rx, lib_msg_t *libmsg, 
-                           unsigned int niov,
-                           struct iovec *iov, ptl_kiov_t *kiov,
-                           size_t offset, size_t nob)
-{
-        kib_msg_t    *rxmsg = rx->rx_msg;
-        kib_msg_t    *txmsg;
-        kib_tx_t     *tx;
-        vv_access_con_bit_mask_t access;
-        vv_wr_operation_t rdma_op;
-        int           rc;
-        __u32         i;
-
-        CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n",
-               type, status, niov, offset, nob);
-
-        /* Called by scheduler */
-        LASSERT (!in_interrupt ());
-
-        /* Either all pages or all vaddrs */
-        LASSERT (!(kiov != NULL && iov != NULL));
-
-        /* No data if we're completing with failure */
-        LASSERT (status == 0 || nob == 0);
-
-        LASSERT (type == IBNAL_MSG_GET_DONE ||
-                 type == IBNAL_MSG_PUT_DONE);
-
-        /* Flag I'm completing the RDMA.  Even if I fail to send the
-         * completion message, I will have tried my best so further
-         * attempts shouldn't be tried. */
-        LASSERT (!rx->rx_rdma);
-        rx->rx_rdma = 1;
-
-        if (type == IBNAL_MSG_GET_DONE) {
-                access = 0;
-                rdma_op  = vv_wr_rdma_write;
-                LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);
-        } else {
-                access = vv_acc_l_mem_write;
-                rdma_op  = vv_wr_rdma_read;
-                LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
-        }
-
-        tx = kibnal_get_idle_tx (0);           /* Mustn't block */
-        if (tx == NULL) {
-                CERROR ("tx descs exhausted on RDMA from "LPX64
-                        " completing locally with failure\n",
-                        rx->rx_conn->ibc_peer->ibp_nid);
-                lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE);
-                return;
-        }
-        LASSERT (tx->tx_nsp == 0);
-
-        if (nob == 0) 
-                GOTO(init_tx, 0);
-
-        /* We actually need to transfer some data (the transfer
-         * size could get truncated to zero when the incoming
-         * message is matched) */
-        if (kiov != NULL)
-                rc = kibnal_map_kiov (tx, access, niov, kiov, offset, nob, 1);
-        else
-                rc = kibnal_map_iov (tx, access, niov, iov, offset, nob, 1);
-        
-        if (rc != 0) {
-                CERROR ("Can't map RDMA -> "LPX64": %d\n", 
-                        rx->rx_conn->ibc_peer->ibp_nid, rc);
-                /* We'll skip the RDMA and complete with failure. */
-                status = rc;
-                nob = 0;
-                GOTO(init_tx, rc);
-        } 
-
-        if (!kibnal_whole_mem()) {
-                tx->tx_msg->ibm_u.rdma.rd_key = tx->tx_md.md_lkey;
-                tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
-                tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
-                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1;
-        }
-
-        /* XXX ugh.  different page-sized hosts. */ 
-        if (tx->tx_msg->ibm_u.rdma.ibrm_num_descs !=
-            rxmsg->ibm_u.rdma.ibrm_num_descs) {
-                CERROR("tx descs (%u) != rx descs (%u)\n", 
-                       tx->tx_msg->ibm_u.rdma.ibrm_num_descs,
-                       rxmsg->ibm_u.rdma.ibrm_num_descs);
-                /* We'll skip the RDMA and complete with failure. */
-                status = rc;
-                nob = 0;
-                GOTO(init_tx, rc);
-        }
-
-        /* map_kiov filled in the rdma descs which describe our side of the
-         * rdma transfer. */
-        /* ibrm_num_descs was verified in rx_callback */
-        for(i = 0; i < rxmsg->ibm_u.rdma.ibrm_num_descs; i++) {
-                kib_rdma_desc_t *ldesc, *rdesc; /* local, remote */
-                vv_scatgat_t *ds = &tx->tx_gl[i];
-                vv_wr_t *wrq = &tx->tx_wrq[i];
-
-                ldesc = &tx->tx_msg->ibm_u.rdma.ibrm_desc[i];
-                rdesc = &rxmsg->ibm_u.rdma.ibrm_desc[i];
-
-                ds->v_address = (void *)(unsigned long)ldesc->rd_addr;
-                ds->length    = ldesc->rd_nob;
-                ds->l_key     = tx->tx_msg->ibm_u.rdma.rd_key;
-
-                wrq->wr_id = kibnal_ptr2wreqid(tx, 0);
-
-#if 0
-                /* only the last rdma post triggers tx completion */
-                if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1)
-                        wrq->completion_notification = 1;
-                else
-                        wrq->completion_notification = 0;
-
-#else
-                /* TODO: hack. Right now complete everything, else the
-                 * driver will deadlock. This is less efficient than
-                 * requestion a notification for only a few of the
-                 * WQE. */
-                wrq->completion_notification = 1;
-#endif
-
-                wrq->scatgat_list = ds;
-                wrq->num_of_data_segments = 1;
-                wrq->wr_type = rdma_op;
-
-                wrq->type.send.solicited_event = 0;
-
-                wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
-                wrq->type.send.send_qp_type.rc_type.r_addr = rdesc->rd_addr;
-                wrq->type.send.send_qp_type.rc_type.r_r_key = rxmsg->ibm_u.rdma.rd_key;
-
-                CDEBUG(D_NET, "prepared RDMA with r_addr=%llx r_key=%x\n",
-                       wrq->type.send.send_qp_type.rc_type.r_addr,
-                       wrq->type.send.send_qp_type.rc_type.r_r_key);
-
-                tx->tx_nsp++;
-        }
-
-init_tx:
-        txmsg = tx->tx_msg;
-
-        txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
-        txmsg->ibm_u.completion.ibcm_status = status;
-        
-        kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
-
-        if (status == 0 && nob != 0) {
-                LASSERT (tx->tx_nsp > 1);
-                /* RDMA: libmsg gets finalized when the tx completes.  This
-                 * is after the completion message has been sent, which in
-                 * turn is after the RDMA has finished. */
-                tx->tx_libmsg[0] = libmsg;
-        } else {
-                LASSERT (tx->tx_nsp == 1);
-                /* No RDMA: local completion happens now! */
-                CDEBUG(D_WARNING,"No data: immediate completion\n");
-                lib_finalize (&kibnal_lib, NULL, libmsg,
-                              status == 0 ? PTL_OK : PTL_FAIL);
-        }
-
-        /* +1 ref for this tx... */
-        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-               rx->rx_conn, rx->rx_conn->ibc_state, 
-               rx->rx_conn->ibc_peer->ibp_nid,
-               atomic_read (&rx->rx_conn->ibc_refcount));
-        atomic_inc (&rx->rx_conn->ibc_refcount);
-        /* ...and queue it up */
-        kibnal_queue_tx(tx, rx->rx_conn);
-}
-
-static ptl_err_t
-kibnal_sendmsg(lib_nal_t    *nal, 
-                void         *private,
-                lib_msg_t    *libmsg,
-                ptl_hdr_t    *hdr, 
-                int           type, 
-                ptl_nid_t     nid, 
-                ptl_pid_t     pid,
-                unsigned int  payload_niov, 
-                struct iovec *payload_iov, 
-                ptl_kiov_t   *payload_kiov,
-                size_t        payload_offset,
-                size_t        payload_nob)
-{
-        kib_msg_t  *ibmsg;
-        kib_tx_t   *tx;
-        int         nob;
-
-        /* NB 'private' is different depending on what we're sending.... */
-
-        CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64
-               " pid %d\n", payload_nob, payload_niov, nid , pid);
-
-        LASSERT (payload_nob == 0 || payload_niov > 0);
-        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
-
-        /* Thread context if we're sending payload */
-        LASSERT (!in_interrupt() || payload_niov == 0);
+        /* Thread context */
+        LASSERT (!in_interrupt());
         /* payload is either all vaddrs or all pages */
         LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
 
@@ -1555,61 +1390,166 @@ kibnal_sendmsg(lib_nal_t    *nal,
                 /* reply's 'private' is the incoming receive */
                 kib_rx_t *rx = private;
 
-                /* RDMA reply expected? */
-                if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
-                        kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
-                                                 rx, libmsg, payload_niov, 
-                                                 payload_iov, payload_kiov,
+                LASSERT(rx != NULL);
+
+                if (rx->rx_msg->ibm_type == IBNAL_MSG_IMMEDIATE) {
+                        /* RDMA not expected */
+                        nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+                        if (nob > IBNAL_MSG_SIZE) {
+                                CERROR("REPLY for "LPX64" too big (RDMA not requested):"
+                                       "%d (max for message is %d)\n", 
+                                       nid, payload_nob, IBNAL_MSG_SIZE);
+                                CERROR("Can't REPLY IMMEDIATE %d to "LPX64"\n",
+                                       nob, nid);
+                                return PTL_FAIL;
+                        }
+                        break;
+                }
+
+                /* Incoming message consistent with RDMA? */
+                if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_REQ) {
+                        CERROR("REPLY to "LPX64" bad msg type %x!!!\n",
+                               nid, rx->rx_msg->ibm_type);
+                        return PTL_FAIL;
+                }
+
+                /* NB rx_complete() will send GET_NAK when I return to it from
+                 * here, unless I set rx_responded! */
+
+                tx = kibnal_get_idle_tx(0);
+                if (tx == NULL) {
+                        CERROR("Can't get tx for REPLY to "LPX64"\n", nid);
+                        return PTL_FAIL;
+                }
+
+                if (payload_nob == 0)
+                        rc = 0;
+                else if (payload_kiov == NULL)
+                        rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0, 
+                                                 payload_niov, payload_iov, 
                                                  payload_offset, payload_nob);
-                        return (PTL_OK);
+                else
+                        rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
+                                                  payload_niov, payload_kiov,
+                                                  payload_offset, payload_nob);
+                if (rc != 0) {
+                        CERROR("Can't setup GET src for "LPX64": %d\n", nid, rc);
+                        kibnal_tx_done(tx);
+                        return PTL_FAIL;
                 }
                 
-                /* Incoming message consistent with immediate reply? */
-                if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
-                        CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n",
-                                nid, rx->rx_msg->ibm_type);
-                        return (PTL_FAIL);
+                rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, payload_nob,
+                                      &rx->rx_msg->ibm_u.get.ibgm_rd,
+                                      rx->rx_msg->ibm_u.get.ibgm_cookie);
+                if (rc < 0) {
+                        CERROR("Can't setup rdma for GET from "LPX64": %d\n", 
+                               nid, rc);
+                } else if (rc == 0) {
+                        /* No RDMA: local completion may happen now! */
+                        lib_finalize (&kibnal_lib, NULL, libmsg, PTL_OK);
+                } else {
+                        /* RDMA: lib_finalize(libmsg) when it completes */
+                        tx->tx_libmsg[0] = libmsg;
                 }
 
-                /* Will it fit in a message? */
-                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
-                if (nob > IBNAL_MSG_SIZE) {
-                        CERROR("REPLY for "LPX64" too big (RDMA not requested): %d (max for message is %d)\n", 
-                               nid, payload_nob, IBNAL_MSG_SIZE);
-                        return (PTL_FAIL);
-                }
-                break;
+                kibnal_queue_tx(tx, rx->rx_conn);
+                rx->rx_responded = 1;
+                return (rc >= 0) ? PTL_OK : PTL_FAIL;
         }
 
         case PTL_MSG_GET:
-                /* might the REPLY message be big enough to need RDMA? */
+                /* will the REPLY message be small enough not to need RDMA? */
                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
-                if (nob > IBNAL_MSG_SIZE)
-                        return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, 
-                                                          nid, libmsg, hdr));
-                break;
+                if (nob <= IBNAL_MSG_SIZE)
+                        break;
+
+                tx = kibnal_get_idle_tx(1);     /* may block; caller is an app thread */
+                LASSERT (tx != NULL);
+
+                ibmsg = tx->tx_msg;
+                ibmsg->ibm_u.get.ibgm_hdr = *hdr;
+                ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
+
+                if ((libmsg->md->options & PTL_MD_KIOV) == 0)
+                        rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd,
+                                                 vv_acc_r_mem_write,
+                                                 libmsg->md->md_niov,
+                                                 libmsg->md->md_iov.iov,
+                                                 0, libmsg->md->length);
+                else
+                        rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd,
+                                                  vv_acc_r_mem_write,
+                                                  libmsg->md->md_niov,
+                                                  libmsg->md->md_iov.kiov,
+                                                  0, libmsg->md->length);
+                if (rc != 0) {
+                        CERROR("Can't setup GET sink for "LPX64": %d\n", nid, rc);
+                        kibnal_tx_done(tx);
+                        return PTL_FAIL;
+                }
+
+                n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
+                nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
+                kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);
+
+                tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, nid, libmsg);
+                if (tx->tx_libmsg[1] == NULL) {
+                        CERROR("Can't create reply for GET -> "LPX64"\n", nid);
+                        kibnal_tx_done(tx);
+                        return PTL_FAIL;
+                }
+
+                tx->tx_libmsg[0] = libmsg;      /* finalise libmsg[0,1] on completion */
+                tx->tx_waiting = 1;             /* waiting for GET_DONE */
+                kibnal_launch_tx(tx, nid);
+                return PTL_OK;
 
         case PTL_MSG_ACK:
                 LASSERT (payload_nob == 0);
                 break;
 
         case PTL_MSG_PUT:
-                /* Is the payload big enough to need RDMA? */
+                /* Is the payload small enough not to need RDMA? */
                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
-                if (nob > IBNAL_MSG_SIZE)
-                        return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
-                                                          nid, libmsg, hdr));
-                
-                break;
+                if (nob <= IBNAL_MSG_SIZE)
+                        break;
+
+                tx = kibnal_get_idle_tx(1);     /* may block: caller is app thread */
+                LASSERT (tx != NULL);
+
+                if (payload_kiov == NULL)
+                        rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
+                                                 payload_niov, payload_iov,
+                                                 payload_offset, payload_nob);
+                else
+                        rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
+                                                  payload_niov, payload_kiov,
+                                                  payload_offset, payload_nob);
+                if (rc != 0) {
+                        CERROR("Can't setup PUT src for "LPX64": %d\n", nid, rc);
+                        kibnal_tx_done(tx);
+                        return PTL_FAIL;
+                }
+
+                ibmsg = tx->tx_msg;
+                ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
+                ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
+                kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
+
+                tx->tx_libmsg[0] = libmsg;      /* finalise libmsg on completion */
+                tx->tx_waiting = 1;             /* waiting for PUT_{ACK,NAK} */
+                kibnal_launch_tx(tx, nid);
+                return PTL_OK;
         }
 
+        LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
+                 <= IBNAL_MSG_SIZE);
+
         tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
-                                  type == PTL_MSG_REPLY ||
-                                  in_interrupt()));
+                                  type == PTL_MSG_REPLY));
         if (tx == NULL) {
-                CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", 
-                        type, nid, in_interrupt() ? " (intr)" : "");
-                return (PTL_NO_SPACE);
+                CERROR ("Can't send %d to "LPX64": tx descs exhausted\n", type, nid);
+                return PTL_NO_SPACE;
         }
 
         ibmsg = tx->tx_msg;
@@ -1626,18 +1566,15 @@ kibnal_sendmsg(lib_nal_t    *nal,
                                          payload_offset, payload_nob);
         }
 
-        kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
-                            offsetof(kib_immediate_msg_t, 
-                                     ibim_payload[payload_nob]));
-
-        /* libmsg gets finalized when tx completes */
-        tx->tx_libmsg[0] = libmsg;
+        nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
+        kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob);
 
+        tx->tx_libmsg[0] = libmsg;              /* finalise libmsg on completion */
         kibnal_launch_tx(tx, nid);
-        return (PTL_OK);
+        return PTL_OK;
 }
 
-static ptl_err_t
+ptl_err_t
 kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
                ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
                unsigned int payload_niov, struct iovec *payload_iov,
@@ -1651,7 +1588,7 @@ kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
                                payload_offset, payload_len));
 }
 
-static ptl_err_t
+ptl_err_t
 kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
                      ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
                      unsigned int payload_niov, ptl_kiov_t *payload_kiov, 
@@ -1663,28 +1600,33 @@ kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie,
                                payload_offset, payload_len));
 }
 
-static ptl_err_t
+ptl_err_t
 kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
                  unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
-                 size_t offset, size_t mlen, size_t rlen)
+                 size_t offset, int mlen, int rlen)
 {
         kib_rx_t    *rx = private;
         kib_msg_t   *rxmsg = rx->rx_msg;
-        int          msg_nob;
+        kib_conn_t  *conn = rx->rx_conn;
+        kib_tx_t    *tx;
+        kib_msg_t   *txmsg;
+        int          nob;
+        int          rc;
+        int          n;
         
         LASSERT (mlen <= rlen);
-        LASSERT (!in_interrupt ());
+        LASSERT (mlen >= 0);
+        LASSERT (!in_interrupt());
         /* Either all pages or all vaddrs */
         LASSERT (!(kiov != NULL && iov != NULL));
 
         switch (rxmsg->ibm_type) {
         default:
                 LBUG();
-                return (PTL_FAIL);
                 
         case IBNAL_MSG_IMMEDIATE:
-                msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
-                if (msg_nob > IBNAL_MSG_SIZE) {
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+                if (nob > IBNAL_MSG_SIZE) {
                         CERROR ("Immediate message from "LPX64" too big: %d\n",
                                 rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
                         return (PTL_FAIL);
@@ -1702,22 +1644,65 @@ kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
                 lib_finalize (nal, NULL, libmsg, PTL_OK);
                 return (PTL_OK);
 
-        case IBNAL_MSG_GET_RDMA:
+        case IBNAL_MSG_PUT_REQ:
+                /* NB rx_complete() will send PUT_NAK when I return to it from
+                 * here, unless I set rx_responded!  */
+
+                if (mlen == 0) { /* No payload to RDMA */
+                        lib_finalize(nal, NULL, libmsg, PTL_OK);
+                        return PTL_OK;
+                }
+
+                tx = kibnal_get_idle_tx(0);
+                if (tx == NULL) {
+                        CERROR("Can't allocate tx for "LPX64"\n",
+                               conn->ibc_peer->ibp_nid);
+                        return PTL_FAIL;
+                }
+
+                txmsg = tx->tx_msg;
+                if (kiov == NULL)
+                        rc = kibnal_setup_rd_iov(tx, 
+                                                 &txmsg->ibm_u.putack.ibpam_rd,
+                                                 vv_acc_r_mem_write,
+                                                 niov, iov, offset, mlen);
+                else
+                        rc = kibnal_setup_rd_kiov(tx,
+                                                  &txmsg->ibm_u.putack.ibpam_rd,
+                                                  vv_acc_r_mem_write,
+                                                  niov, kiov, offset, mlen);
+                if (rc != 0) {
+                        CERROR("Can't setup PUT sink for "LPX64": %d\n",
+                               conn->ibc_peer->ibp_nid, rc);
+                        kibnal_tx_done(tx);
+                        return PTL_FAIL;
+                }
+
+                txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
+                txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
+
+                n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag;
+                nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
+                kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob);
+
+                tx->tx_libmsg[0] = libmsg;      /* finalise libmsg on completion */
+                tx->tx_waiting = 1;             /* waiting for PUT_DONE */
+                kibnal_queue_tx(tx, conn);
+
+                LASSERT (!rx->rx_responded);
+                rx->rx_responded = 1;
+                return PTL_OK;
+
+        case IBNAL_MSG_GET_REQ:
                 /* We get called here just to discard any junk after the
                  * GET hdr. */
                 LASSERT (libmsg == NULL);
                 lib_finalize (nal, NULL, libmsg, PTL_OK);
                 return (PTL_OK);
-
-        case IBNAL_MSG_PUT_RDMA:
-                kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0,
-                                          rx, libmsg, 
-                                          niov, iov, kiov, offset, mlen);
-                return (PTL_OK);
         }
 }
 
-static ptl_err_t
+ptl_err_t
 kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
               unsigned int niov, struct iovec *iov, 
               size_t offset, size_t mlen, size_t rlen)
@@ -1726,7 +1711,7 @@ kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
                                 offset, mlen, rlen));
 }
 
-static ptl_err_t
+ptl_err_t
 kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
                      unsigned int niov, ptl_kiov_t *kiov, 
                      size_t offset, size_t mlen, size_t rlen)
@@ -1735,14 +1720,6 @@ kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
                                 offset, mlen, rlen));
 }
 
-/*****************************************************************************
- * the rest of this file concerns connection management.  active connetions
- * start with connect_peer, passive connections start with passive_callback.
- * active disconnects start with conn_close, cm_callback starts passive
- * disconnects and contains the guts of how the disconnect state machine
- * progresses. 
- *****************************************************************************/
-
 int
 kibnal_thread_start (int (*fn)(void *arg), void *arg)
 {
@@ -1755,43 +1732,36 @@ kibnal_thread_start (int (*fn)(void *arg), void *arg)
         return (0);
 }
 
-static void
+void
 kibnal_thread_fini (void)
 {
         atomic_dec (&kibnal_data.kib_nthreads);
 }
 
-/* this can be called by anyone at any time to close a connection.  if
- * the connection is still established it heads to the connd to start
- * the disconnection in a safe context.  It has no effect if called
- * on a connection that is already disconnecting */
 void
 kibnal_close_conn_locked (kib_conn_t *conn, int error)
 {
-        /* This just does the immmediate housekeeping, and schedules the
-         * connection for the connd to finish off.
+        /* This just does the immediate housekeeping.  'error' is zero for a
+         * normal shutdown which can happen only after the connection has been
+         * established.  If the connection is established, schedule the
+         * connection to be finished off by the connd.  Otherwise the connd is
+         * already dealing with it (either to set it up or tear it down).
          * Caller holds kib_global_lock exclusively in irq context */
         kib_peer_t   *peer = conn->ibc_peer;
 
-        KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_CONNECTING,
-                                    IBNAL_CONN_DISCONNECTED);
+        LASSERT (error != 0 || conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
 
-        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
-                return; /* already disconnecting */
+        if (error != 0 && conn->ibc_comms_error == 0)
+                conn->ibc_comms_error = error;
+
+        if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
+                return; /* already being handled  */
 
         CDEBUG (error == 0 ? D_NET : D_ERROR,
                 "closing conn to "LPX64": error %d\n", peer->ibp_nid, error);
 
-        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
-                /* kib_connd_conns takes ibc_list's ref */
-                list_del (&conn->ibc_list);
-        } else {
-                /* new ref for kib_connd_conns */
-                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                       atomic_read (&conn->ibc_refcount));
-                atomic_inc (&conn->ibc_refcount);
-        }
+        /* kib_connd_conns takes ibc_list's ref */
+        list_del (&conn->ibc_list);
         
         if (list_empty (&peer->ibp_conns) &&
             peer->ibp_persistence == 0) {
@@ -1799,45 +1769,139 @@ kibnal_close_conn_locked (kib_conn_t *conn, int error)
                 kibnal_unlink_peer_locked (peer);
         }
 
-        conn->ibc_state = IBNAL_CONN_SEND_DREQ;
+        kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECT1);
 
-        spin_lock (&kibnal_data.kib_connd_lock);
+        spin_lock(&kibnal_data.kib_connd_lock);
 
         list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
         wake_up (&kibnal_data.kib_connd_waitq);
                 
-        spin_unlock (&kibnal_data.kib_connd_lock);
+        spin_unlock(&kibnal_data.kib_connd_lock);
 }
 
 void
 kibnal_close_conn (kib_conn_t *conn, int error)
 {
-        unsigned long     flags;
-
-        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+        unsigned long flags;
+        
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
 
         kibnal_close_conn_locked (conn, error);
         
-        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+}
+
+void
+kibnal_handle_early_rxs(kib_conn_t *conn)
+{
+        unsigned long    flags;
+        kib_rx_t        *rx;
+        /* Pass every rx queued on conn->ibc_early_rxs to kibnal_handle_rx() */
+        LASSERT (!in_interrupt());
+        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
+        /* kib_global_lock is held only to test the list and unlink an rx */
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+        while (!list_empty(&conn->ibc_early_rxs)) {
+                rx = list_entry(conn->ibc_early_rxs.next,
+                                kib_rx_t, rx_list);
+                list_del(&rx->rx_list);
+                write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+                /* the rx is handled with the lock dropped */
+                kibnal_handle_rx(rx);
+                
+                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+        }
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+}
+
+void
+kibnal_conn_disconnected(kib_conn_t *conn)
+{
+        LIST_HEAD        (zombies); 
+        struct list_head *tmp;
+        struct list_head *nxt;
+        kib_tx_t         *tx;
+        /* Mark conn DISCONNECTED, abort its txs, then drain its early rxs */
+        /* I'm the connd */
+        LASSERT (!in_interrupt());
+        LASSERT (current == kibnal_data.kib_connd);
+        LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
+        
+        kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED);
+
+        /* move QP to error state to make posted work items complete */
+        kibnal_set_qp_state(conn, vv_qp_state_error);
+
+        spin_lock(&conn->ibc_lock);
+
+        /* Complete all tx descs not waiting for sends to complete.
+         * NB we should be safe from RDMA now that the QP has changed state */
+        /* First the txs still on ibc_tx_queue... */
+        list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
+                tx = list_entry (tmp, kib_tx_t, tx_list);
+
+                tx->tx_status = -ECONNABORTED;
+                tx->tx_waiting = 0;
+                
+                if (tx->tx_sending != 0)
+                        continue;
+
+                list_del (&tx->tx_list);
+                list_add (&tx->tx_list, &zombies);
+        }
+        /* ...then the txs on ibc_active_txs, treated the same way */
+        list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
+                tx = list_entry (tmp, kib_tx_t, tx_list);
+
+                LASSERT (tx->tx_waiting ||
+                         tx->tx_sending != 0);
+
+                tx->tx_status = -ECONNABORTED;
+                tx->tx_waiting = 0;
+                
+                if (tx->tx_sending != 0)
+                        continue;
+
+                list_del (&tx->tx_list);
+                list_add (&tx->tx_list, &zombies);
+        }
+        /* zombies get kibnal_tx_done() below, once ibc_lock is dropped */
+        spin_unlock(&conn->ibc_lock);
+
+        while (!list_empty(&zombies)) {
+                tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+                list_del(&tx->tx_list);
+                kibnal_tx_done (tx);
+        }
+
+        kibnal_handle_early_rxs(conn);
 }
 
-static void
-kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
+void
+kibnal_peer_connect_failed (kib_peer_t *peer, int active)
 {
-        LIST_HEAD        (zombies);
+        struct list_head  zombies;
         kib_tx_t         *tx;
         unsigned long     flags;
 
-        LASSERT (rc != 0);
+        /* Only the connd creates conns => single threaded */
+        LASSERT (!in_interrupt());
+        LASSERT (current == kibnal_data.kib_connd);
         LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
 
-        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
 
-        LASSERT (peer->ibp_connecting != 0);
-        peer->ibp_connecting--;
+        if (active) {
+                LASSERT (peer->ibp_connecting != 0);
+                peer->ibp_connecting--;
+        } else {
+                LASSERT (!kibnal_peer_active(peer));
+        }
+        
         if (peer->ibp_connecting != 0) {
                 /* another connection attempt under way (loopback?)... */
-                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+                write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
                 return;
         }
 
@@ -1848,15 +1912,9 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
                 peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
                                                     IBNAL_MAX_RECONNECT_INTERVAL);
         
-                /* Take peer's blocked blocked transmits; I'll complete
-                 * them with error */
-                while (!list_empty (&peer->ibp_tx_queue)) {
-                        tx = list_entry (peer->ibp_tx_queue.next,
-                                         kib_tx_t, tx_list);
-                        
-                        list_del (&tx->tx_list);
-                        list_add_tail (&tx->tx_list, &zombies);
-                }
+                /* Take peer's blocked transmits to complete with error */
+                list_add(&zombies, &peer->ibp_tx_queue);
+                list_del_init(&peer->ibp_tx_queue);
                 
                 if (kibnal_peer_active(peer) &&
                     (peer->ibp_persistence == 0)) {
@@ -1868,996 +1926,896 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
                 LASSERT (list_empty(&peer->ibp_tx_queue));
         }
         
-        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
 
-        if (!list_empty (&zombies))
-                CERROR ("Deleting messages for "LPX64": connection failed\n",
-                        peer->ibp_nid);
-
-        while (!list_empty (&zombies)) {
+        if (list_empty (&zombies)) 
+                return;
+        
+        CERROR ("Deleting messages for "LPX64": connection failed\n", peer->ibp_nid);
+        do {
                 tx = list_entry (zombies.next, kib_tx_t, tx_list);
 
                 list_del (&tx->tx_list);
                 /* complete now */
                 tx->tx_status = -EHOSTUNREACH;
                 kibnal_tx_done (tx);
-        }
+        } while (!list_empty (&zombies));
 }
 
-static void
-kibnal_connreq_done (kib_conn_t *conn, int active, int status)
+void
+kibnal_connreq_done(kib_conn_t *conn, int active, int status)
 {
-        int               state = conn->ibc_state;
-        kib_peer_t       *peer = conn->ibc_peer;
-        kib_tx_t         *tx;
-        unsigned long     flags;
-        int               i;
+        static cm_reject_data_t   rej;  /* static is safe: connd-only (asserted below) */
 
-        CDEBUG(D_NET, "Enter kibnal_connreq_done for conn=%p, active=%d, status=%d\n",
-               conn, active, status);
+        struct list_head   txs;
+        kib_peer_t        *peer = conn->ibc_peer;
+        kib_peer_t        *peer2;
+        unsigned long      flags;
+        kib_tx_t          *tx;
 
-        /* passive connection has no connreq & vice versa */
-        LASSERTF(!active == !(conn->ibc_connreq != NULL),
-                 "%d %p\n", active, conn->ibc_connreq);
+        /* Connect attempt done (status != 0: failed).  Only connd gets here => single threaded */
+        LASSERT (!in_interrupt());
+        LASSERT (current == kibnal_data.kib_connd);
+        LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
 
         if (active) {
-                PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
-                conn->ibc_connreq = NULL;
-        }
-
-        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
-
-        LASSERT (peer->ibp_connecting != 0);
-        
-        if (status == 0) {                         
-                /* connection established... */
-                KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_CONNECTING);
-                conn->ibc_state = IBNAL_CONN_ESTABLISHED;
-
-                if (!kibnal_peer_active(peer)) {
-                        /* ...but peer deleted meantime */
-                        status = -ECONNABORTED;
-                }
+                LASSERT (peer->ibp_connecting > 0);
         } else {
-                KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_INIT_QP,
-                                            IBNAL_CONN_CONNECTING);
+                LASSERT (!kibnal_peer_active(peer));
         }
+        
+        PORTAL_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+        conn->ibc_connvars = NULL;      /* freed on success and failure alike */
 
-        if (status == 0) {
-                /* Everything worked! */
-
-                peer->ibp_connecting--;
-
-                /* +1 ref for ibc_list; caller(== CM)'s ref remains until
-                 * the IB_CM_IDLE callback */
-                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                       atomic_read (&conn->ibc_refcount));
-                atomic_inc (&conn->ibc_refcount);
-                list_add (&conn->ibc_list, &peer->ibp_conns);
-                
-                /* reset reconnect interval for next attempt */
-                peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
-
-                /* post blocked sends to the new connection */
-                spin_lock (&conn->ibc_lock);
-                
-                while (!list_empty (&peer->ibp_tx_queue)) {
-                        tx = list_entry (peer->ibp_tx_queue.next, 
-                                         kib_tx_t, tx_list);
-                        
-                        list_del (&tx->tx_list);
-
-                        /* +1 ref for each tx */
-                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                               atomic_read (&conn->ibc_refcount));
-                        atomic_inc (&conn->ibc_refcount);
-                        kibnal_queue_tx_locked (tx, conn);
-                }
-                
-                spin_unlock (&conn->ibc_lock);
-
-                /* Nuke any dangling conns from a different peer instance... */
-                kibnal_close_stale_conns_locked (conn->ibc_peer,
-                                                 conn->ibc_incarnation);
+        if (status != 0) {
+                /* failed to establish connection; cleanup depends on the state reached */
+                switch (conn->ibc_state) {
+                default:
+                        LBUG();         /* no other state is handled here */
+                case IBNAL_CONN_ACTIVE_CHECK_REPLY:
+                        /* got a connection reply but failed checks */
+                        LASSERT (active);
+                        memset(&rej, 0, sizeof(rej));
+                        rej.reason = cm_rej_code_usr_rej;
+                        cm_reject(conn->ibc_cep, &rej);
+                        break;
 
-                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+                case IBNAL_CONN_ACTIVE_CONNECT:
+                        LASSERT (active);
+                        cm_cancel(conn->ibc_cep);
+                        kibnal_pause(HZ/10);
+                        /* cm_connect() failed immediately or
+                         * callback returned failure */
+                        break;
 
-                /* queue up all the receives */
-                for (i = 0; i < IBNAL_RX_MSGS; i++) {
-                        /* +1 ref for rx desc */
-                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                               atomic_read (&conn->ibc_refcount));
-                        atomic_inc (&conn->ibc_refcount);
+                case IBNAL_CONN_ACTIVE_ARP:
+                        LASSERT (active);
+                        /* ibat_get_ib_data() failed immediately 
+                         * or callback returned failure */
+                        break;
 
-                        CDEBUG(D_NET, "RX[%d] %p->%p\n",
-                               i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg);
+                case IBNAL_CONN_INIT:   /* nothing extra to do */
+                        break;
 
-                        kibnal_post_rx (&conn->ibc_rxs[i], 0);
+                case IBNAL_CONN_PASSIVE_WAIT:
+                        LASSERT (!active);
+                        /* cm_accept callback returned failure */
+                        break;
                 }
 
-                kibnal_check_sends (conn);
+                kibnal_peer_connect_failed(conn->ibc_peer, active);
+                kibnal_conn_disconnected(conn);
                 return;
         }
 
-        /* connection failed */
-        if (state == IBNAL_CONN_CONNECTING) {
-                /* schedule for connd to close */
-                kibnal_close_conn_locked (conn, status);
-        } else {
-                /* Don't have a CM comm_id; just wait for refs to drain */
-                conn->ibc_state = IBNAL_CONN_DISCONNECTED;
-        } 
-
-        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
-
-        kibnal_peer_connect_failed (conn->ibc_peer, active, status);
+        /* connection established; publish it under the global lock */
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
 
-        /* If we didn't establish the connection we don't have to pass
-         * through the disconnect protocol before dropping the CM ref */
-        if (state < IBNAL_CONN_CONNECTING) 
-                kibnal_put_conn (conn);
-}
-
-static int
-kibnal_accept (kib_conn_t **connp, cm_cep_handle_t *cep,
-                ptl_nid_t nid, __u64 incarnation, int queue_depth)
-{
-        kib_conn_t    *conn = kibnal_create_conn();
-        kib_peer_t    *peer;
-        kib_peer_t    *peer2;
-        unsigned long  flags;
-
-        if (conn == NULL)
-                return (-ENOMEM);
-
-        if (queue_depth != IBNAL_MSG_QUEUE_SIZE) {
-                CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n",
-                       nid, queue_depth, IBNAL_MSG_QUEUE_SIZE);
-                atomic_dec (&conn->ibc_refcount);
-                kibnal_destroy_conn(conn);
-                return (-EPROTO);
+        if (active) {
+                LASSERT(conn->ibc_state == IBNAL_CONN_ACTIVE_RTU);
+        } else {
+                LASSERT(conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
         }
         
-        /* assume 'nid' is a new peer */
-        peer = kibnal_create_peer (nid);
-        if (peer == NULL) {
-                CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n",
-                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                       atomic_read (&conn->ibc_refcount));
-                atomic_dec (&conn->ibc_refcount);
-                kibnal_destroy_conn(conn);
-                return (-ENOMEM);
+        kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED);
+
+        if (!active) {
+                peer2 = kibnal_find_peer_locked(peer->ibp_nid);
+                if (peer2 != NULL) {
+                        /* already in the peer table; swap conn (and its peer ref) to peer2 */
+                        conn->ibc_peer = peer2;
+                        kibnal_peer_addref(peer2);
+                        kibnal_peer_decref(peer);
+                        peer = conn->ibc_peer;
+                } else {
+                        /* not there yet: add 'peer' to the peer table with another ref */
+                        kibnal_peer_addref(peer);
+                        list_add_tail(&peer->ibp_list,
+                                      kibnal_nid2peerlist(peer->ibp_nid));
+                }
         }
         
-        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+        /* Add conn to peer's list and nuke any dangling conns from a different
+         * peer instance... */
+        kibnal_conn_addref(conn);               /* +1 ref for ibc_list */
+        list_add(&conn->ibc_list, &peer->ibp_conns);
+        kibnal_close_stale_conns_locked (conn->ibc_peer,
+                                         conn->ibc_incarnation);
+
+        if (!kibnal_peer_active(peer) ||        /* peer has been deleted */
+            conn->ibc_comms_error != 0 ||       /* comms error */
+            conn->ibc_disconnect) {             /* need to disconnect */
+                
+                /* start to shut down connection */
+                kibnal_close_conn_locked(conn, -ECONNABORTED);
 
-        peer2 = kibnal_find_peer_locked(nid);
-        if (peer2 == NULL) {
-                /* peer table takes my ref on peer */
-                list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid));
-        } else {
-                kib_peer_decref (peer);
-                peer = peer2;
+                write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+                kibnal_peer_connect_failed(peer, active);
+                return;
         }
 
-        kib_peer_addref(peer); /* +1 ref for conn */
-        peer->ibp_connecting++;
-
-        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
-
-        conn->ibc_peer = peer;
-        conn->ibc_state = IBNAL_CONN_CONNECTING;
-        /* conn->ibc_cep is set when cm_accept is called */
-        conn->ibc_incarnation = incarnation;
-        conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
-
-        *connp = conn;
-        return (0);
-}
+        if (active)
+                peer->ibp_connecting--;
 
-static void kibnal_move_qp_to_error(kib_conn_t *conn)
-{
-        vv_qp_attr_t qp_attr;
-        vv_return_t retval;
+        /* grab pending txs while I have the lock (splice the queue onto 'txs') */
+        list_add(&txs, &peer->ibp_tx_queue);
+        list_del_init(&peer->ibp_tx_queue);
+        
+        /* reset reconnect interval for next attempt */
+        peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+
+        /* Schedule blocked txs on the new conn */
+        spin_lock (&conn->ibc_lock);
+        while (!list_empty (&txs)) {
+                tx = list_entry (txs.next, kib_tx_t, tx_list);
+                list_del (&tx->tx_list);
 
-        qp_attr.modify.qp_modify_into_state = vv_qp_state_error;
-        qp_attr.modify.vv_qp_attr_mask      = VV_QP_AT_STATE;
-        qp_attr.modify.qp_type              = vv_qp_type_r_conn;
+                kibnal_queue_tx_locked (tx, conn);
+        }
+        spin_unlock (&conn->ibc_lock);
+        kibnal_check_sends (conn);
 
-        retval = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &qp_attr, &conn->ibc_qp_attrs);
-        if (retval)
-                CERROR("couldn't move qp into error state, error %d\n", retval);
+        /* schedule blocked rxs */
+        kibnal_handle_early_rxs(conn);
 }
 
-static void kibnal_flush_pending(kib_conn_t *conn)
+void
+kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg)
 {
-        LIST_HEAD        (zombies); 
-        struct list_head *tmp;
-        struct list_head *nxt;
-        kib_tx_t         *tx;
-        unsigned long     flags;
-        int               done;
-
-        /* NB we wait until the connection has closed before completing
-         * outstanding passive RDMAs so we can be sure the network can't 
-         * touch the mapped memory any more. */
-        KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_DISCONNECTED);
-
-        /* set the QP to the error state so that we get flush callbacks
-         * on our posted receives which can then drop their conn refs */
-        kibnal_move_qp_to_error(conn);
-
-        spin_lock_irqsave (&conn->ibc_lock, flags);
-
-        /* grab passive RDMAs not waiting for the tx callback */
-        list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
-                tx = list_entry (tmp, kib_tx_t, tx_list);
+        static cm_dreply_data_t drep;           /* just zeroed space */
+        
+        kib_conn_t             *conn = (kib_conn_t *)arg;
+        unsigned long           flags;
+        
+        /* CM event handler ('arg' is the conn).  CAVEAT EMPTOR: tasklet context */
 
-                LASSERT (tx->tx_passive_rdma ||
-                         !tx->tx_passive_rdma_wait);
+        switch (cmdata->status) {
+        default:
+                LBUG();                 /* no other CM event is handled */
+                
+        case cm_event_disconn_request:
+                /* IBNAL_CONN_ACTIVE_RTU:  gets closed in kibnal_connreq_done
+                 * IBNAL_CONN_ESTABLISHED: I start it closing
+                 * otherwise:              it's closing anyway */
+                cm_disconnect(conn->ibc_cep, NULL, &drep);
+                cm_cancel(conn->ibc_cep);
 
-                LASSERT (tx->tx_passive_rdma_wait ||
-                         tx->tx_sending != 0);
+                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+                LASSERT (!conn->ibc_disconnect);
+                conn->ibc_disconnect = 1;
 
-                /* still waiting for tx callback? */
-                if (!tx->tx_passive_rdma_wait)
-                        continue;
+                switch (conn->ibc_state) {
+                default:
+                        LBUG();         /* no other conn state is handled */
 
-                tx->tx_status = -ECONNABORTED;
-                tx->tx_passive_rdma_wait = 0;
-                done = (tx->tx_sending == 0);
+                case IBNAL_CONN_ACTIVE_RTU:
+                        /* kibnal_connreq_done is getting there; it'll see
+                         * ibc_disconnect set... */
+                        kibnal_conn_decref(conn); /* lose my ref */
+                        break;
 
-                if (!done)
-                        continue;
+                case IBNAL_CONN_ESTABLISHED:
+                        /* kibnal_connreq_done got there already; get
+                         * disconnect going... */
+                        kibnal_close_conn_locked(conn, 0);
+                        kibnal_conn_decref(conn); /* lose my ref */
+                        break;
 
-                list_del (&tx->tx_list);
-                list_add (&tx->tx_list, &zombies);
-        }
+                case IBNAL_CONN_DISCONNECT1:
+                        /* kibnal_terminate_conn is getting there; it'll see
+                         * ibc_disconnect set... */
+                        kibnal_conn_decref(conn); /* lose my ref */
+                        break;
 
-        /* grab all blocked transmits */
-        list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
-                tx = list_entry (tmp, kib_tx_t, tx_list);
+                case IBNAL_CONN_DISCONNECT2:
+                        /* kibnal_terminate_conn got there already; complete
+                         * the disconnect.  NB kib_connd_conns takes my ref */
+                        spin_lock(&kibnal_data.kib_connd_lock);
+                        list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
+                        wake_up(&kibnal_data.kib_connd_waitq);
+                        spin_unlock(&kibnal_data.kib_connd_lock);
+                        break;
+                }
+                write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+                return;
                 
-                list_del (&tx->tx_list);
-                list_add (&tx->tx_list, &zombies);
-        }
-        
-        spin_unlock_irqrestore (&conn->ibc_lock, flags);
-
-        while (!list_empty(&zombies)) {
-                tx = list_entry (zombies.next, kib_tx_t, tx_list);
-
-                list_del(&tx->tx_list);
-                kibnal_tx_done (tx);
+        case cm_event_disconn_timeout:
+        case cm_event_disconn_reply:
+                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+                LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT2);
+                LASSERT (!conn->ibc_disconnect);
+                conn->ibc_disconnect = 1;
+
+                /* kibnal_terminate_conn sent the disconnect request.  
+                 * NB kib_connd_conns takes my ref */
+                spin_lock(&kibnal_data.kib_connd_lock);
+                list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
+                wake_up(&kibnal_data.kib_connd_waitq);
+                spin_unlock(&kibnal_data.kib_connd_lock);
+
+                write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+                break;
+                
+        case cm_event_connected:
+        case cm_event_conn_timeout:
+        case cm_event_conn_reject:
+                LASSERT (conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
+                conn->ibc_connvars->cv_conndata = *cmdata; /* copy event, then queue conn for connd */
+                
+                spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
+                list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
+                wake_up(&kibnal_data.kib_connd_waitq);
+                spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
+                break;
         }
 }
 
-static void
-kibnal_reject (cm_cep_handle_t cep, cm_rej_code_t reason)
+void
+kibnal_check_passive_wait(kib_conn_t *conn)
 {
-        cm_reject_data_t *rej;
-
-        PORTAL_ALLOC(rej, sizeof(*rej));
-        if (rej == NULL) /* PORTAL_ALLOC() will CERROR on failure */
-                return;  
-
-        rej->reason = reason;
-        cm_reject(cep, rej);
-        PORTAL_FREE(rej, sizeof(*rej));
-}
+        int     rc;
 
-static void get_av_from_path(ib_path_record_v2_t *path, vv_add_vec_t *av)
-{
-        av->service_level = path->sl;
-        av->grh_flag = 0;       /* TODO: correct? */
-        av->dlid = path->dlid;
-        av->pmtu = path->mtu;
-
-        /* From sdp-hca-params.h. */
-        switch(path->rate) {
-        case 2:
-                av->max_static_rate = 1;
-                break;
-        case 3:
-        case 4:
+        switch (conn->ibc_connvars->cv_conndata.status) { /* CM event saved in connvars */
         default:
-                av->max_static_rate = 0;
+                LBUG();                 /* no other CM event is handled */
+                
+        case cm_event_connected:
+                kibnal_conn_addref(conn); /* ++ ref for CM callback */
+                rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
+                if (rc != 0)
+                        conn->ibc_comms_error = rc;
+                /* connection _has_ been established, so report status 0 even if
+                 * we've had an error immediately (noted in ibc_comms_error)... */
+                kibnal_connreq_done(conn, 0, 0);
+                break;
+                
+        case cm_event_conn_timeout:
+                kibnal_connreq_done(conn, 0, -ETIMEDOUT);
+                break;
+                
+        case cm_event_conn_reject:
+                kibnal_connreq_done(conn, 0, -ECONNRESET);
                 break;
         }
+}
 
-        av->l_ack_timeout = IBNAL_ACK_TIMEOUT;
-        av->retry_count = IBNAL_RETRY;
-        av->rnr_retry_count = IBNAL_RNR_RETRY; 
-        av->source_path_bit = 0;
-
-        av->global_dest.flow_lable = path->flow_label;
-        av->global_dest.hope_limit = path->hop_limut;
-        av->global_dest.traffic_class = path->traffic_class;
-        av->global_dest.s_gid_index = 0;
-        av->global_dest.d_gid = path->dgid;
-};
-
-static vv_return_t
-kibnal_qp_rts(vv_qp_h_t qp_handle, __u32 qpn, __u8 resp_res, 
-              ib_path_record_v2_t *path, __u8 init_depth, __u32 send_psn)
+void
+kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq)
 {
-        vv_qp_attr_t qp_attr;
-        vv_return_t retval;
+        static cm_reply_data_t  reply;
+        static cm_reject_data_t reject;
 
-        ENTRY;
+        kib_msg_t          *rxmsg = (kib_msg_t *)cmreq->priv_data;
+        kib_msg_t          *txmsg;
+        kib_conn_t         *conn = NULL;
+        int                 rc = 0;
+        kib_connvars_t     *cv;
+        kib_peer_t         *tmp_peer;
+        cm_return_t         cmrc;
+        vv_return_t         vvrc;
+        
+        /* I'm the connd executing in thread context
+         * No concurrency problems with static data! */
+        LASSERT (!in_interrupt());
+        LASSERT (current == kibnal_data.kib_connd);
 
-#if 1
-        /* TODO - Hack. I don't know whether I get bad values from the
-         * stack or if I'm using the wrong names. */
-        resp_res = 8;
-        init_depth = 8;
-#endif
+        if (cmreq->sid != IBNAL_SERVICE_NUMBER) {
+                CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n",
+                       cmreq->sid, (__u64)IBNAL_SERVICE_NUMBER);
+                goto reject;
+        }
 
-        /* RTR */
-        qp_attr.modify.qp_modify_into_state = vv_qp_state_rtr;
-        qp_attr.modify.vv_qp_attr_mask =
-                VV_QP_AT_STATE | 
-                VV_QP_AT_ADD_VEC |
-                VV_QP_AT_DEST_QP |
-                VV_QP_AT_R_PSN |
-                VV_QP_AT_RESP_RDMA_ATOM_OUT_NUM |
-                VV_QP_AT_MIN_RNR_NAK_T | VV_QP_AT_OP_F;
-
-        qp_attr.modify.qp_type = vv_qp_type_r_conn;
-
-        get_av_from_path(path, &qp_attr.modify.params.rtr.remote_add_vec);
-        qp_attr.modify.params.rtr.destanation_qp = qpn;
-        qp_attr.modify.params.rtr.receive_psn = IBNAL_STARTING_PSN;
-        qp_attr.modify.params.rtr.responder_rdma_r_atom_num = resp_res;
-        qp_attr.modify.params.rtr.opt_min_rnr_nak_timer = 16; /* 20 ms */
-
-        /* For now, force MTU to 1KB (Voltaire's advice). */
-        qp_attr.modify.params.rtr.remote_add_vec.pmtu = vv_mtu_1024;
-
-        retval = vv_qp_modify(kibnal_data.kib_hca, qp_handle, &qp_attr, NULL);
-        if (retval) {
-                CERROR("Cannot modify QP to RTR: %d\n", retval);
-                RETURN(retval);
-        }
-
-        /* RTS */
-        qp_attr.modify.qp_modify_into_state = vv_qp_state_rts;
-        qp_attr.modify.vv_qp_attr_mask = 
-                VV_QP_AT_STATE |
-                VV_QP_AT_L_ACK_T |
-                VV_QP_AT_RETRY_NUM |
-                VV_QP_AT_RNR_NUM |
-                VV_QP_AT_S_PSN |
-                VV_QP_AT_DEST_RDMA_ATOM_OUT_NUM;
-        qp_attr.modify.qp_type = vv_qp_type_r_conn;             
-
-        qp_attr.modify.params.rts.local_ack_timeout = path->pkt_life_time + 2; /* 2 or 1? */ 
-        qp_attr.modify.params.rts.retry_num = IBNAL_RETRY;
-        qp_attr.modify.params.rts.rnr_num = IBNAL_RNR_RETRY;
-        qp_attr.modify.params.rts.send_psn = send_psn;
-        qp_attr.modify.params.rts.dest_out_rdma_r_atom_num = init_depth;
-        qp_attr.modify.params.rts.flow_control = 1; /* Stack does not use it. */
-
-        retval = vv_qp_modify(kibnal_data.kib_hca, qp_handle, &qp_attr, NULL);
-        if (retval) {
-                CERROR("Cannot modify QP to RTS: %d\n", retval);
-        }
-
-        RETURN(retval);
-}
+        rc = kibnal_unpack_msg(rxmsg, cm_REQ_priv_data_len);
+        if (rc != 0) {
+                CERROR("Can't parse connection request: %d\n", rc);
+                goto reject;
+        }
 
-static void
-kibnal_connect_reply (cm_cep_handle_t cep, cm_conn_data_t *info, kib_conn_t *conn)
-{
-        vv_hca_attrib_t *ca_attr = &kibnal_data.kib_hca_attrs;
-        kib_wire_connreq_t *wcr;
-        cm_reply_data_t *rep = &info->data.reply;
-        cm_rej_code_t reason;
-        vv_return_t retval;
+        if (rxmsg->ibm_type != IBNAL_MSG_CONNREQ) {
+                CERROR("Unexpected connreq msg type: %x from "LPX64"\n",
+                       rxmsg->ibm_type, rxmsg->ibm_srcnid);
+                goto reject;
+        }
 
-        wcr = (kib_wire_connreq_t *)info->data.reply.priv_data;
+        if (rxmsg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid) {
+                CERROR("Can't accept "LPX64": bad dst nid "LPX64"\n",
+                       rxmsg->ibm_srcnid, rxmsg->ibm_dstnid);
+                goto reject;
+        }
 
-        if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
-                CERROR ("Can't connect "LPX64": bad magic %08x\n",
-                        conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic));
-                GOTO(reject, reason = cm_rej_code_usr_rej);
+        if (rxmsg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
+                CERROR("Can't accept "LPX64": incompatible queue depth %d (%d wanted)\n",
+                       rxmsg->ibm_srcnid, rxmsg->ibm_u.connparams.ibcp_queue_depth, 
+                       IBNAL_MSG_QUEUE_SIZE);
+                goto reject;
         }
-        
-        if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
-                CERROR ("Can't connect "LPX64": bad version %d\n",
-                        conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_magic));
-                GOTO(reject, reason = cm_rej_code_usr_rej);
+
+        if (rxmsg->ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
+                CERROR("Can't accept "LPX64": message size %d too big (%d max)\n",
+                       rxmsg->ibm_srcnid, rxmsg->ibm_u.connparams.ibcp_max_msg_size, 
+                       IBNAL_MSG_SIZE);
+                goto reject;
         }
-                        
-        if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
-                CERROR ("Can't connect "LPX64": bad queue depth %d\n",
-                        conn->ibc_peer->ibp_nid
-                        le16_to_cpu(wcr->wcr_queue_depth));
-                GOTO(reject, reason = cm_rej_code_usr_rej);
+                
+        if (rxmsg->ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
+                CERROR("Can't accept "LPX64": max frags %d too big (%d max)\n",
+                       rxmsg->ibm_srcnid, rxmsg->ibm_u.connparams.ibcp_max_frags
+                       IBNAL_MAX_RDMA_FRAGS);
+                goto reject;
         }
-                        
-        if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) {
-                CERROR ("Unexpected NID "LPX64" from "LPX64"\n",
-                        le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid);
-                GOTO(reject, reason = cm_rej_code_usr_rej);
+                
+        conn = kibnal_create_conn(cep);
+        if (conn == NULL) {
+                CERROR("Can't create conn for "LPX64"\n", rxmsg->ibm_srcnid);
+                goto reject;
         }
-
-        CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n",
-               conn, conn->ibc_peer->ibp_nid);
-
-        conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation);
-        conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
-
-        retval = kibnal_qp_rts(conn->ibc_qp, rep->qpn, 
-                            min_t(__u8, rep->arb_initiator_depth,
-                                  ca_attr->max_read_atom_qp_outstanding),
-                            &conn->ibc_connreq->cr_path, 
-                            min_t(__u8, rep->arb_resp_res,
-                                  ca_attr->max_qp_depth_for_init_read_atom),
-                            rep->start_psn);
-
-        if (retval) {
-                CERROR("Connection %p -> "LPX64" QP RTS/RTR failed: %d\n",
-                       conn, conn->ibc_peer->ibp_nid, retval);
-                GOTO(reject, reason = cm_rej_code_no_qp);
-        }
-
-        dump_qp(conn);
-
-        /* the callback arguments are ignored for an active accept */
-        /* TODO: memset cmrtu? */
-        retval = cm_accept(cep, NULL, &conn->ibc_connreq->cr_cm_rtu, kibnal_cm_callback, conn);
-        if (retval) {
-                CERROR("Connection %p -> "LPX64" CMAccept RTU failed: %d\n",
-                       conn, conn->ibc_peer->ibp_nid, retval);
-                kibnal_connreq_done (conn, 1, -ECONNABORTED);
-                /* XXX don't call reject after accept fails? */
-                return;
+        
+        /* assume 'rxmsg->ibm_srcnid' is a new peer */
+        tmp_peer = kibnal_create_peer (rxmsg->ibm_srcnid);
+        if (tmp_peer == NULL) {
+                CERROR("Can't create tmp peer for "LPX64"\n", rxmsg->ibm_srcnid);
+                kibnal_conn_decref(conn);
+                conn = NULL;
+                goto reject;
         }
 
-        CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n",
-               conn, conn->ibc_peer->ibp_nid);
-
-        kibnal_connreq_done (conn, 1, 0);
-
-        return;
+ &