* First cut iiblnd (compiles but untested)
author    eeb <eeb>
Sat, 15 Oct 2005 16:46:21 +0000 (16:46 +0000)
committer eeb <eeb>
Sat, 15 Oct 2005 16:46:21 +0000 (16:46 +0000)
*   Removed #if LNET_SINGLE_THREADED and replaced with #if !HAVE_LIBPTHREAD (see the sketch after this list)

*   Fixed LND module descriptions to say LND (not NAL)

*   viblnd cleanups (removed unused struct members, fixed some formatting, etc.)

*   minor cleanup in text buffer allocations (lnet/lnet/config.c)

*   format string fix in klnd/ptllnd.c

*   fixed lustre/utils/obd.c to work without libpthread/fork (disables --threads)
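
The first item above is a build-configuration change: userspace LNet code now keys its
single-threaded fallbacks off autoconf's HAVE_LIBPTHREAD probe instead of a private
LNET_SINGLE_THREADED define (see the lustre-lnet.m4, lib-lnet.h and lib-types.h hunks
below).  A minimal sketch of the resulting conditional; the UNLOCK macro and the
pthread-side bodies here are illustrative only and are not taken from this patch:

#include <assert.h>
#define LASSERT(cond)   assert(cond)    /* stand-in for libcfs's LASSERT() */

#if !HAVE_LIBPTHREAD                    /* was: #if LNET_SINGLE_THREADED */
/* single-threaded userspace: a "lock" is just an int flag asserted on */
#define LNET_SINGLE_THREADED_LOCK(l)            \
do {                                            \
        LASSERT ((l) == 0);                     \
        (l) = 1;                                \
} while (0)
#define LNET_SINGLE_THREADED_UNLOCK(l)          \
do {                                            \
        LASSERT ((l) == 1);                     \
        (l) = 0;                                \
} while (0)
#else
#include <pthread.h>
/* pthreads available: the same entry points wrap real mutexes */
#define LNET_MUTEX_DOWN(m)      pthread_mutex_lock(m)
#define LNET_MUTEX_UP(m)        pthread_mutex_unlock(m)
#endif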

23 files changed:
lnet/autoconf/lustre-lnet.m4
lnet/include/lnet/lib-lnet.h
lnet/include/lnet/lib-types.h
lnet/include/lnet/ptllnd.h
lnet/klnds/gmlnd/gmlnd_module.c
lnet/klnds/iiblnd/Makefile.in
lnet/klnds/iiblnd/iiblnd.c
lnet/klnds/iiblnd/iiblnd.h
lnet/klnds/iiblnd/iiblnd_cb.c
lnet/klnds/iiblnd/iiblnd_modparams.c [new file with mode: 0644]
lnet/klnds/openiblnd/openiblnd.c
lnet/klnds/ralnd/ralnd.c
lnet/klnds/socklnd/socklnd.c
lnet/klnds/viblnd/viblnd.c
lnet/klnds/viblnd/viblnd.h
lnet/klnds/viblnd/viblnd_cb.c
lnet/lnet/api-ni.c
lnet/lnet/config.c
lnet/lnet/lib-eq.c
lnet/lnet/lib-msg.c
lnet/lnet/peer.c
lnet/ulnds/ptllnd/ptllnd.c
lnet/ulnds/socklnd/procapi.c

lnet/autoconf/lustre-lnet.m4
index 8f0abf2..bd8d455 100644
@@ -863,7 +863,6 @@ if test x$enable_liblustre = xyes ; then
                AC_DEFINE([HAVE_LIBPTHREAD], 1, [use libpthread])
        else
                PTHREAD_LIBS=""
-               AC_DEFINE([LNET_SINGLE_THREADED], 1, [lnet single threaded])
        fi
        AC_SUBST(PTHREAD_LIBS)
 fi
lnet/include/lnet/lib-lnet.h
index 8ff67d0..02c562e 100644
@@ -43,7 +43,7 @@ static inline int lnet_md_exhausted (lnet_libmd_t *md)
 #define LNET_MUTEX_DOWN(m) mutex_down(m)
 #define LNET_MUTEX_UP(m)   mutex_up(m)
 #else
-# if LNET_SINGLE_THREADED
+# if !HAVE_LIBPTHREAD
 #define LNET_SINGLE_THREADED_LOCK(l)            \
 do {                                            \
         LASSERT ((l) == 0);                     \
lnet/include/lnet/lib-types.h
index dc063fb..dfbefdd 100644
@@ -412,7 +412,7 @@ typedef struct
         struct semaphore   ln_api_mutex;
         struct semaphore   ln_lnd_mutex;
 #else
-# if LNET_SINGLE_THREADED
+# if !HAVE_LIBPTHREAD
         int                ln_lock;
         int                ln_api_mutex;
         int                ln_lnd_mutex;
lnet/include/lnet/ptllnd.h
index 61805d8..9fc47b8 100755
@@ -31,6 +31,7 @@
 
 /* NIDs are 64-bits on Lustre Portals */
 #define FMT_NID LPX64
+#define FMT_PID "%d"
 
 /* When using Lustre Portals Lustre completion semantics are imlicit*/
 #define PTL_MD_LUSTRE_COMPLETION_SEMANTICS      0
@@ -40,8 +41,9 @@
 /* Explicit NULL function pointer for EQ handler */
 #define PTL_EQ_HANDLER_NONE                     0
 
-/* NIDs are integers on Lustre Portals */
+/* NIDs are integers on Cray Portals */
 #define FMT_NID "%x"
+#define FMT_PID "%d"
 
 /* When using Cray Portals this is defined in the Cray Portals Header*/
 /*#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS */
lnet/klnds/gmlnd/gmlnd_module.c
index b74e21d..91ecf04 100644
@@ -119,5 +119,5 @@ module_init(gmnal_load);
 module_exit(gmnal_unload);
 
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Kernel GM NAL v1.01");
+MODULE_DESCRIPTION("Kernel GM LND v1.01");
 MODULE_LICENSE("GPL");
lnet/klnds/iiblnd/Makefile.in
index 8733812..7ee9b64 100644
@@ -1,5 +1,5 @@
 MODULES := kiiblnd
-kiiblnd-objs := iiblnd.o iiblnd_cb.o
+kiiblnd-objs := iiblnd.o iiblnd_cb.o iiblnd_modparams.o
 
 EXTRA_POST_CFLAGS := @IIBCPPFLAGS@
 
lnet/klnds/iiblnd/iiblnd.c
index d8253b9..281b9dc 100644
@@ -32,119 +32,435 @@ lnd_t the_kiblnd = {
         .lnd_recv          = kibnal_recv,
 };
 
-kib_tunables_t          kibnal_tunables;
+kib_data_t              kibnal_data;
 
-kib_data_t              kibnal_data = {
-        .kib_service_id = IBNAL_SERVICE_NUMBER,
-};
-
-#ifdef CONFIG_SYSCTL
-#define IBNAL_SYSCTL             202
+__u32 
+kibnal_cksum (void *ptr, int nob)
+{
+        char  *c  = ptr;
+        __u32  sum = 0;
 
-#define IBNAL_SYSCTL_TIMEOUT     1
+        while (nob-- > 0)
+                sum = ((sum << 1) | (sum >> 31)) + *c++;
+        
+        /* ensure I don't return 0 (== no checksum) */
+        return (sum == 0) ? 1 : sum;
+}
 
-static ctl_table kibnal_ctl_table[] = {
-        {IBNAL_SYSCTL_TIMEOUT, "timeout", 
-         &kibnal_tunables.kib_io_timeout, sizeof (int),
-         0644, NULL, &proc_dointvec},
-        { 0 }
-};
+void
+kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
+{
+        msg->ibm_type = type;
+        msg->ibm_nob  = offsetof(kib_msg_t, ibm_u) + body_nob;
+}
 
-static ctl_table kibnal_top_ctl_table[] = {
-        {IBNAL_SYSCTL, "iibnal", NULL, 0, 0555, kibnal_ctl_table},
-        { 0 }
-};
-#endif
+void
+kibnal_pack_msg(kib_msg_t *msg, int credits, lnet_nid_t dstnid, 
+                __u64 dststamp, __u64 seq)
+{
+        /* CAVEAT EMPTOR! all message fields not set here should have been
+         * initialised previously. */
+        msg->ibm_magic    = IBNAL_MSG_MAGIC;
+        msg->ibm_version  = IBNAL_MSG_VERSION;
+        /*   ibm_type */
+        msg->ibm_credits  = credits;
+        /*   ibm_nob */
+        msg->ibm_cksum    = 0;
+        msg->ibm_srcnid   = lnet_ptlcompat_srcnid(kibnal_data.kib_ni->ni_nid,
+                                                  dstnid);
+        msg->ibm_srcstamp = kibnal_data.kib_incarnation;
+        msg->ibm_dstnid   = dstnid;
+        msg->ibm_dststamp = dststamp;
+        msg->ibm_seq      = seq;
+
+        if (*kibnal_tunables.kib_cksum) {
+                /* NB ibm_cksum zero while computing cksum */
+                msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
+        }
+}
 
-#ifdef unused
 void
-print_service(IB_SERVICE_RECORD *service, char *tag, int rc)
+kibnal_pack_connmsg(kib_msg_t *msg, int nob, int type, 
+                    lnet_nid_t dstnid, __u64 dststamp)
 {
-        char        name[32];
-        lnet_nid_t  nid;
+        LASSERT (nob >= offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t));
 
-        if (service == NULL) 
-        {
-                CWARN("tag       : %s\n"
-                      "status    : %d (NULL)\n", tag, rc);
-                return;
+        memset(msg, 0, nob);
+        kibnal_init_msg(msg, type, sizeof(kib_connparams_t));
+
+        msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
+        msg->ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
+        msg->ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
+
+        kibnal_pack_msg(msg, 0, dstnid, dststamp, 0);
+}
+
+int
+kibnal_unpack_msg(kib_msg_t *msg, int nob)
+{
+        const int hdr_size = offsetof(kib_msg_t, ibm_u);
+        __u32     msg_cksum;
+        int       flip;
+        int       msg_nob;
+#if !IBNAL_USE_FMR
+        int       i;
+        int       n;
+#endif
+        /* 6 bytes are enough to have received magic + version */
+        if (nob < 6) {
+                CERROR("Short message: %d\n", nob);
+                return -EPROTO;
+        }
+
+        if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
+                flip = 0;
+        } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
+                flip = 1;
+        } else {
+                CERROR("Bad magic: %08x\n", msg->ibm_magic);
+                return -EPROTO;
+        }
+
+        if (msg->ibm_version != 
+            (flip ? __swab16(IBNAL_MSG_VERSION) : IBNAL_MSG_VERSION)) {
+                CERROR("Bad version: %d\n", msg->ibm_version);
+                return -EPROTO;
+        }
+
+        if (nob < hdr_size) {
+                CERROR("Short message: %d\n", nob);
+                return -EPROTO;
+        }
+
+        msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
+        if (msg_nob > nob) {
+                CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
+                return -EPROTO;
+        }
+
+        /* checksum must be computed with ibm_cksum zero and BEFORE anything
+         * gets flipped */
+        msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
+        msg->ibm_cksum = 0;
+        if (msg_cksum != 0 &&
+            msg_cksum != kibnal_cksum(msg, msg_nob)) {
+                CERROR("Bad checksum\n");
+                return -EPROTO;
         }
-        strncpy (name, service->ServiceName, sizeof(name)-1);
-        name[sizeof(name)-1] = 0;
-        nid = *kibnal_service_nid_field(service);
+        msg->ibm_cksum = msg_cksum;
         
-        CWARN("tag       : %s\n"
-              "status    : %d\n"
-              "service id: "LPX64"\n"
-              "name      : %s\n"
-              "NID       : %s\n", tag, rc,
-              service->RID.ServiceID, name,
-              libcfs_nid2str(nid));
-}
+        if (flip) {
+                /* leave magic unflipped as a clue to peer endianness */
+                __swab16s(&msg->ibm_version);
+                CLASSERT (sizeof(msg->ibm_type) == 1);
+                CLASSERT (sizeof(msg->ibm_credits) == 1);
+                msg->ibm_nob = msg_nob;
+                __swab64s(&msg->ibm_srcnid);
+                __swab64s(&msg->ibm_srcstamp);
+                __swab64s(&msg->ibm_dstnid);
+                __swab64s(&msg->ibm_dststamp);
+                __swab64s(&msg->ibm_seq);
+        }
+        
+        if (msg->ibm_srcnid == LNET_NID_ANY) {
+                CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
+                return -EPROTO;
+        }
+
+        switch (msg->ibm_type) {
+        default:
+                CERROR("Unknown message type %x\n", msg->ibm_type);
+                return -EPROTO;
+                
+        case IBNAL_MSG_NOOP:
+                break;
+
+        case IBNAL_MSG_IMMEDIATE:
+                if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
+                        CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
+                               (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
+                        return -EPROTO;
+                }
+                break;
+
+        case IBNAL_MSG_PUT_REQ:
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) {
+                        CERROR("Short PUT_REQ: %d(%d)\n", msg_nob,
+                               (int)(hdr_size + sizeof(msg->ibm_u.putreq)));
+                        return -EPROTO;
+                }
+                break;
+
+        case IBNAL_MSG_PUT_ACK:
+#if IBNAL_USE_FMR
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) {
+                        CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
+                               (int)(hdr_size + sizeof(msg->ibm_u.putack)));
+                        return -EPROTO;
+                }
+
+                if (flip) {
+                        __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr);
+                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob);
+                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
+                }
+#else
+                if (flip) {
+                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
+                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag);
+                }
+                
+                n = msg->ibm_u.putack.ibpam_rd.rd_nfrag;
+                if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
+                        CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n", 
+                               n, IBNAL_MAX_RDMA_FRAGS);
+                        return -EPROTO;
+                }
+                
+                if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) {
+                        CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
+                               (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n]));
+                        return -EPROTO;
+                }
+
+                if (flip) {
+                        for (i = 0; i < n; i++) {
+                                __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob);
+                                __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr);
+                        }
+                }
 #endif
+                break;
 
-static void
-kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
-                              FSTATUS frc, uint32 madrc)
+        case IBNAL_MSG_GET_REQ:
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) {
+                        CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
+                               (int)(hdr_size + sizeof(msg->ibm_u.get)));
+                        return -EPROTO;
+                }
+#if IBNAL_USE_FMR
+                if (flip) {
+                        __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr);
+                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob);
+                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
+                }
+#else                
+                if (flip) {
+                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
+                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag);
+                }
+
+                n = msg->ibm_u.get.ibgm_rd.rd_nfrag;
+                if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
+                        CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n", 
+                               n, IBNAL_MAX_RDMA_FRAGS);
+                        return -EPROTO;
+                }
+                
+                if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) {
+                        CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
+                               (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n]));
+                        return -EPROTO;
+                }
+                
+                if (flip)
+                        for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) {
+                                __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob);
+                                __swab64s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr);
+                        }
+#endif
+                break;
+
+        case IBNAL_MSG_PUT_NAK:
+        case IBNAL_MSG_PUT_DONE:
+        case IBNAL_MSG_GET_DONE:
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
+                        CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
+                               (int)(hdr_size + sizeof(msg->ibm_u.completion)));
+                        return -EPROTO;
+                }
+                if (flip)
+                        __swab32s(&msg->ibm_u.completion.ibcm_status);
+                break;
+
+        case IBNAL_MSG_CONNREQ:
+        case IBNAL_MSG_CONNACK:
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
+                        CERROR("Short connreq/ack: %d(%d)\n", msg_nob,
+                               (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
+                        return -EPROTO;
+                }
+                if (flip) {
+                        __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth);
+                        __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
+                        __swab32s(&msg->ibm_u.connparams.ibcp_max_frags);
+                }
+                break;
+        }
+        return 0;
+}
+
+IB_HANDLE
+kibnal_create_cep(lnet_nid_t nid)
 {
-        *(FSTATUS *)arg = frc;
-        up (&kibnal_data.kib_nid_signal);
+        FSTATUS        frc;
+        __u32          u32val;
+        IB_HANDLE      cep;
+
+        cep = iibt_cm_create_cep(CM_RC_TYPE);
+        if (cep == NULL) {
+                CERROR ("Can't create CEP for %s\n",
+                        (nid == LNET_NID_ANY) ? "listener" :
+                        libcfs_nid2str(nid));
+                return NULL;
+        }
+
+        if (nid == LNET_NID_ANY) {
+                u32val = 1;
+                frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT,
+                                         (char *)&u32val, sizeof(u32val), 0);
+                if (frc != FSUCCESS) {
+                        CERROR("Can't set async_accept: %d\n", frc);
+                        goto failed;
+                }
+
+                u32val = 0;                     /* sets system max */
+                frc = iibt_cm_modify_cep(cep, CM_FLAG_LISTEN_BACKLOG,
+                                         (char *)&u32val, sizeof(u32val), 0);
+                if (frc != FSUCCESS) {
+                        CERROR("Can't set listen backlog: %d\n", frc);
+                        goto failed;
+                }
+        }
+        
+        u32val = 1;
+        frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK,
+                                 (char *)&u32val, sizeof(u32val), 0);
+        if (frc != FSUCCESS) {
+                CERROR("Can't set timewait_callback for %s: %d\n", 
+                        (nid == LNET_NID_ANY) ? "listener" :
+                        libcfs_nid2str(nid), frc);
+                goto failed;
+        }
+
+        return cep;
+        
+ failed:
+        iibt_cm_destroy_cep(cep);
+        return NULL;
 }
 
 #if IBNAL_CHECK_ADVERT
-static void
+void
 kibnal_service_query_done (void *arg, QUERY *qry, 
                            QUERY_RESULT_VALUES *qry_result)
 {
-        FSTATUS frc = qry_result->Status;
+        int                    *rcp = arg;
+        FSTATUS                 frc = qry_result->Status;
+        SERVICE_RECORD_RESULTS *svc_rslt;
+        SERVICE_RECORD         *svc;
+        lnet_nid_t              nid;
+
+        if (frc != FSUCCESS || qry_result->ResultDataSize == 0) {
+                CERROR("Error checking advert: status %d data size %d\n",
+                       frc, qry_result->ResultDataSize);
+                *rcp = -EIO;
+                goto out;
+        }
+
+        svc_rslt = (SERVICE_RECORD_RESULTS *)qry_result->QueryResult;
 
-        if (frc != FSUCCESS &&
-            qry_result->ResultDataSize == 0)
-                frc = FERROR;
+        if (svc_rslt->NumServiceRecords < 1) {
+                CERROR("Check advert: %d records\n",
+                       svc->NumServiceRecords);
+                *rcp = -ENOENT;
+                goto out;
+        }
+
+        svc = &svc_rslt->ServiceRecords[0];
+        nid = le64_to_cpu(*kibnal_service_nid_field(svc));
         
-        *(FSTATUS *)arg = frc;
-        up (&kibnal_data.kib_nid_signal);
+        if (nid != kibnal_data.kib_ni->ni_nid) {
+                CERROR("Check advert: Bad NID %s (%s expected)\n",
+                       nid, kibnal_data.kib_ni->ni_nid);
+                *rcp = -EINVAL;
+                goto out;
+        }
+
+        if (svc->RID.ServiceID != *kibnal_tunables.kib_service_number) {
+                CERROR("Check advert: Bad ServiceID "LPX64" ("LPX64" expected)\n",
+                       svc->RID.ServiceID,
+                       *kibnal_tunables.kib_service_number);
+                *rcp = -EINVAL;
+                goto out;
+        }
+
+        if (svc->RID.ServiceGID.Type.Global.InterfaceID != 
+            kibnal_data.kib_port_guid) {
+                CERROR("Check advert: Bad GUID "LPX64" ("LPX64" expected)\n",
+                       svc->RID.ServiceGID.Type.Global.InterfaceID,
+                       kibnal_data.kib_port_guid);
+                *rcp = -EINVAL;
+                goto out;
+        }
+
+        if (svc->RID.ServiceP_Key != kibnal_data.kib_port_pkey) {
+                CERROR("Check advert: Bad PKEY %04x (%04x expected)\n",
+                       svc->RID.ServiceP_Key, kibnal_data.kib_port_pkey);
+                *rcp = -EINVAL;
+                goto out;
+        }
+
+        CDEBUG(D_WARNING, "Check advert OK\n");
+        *rcp = 0;
+                
+ out:
+        up (&kibnal_data.kib_listener_signal);                
 }
 
-static void
+int
 kibnal_check_advert (void)
 {
-        QUERY                  *qry;
-        IB_SERVICE_RECORD      *svc;
-        FSTATUS                 frc;
-        FSTATUS                 frc2;
+        /* single-threaded */
+        static QUERY               qry;
 
-        LIBCFS_ALLOC(qry, sizeof(*qry));
-        if (qry == NULL)
-                return;
+        FSTATUS                    frc;
+        int                        rc;
 
-        memset (qry, 0, sizeof(*qry));
-        qry->InputType = InputTypeServiceRecord;
-        qry->OutputType = OutputTypeServiceRecord;
-        qry->InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
-        svc = &qry->InputValue.ServiceRecordValue.ServiceRecord;
-        kibnal_set_service_keys(svc, kibnal_data.kib_ni->ni_nid);
+        memset (&qry, 0, sizeof(qry));
+        qry.InputType = InputTypeServiceRecord;
+        qry.OutputType = OutputTypeServiceRecord;
+        kibnal_set_service_keys(&qry.InputValue.ServiceRecordValue.ServiceRecord,
+                                kibnal_data.kib_ni->ni_nid);
+        qry.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
 
-        frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
+        frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd, 
                                                     kibnal_data.kib_port_guid,
-                                                    qry,
+                                                    &qry, 
                                                     kibnal_service_query_done,
-                                                    NULL, &frc2);
-        if (frc != FSUCCESS && frc != FPENDING) {
+                                                    &kibnal_data.kib_sdretry, 
+                                                    &rc);
+        if (frc != FPENDING) {
                 CERROR ("Immediate error %d checking SM service\n", frc);
-        } else {
-                down (&kibnal_data.kib_nid_signal);
-                frc = frc2;
-
-                if (frc != 0)
-                        CERROR ("Error %d checking SM service\n", rc);
+                return -EIO;
         }
-
-        return (rc);
+        
+        down (&kibnal_data.kib_listener_signal);
+        
+        if (rc != 0)
+                CERROR ("Error %d checking SM service\n", rc);
+        return rc;
+}
+#else
+int
+kibnal_check_advert(void)
+{
+        return 0;
 }
 #endif
 
-static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
+void 
+kibnal_fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
 {
         IB_SERVICE_RECORD     *svc;
 
@@ -152,7 +468,7 @@ static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
         fod->Type = type;
 
         svc = &fod->Value.ServiceRecordValue.ServiceRecord;
-        svc->RID.ServiceID = kibnal_data.kib_service_id;
+        svc->RID.ServiceID = *kibnal_tunables.kib_service_number;
         svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid;
         svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX;
         svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey;
@@ -161,86 +477,96 @@ static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
         kibnal_set_service_keys(svc, kibnal_data.kib_ni->ni_nid);
 }
 
-static int
-kibnal_advertise (void)
+void
+kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
+                              FSTATUS frc, uint32 madrc)
 {
-        FABRIC_OPERATION_DATA *fod;
-        IB_SERVICE_RECORD     *svc;
-        FSTATUS                frc;
-        FSTATUS                frc2;
-
-        LASSERT (kibnal_data.kib_ni->ni_nid != LNET_NID_ANY);
+        *(FSTATUS *)arg = frc;
+        up (&kibnal_data.kib_listener_signal);
+}
 
-        LIBCFS_ALLOC(fod, sizeof(*fod));
-        if (fod == NULL)
-                return (-ENOMEM);
+int
+kibnal_advertise (void)
+{
+        /* Single threaded here */
+        static FABRIC_OPERATION_DATA fod;
+
+        IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord;
+        FSTATUS            frc;
+        FSTATUS            frc2;
+
+        if (strlen(*kibnal_tunables.kib_service_name) >=
+            sizeof(svc->ServiceName)) {
+                CERROR("Service name '%s' too long (%d chars max)\n",
+                       *kibnal_tunables.kib_service_name,
+                       sizeof(svc->ServiceName) - 1);
+                return -EINVAL;
+        }
 
-        fill_fod(fod, FabOpSetServiceRecord);
-        svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+        kibnal_fill_fod(&fod, FabOpSetServiceRecord);
 
         CDEBUG(D_NET, "Advertising service id "LPX64" %s:%s\n", 
-               svc->RID.ServiceID, 
-               svc->ServiceName, 
-               libcfs_nid2str(*kibnal_service_nid_field(svc)));
+               svc->RID.ServiceID, svc->ServiceName, 
+               libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc))));
 
         frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
                                             kibnal_data.kib_port_guid,
-                                            fod, kibnal_service_setunset_done, 
-                                            NULL, &frc2);
+                                            &fod, 
+                                            kibnal_service_setunset_done, 
+                                            &kibnal_data.kib_sdretry,
+                                            &frc2);
 
         if (frc != FSUCCESS && frc != FPENDING) {
                 CERROR ("Immediate error %d advertising NID %s\n",
                         frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
-                goto out;
+                return -EIO;
         }
 
-        down (&kibnal_data.kib_nid_signal);
+        down (&kibnal_data.kib_listener_signal);
 
         frc = frc2;
-        if (frc != FSUCCESS)
-                CERROR ("Error %d advertising BUD %s\n",
-                        frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
-out:
-        LIBCFS_FREE(fod, sizeof(*fod));
-        return (frc == FSUCCESS) ? 0 : -EINVAL;
+        if (frc == FSUCCESS)
+                return 0;
+        
+        CERROR ("Error %d advertising %s\n",
+                frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
+        return -EIO;
 }
 
-static void
+void
 kibnal_unadvertise (int expect_success)
 {
-        FABRIC_OPERATION_DATA *fod;
-        IB_SERVICE_RECORD     *svc;
-        FSTATUS                frc;
-        FSTATUS                frc2;
+        /* single threaded */
+        static FABRIC_OPERATION_DATA fod;
 
-        LASSERT (kibnal_data.kib_ni->ni_nid != LNET_NID_ANY);
+        IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord;
+        FSTATUS            frc;
+        FSTATUS            frc2;
 
-        LIBCFS_ALLOC(fod, sizeof(*fod));
-        if (fod == NULL)
-                return;
+        LASSERT (kibnal_data.kib_ni->ni_nid != LNET_NID_ANY);
 
-        fill_fod(fod, FabOpDeleteServiceRecord);
-        svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+        kibnal_fill_fod(&fod, FabOpDeleteServiceRecord);
 
         CDEBUG(D_NET, "Unadvertising service %s:%s\n",
                svc->ServiceName, 
-               libcfs_nid2str(*kibnal_service_nid_field(svc)));
+               libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc))));
         
         frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
                                             kibnal_data.kib_port_guid,
-                                            fod, kibnal_service_setunset_done, 
-                                            NULL, &frc2);
-
+                                            &fod, 
+                                            kibnal_service_setunset_done, 
+                                            &kibnal_data.kib_sdretry, 
+                                            &frc2);
         if (frc != FSUCCESS && frc != FPENDING) {
                 CERROR ("Immediate error %d unadvertising NID %s\n",
                         frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
-                goto out;
+                return;
         }
 
-        down (&kibnal_data.kib_nid_signal);
+        down (&kibnal_data.kib_listener_signal);
 
-        if ((frc2 == FSUCCESS) == !!expect_success)
-                goto out;
+        if ((frc2 == FSUCCESS) == !!expect_success)
+                return;
 
         if (expect_success)
                 CERROR("Error %d unadvertising NID %s\n",
@@ -248,118 +574,104 @@ kibnal_unadvertise (int expect_success)
         else
                 CWARN("Removed conflicting NID %s\n",
                       libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
- out:
-        LIBCFS_FREE(fod, sizeof(*fod));
 }
 
-static int
-kibnal_set_mynid(lnet_nid_t nid)
+void
+kibnal_stop_listener(int normal_shutdown)
 {
-        struct timeval tv;
-        int            rc;
+        /* NB this also disables peer creation and destroys all existing
+         * peers */
+        IB_HANDLE      cep = kibnal_data.kib_listener_cep;
+        unsigned long  flags;
         FSTATUS        frc;
 
-        CDEBUG(D_IOCTL, "setting mynid to %s (old nid=%s)\n",
-               libcfs_nid2str(nid), 
-               libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
+        LASSERT (cep != NULL);
 
-        do_gettimeofday(&tv);
+        kibnal_unadvertise(normal_shutdown);
 
-        down (&kibnal_data.kib_nid_mutex);
+        frc = iibt_cm_cancel(cep);
+        if (frc != FSUCCESS && frc != FPENDING)
+                CERROR ("Error %d stopping listener\n", frc);
 
-        if (nid == kibnal_data.kib_ni->ni_nid) {
-                /* no change of NID */
-                up (&kibnal_data.kib_nid_mutex);
-                return (0);
-        }
+        down(&kibnal_data.kib_listener_signal);
 
-        CDEBUG(D_NET, "NID %s(%s)\n",
-               libcfs_nid2str(kibnal_data.kib_ni->ni_nid), 
-               libcfs_nid2str(nid));
-        
-        if (kibnal_data.kib_ni->ni_nid != LNET_NID_ANY) {
+        frc = iibt_cm_destroy_cep(cep);
+        if (frc != FSUCCESS)
+                CERROR ("Error %d destroying listener CEP\n", frc);
 
-                kibnal_unadvertise (1);
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+        /* This assignment disables peer creation */
+        kibnal_data.kib_listener_cep = NULL;
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
 
-                frc = iibt_cm_cancel(kibnal_data.kib_cep);
-                if (frc != FSUCCESS && frc != FPENDING)
-                        CERROR ("Error %d stopping listener\n", frc);
+        /* Start to tear down any peers created while the listener was
+         * running */
+        kibnal_del_peer(LNET_NID_ANY);
+}
 
-                frc = iibt_cm_destroy_cep(kibnal_data.kib_cep);
-                if (frc != FSUCCESS)
-                        CERROR ("Error %d destroying CEP\n", frc);
+int
+kibnal_start_listener(void)
+{
+        /* NB this also enables peer creation */
 
-                kibnal_data.kib_cep = NULL;
-        }
-        
-        kibnal_data.kib_ni->ni_nid = nid;
-        kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
-        
-        /* Delete all existing peers and their connections after new
-         * NID/incarnation set to ensure no old connections in our brave
-         * new world. */
-        kibnal_del_peer(LNET_NID_ANY);
+        IB_HANDLE      cep;
+        CM_LISTEN_INFO info;
+        unsigned long  flags;
+        int            rc;
+        FSTATUS        frc;
+        __u32          u32val;
 
-        if (kibnal_data.kib_ni->ni_nid == LNET_NID_ANY) {
-                /* No new NID to install */
-                up (&kibnal_data.kib_nid_mutex);
-                return (0);
-        }
+        LASSERT (kibnal_data.kib_listener_cep == NULL);
+        init_MUTEX_LOCKED (&kibnal_data.kib_listener_signal);
 
         /* remove any previous advert (crashed node etc) */
         kibnal_unadvertise(0);
 
-        kibnal_data.kib_cep = iibt_cm_create_cep(CM_RC_TYPE);
-        if (kibnal_data.kib_cep == NULL) {
-                CERROR ("Can't create CEP\n");
-                rc = -ENOMEM;
-        } else {
-                CM_LISTEN_INFO info;
-                memset (&info, 0, sizeof(info));
-                info.ListenAddr.EndPt.SID = kibnal_data.kib_service_id;
-
-                frc = iibt_cm_listen(kibnal_data.kib_cep, &info,
-                                     kibnal_listen_callback, NULL);
-                if (frc != FSUCCESS && frc != FPENDING) {
-                        CERROR ("iibt_cm_listen error: %d\n", frc);
-                        rc = -EINVAL;
-                } else {
-                        rc = 0;
-                }
-        }
-        
-        if (rc == 0) {
-                rc = kibnal_advertise();
-                if (rc == 0) {
-#if IBNAL_CHECK_ADVERT
-                        kibnal_check_advert();
-#endif
-                        up (&kibnal_data.kib_nid_mutex);
-                        return (0);
-                }
-                
-                iibt_cm_cancel (kibnal_data.kib_cep);
-                iibt_cm_destroy_cep (kibnal_data.kib_cep);
-                /* remove any peers that sprung up while I failed to
-                 * advertise myself */
-                kibnal_del_peer(LNET_NID_ANY);
+        cep = kibnal_create_cep(LNET_NID_ANY);
+        if (cep == NULL)
+                return -ENOMEM;
+
+        memset (&info, 0, sizeof(info));
+        info.ListenAddr.EndPt.SID = *kibnal_tunables.kib_service_number;
+
+        frc = iibt_cm_listen(cep, &info, kibnal_listen_callback, NULL);
+        if (frc != FSUCCESS && frc != FPENDING) {
+                CERROR ("iibt_cm_listen error: %d\n", frc);
+
+                iibt_cm_destroy_cep(cep);
+                return -EIO;
         }
 
-        kibnal_data.kib_ni->ni_nid = LNET_NID_ANY;
-        up (&kibnal_data.kib_nid_mutex);
-        return (rc);
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+        /* This assignment enables peer creation */
+        kibnal_data.kib_listener_cep = cep;
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+
+        rc = kibnal_advertise();
+        if (rc == 0)
+                rc = kibnal_check_advert();
+
+        if (rc == 0)
+                return 0;
+
+        kibnal_stop_listener(0);
+        return rc;
 }
 
-kib_peer_t *
-kibnal_create_peer (lnet_nid_t nid)
+int
+kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid)
 {
-        kib_peer_t *peer;
+        kib_peer_t    *peer;
+        unsigned long  flags;
+        int            rc;
 
         LASSERT (nid != LNET_NID_ANY);
 
         LIBCFS_ALLOC (peer, sizeof (*peer));
-        if (peer == NULL)
-                return (NULL);
+        if (peer == NULL) {
+                CERROR("Cannot allocate peer\n");
+                return -ENOMEM;
+        }
 
         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
 
@@ -372,8 +684,31 @@ kibnal_create_peer (lnet_nid_t nid)
 
         peer->ibp_reconnect_interval = 0;       /* OK to connect at any time */
 
-        atomic_inc (&kibnal_data.kib_npeers);
-        return (peer);
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+        
+        if (atomic_read(&kibnal_data.kib_npeers) >=
+            *kibnal_tunables.kib_concurrent_peers) {
+                rc = -EOVERFLOW;        /* !! but at least it distinguishes */
+        } else if (kibnal_data.kib_listener_cep == NULL) {
+                rc = -ESHUTDOWN;        /* shutdown has started */
+        } else {
+                rc = 0;
+                /* npeers only grows with the global lock held */
+                atomic_inc(&kibnal_data.kib_npeers);
+        }
+        
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+
+        if (rc != 0) {
+                CERROR("Can't create peer: %s\n", 
+                       (rc == -ESHUTDOWN) ? "shutting down" : 
+                       "too many peers");
+                LIBCFS_FREE(peer, sizeof(*peer));
+        } else {
+                *peerp = peer;
+        }
+        
+        return rc;
 }
 
 void
@@ -424,21 +759,6 @@ kibnal_find_peer_locked (lnet_nid_t nid)
         return (NULL);
 }
 
-kib_peer_t *
-kibnal_get_peer (lnet_nid_t nid)
-{
-        kib_peer_t     *peer;
-        unsigned long   flags;
-
-        read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
-        peer = kibnal_find_peer_locked (nid);
-        if (peer != NULL)                       /* +1 ref for caller? */
-                kib_peer_addref(peer);
-        read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
-
-        return (peer);
-}
-
 void
 kibnal_unlink_peer_locked (kib_peer_t *peer)
 {
@@ -448,10 +768,10 @@ kibnal_unlink_peer_locked (kib_peer_t *peer)
         LASSERT (kibnal_peer_active(peer));
         list_del_init (&peer->ibp_list);
         /* lose peerlist's ref */
-        kib_peer_decref(peer);
+        kibnal_peer_decref(peer);
 }
 
-static int
+int
 kibnal_get_peer_info (int index, lnet_nid_t *nidp, int *persistencep)
 {
         kib_peer_t        *peer;
@@ -486,25 +806,26 @@ kibnal_get_peer_info (int index, lnet_nid_t *nidp, int *persistencep)
         return (-ENOENT);
 }
 
-static int
+int
 kibnal_add_persistent_peer (lnet_nid_t nid)
 {
         unsigned long      flags;
         kib_peer_t        *peer;
         kib_peer_t        *peer2;
+        int                rc;
         
         if (nid == LNET_NID_ANY)
                 return (-EINVAL);
 
-        peer = kibnal_create_peer (nid);
-        if (peer == NULL)
-                return (-ENOMEM);
+        rc = kibnal_create_peer(&peer, nid);
+        if (rc != 0)
+                return rc;
 
         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
         peer2 = kibnal_find_peer_locked (nid);
         if (peer2 != NULL) {
-                kib_peer_decref (peer);
+                kibnal_peer_decref (peer);
                 peer = peer2;
         } else {
                 /* peer table takes existing ref on peer */
@@ -518,7 +839,7 @@ kibnal_add_persistent_peer (lnet_nid_t nid)
         return (0);
 }
 
-static void
+void
 kibnal_del_peer_locked (kib_peer_t *peer)
 {
         struct list_head *ctmp;
@@ -583,7 +904,7 @@ kibnal_del_peer (lnet_nid_t nid)
         return (rc);
 }
 
-static kib_conn_t *
+kib_conn_t *
 kibnal_get_conn_by_idx (int index)
 {
         kib_peer_t        *peer;
@@ -608,11 +929,7 @@ kibnal_get_conn_by_idx (int index)
                                         continue;
 
                                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
-                                CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
-                                       conn, conn->ibc_state, 
-                                       libcfs_nid2str(conn->ibc_peer->ibp_nid),
-                                       atomic_read (&conn->ibc_refcount));
-                                atomic_inc (&conn->ibc_refcount);
+                                kibnal_conn_addref(conn);
                                 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                                        flags);
                                 return (conn);
@@ -624,13 +941,90 @@ kibnal_get_conn_by_idx (int index)
         return (NULL);
 }
 
+int
+kibnal_conn_rts(kib_conn_t *conn, 
+                __u32 qpn, __u8 resp_res, __u8 init_depth, __u32 psn)
+{
+        IB_PATH_RECORD         *path = &conn->ibc_cvars->cv_path;
+        IB_HANDLE               qp = conn->ibc_qp;
+        IB_QP_ATTRIBUTES_MODIFY modify_attr;
+        FSTATUS                 frc;
+        int                     rc;
+
+        if (resp_res > kibnal_data.kib_hca_attrs.MaxQPResponderResources)
+                resp_res = kibnal_data.kib_hca_attrs.MaxQPResponderResources;
+
+        if (init_depth > kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth)
+                init_depth = kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth;
+
+        modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+                .RequestState       = QPStateReadyToRecv,
+                .RecvPSN            = IBNAL_STARTING_PSN,
+                .DestQPNumber       = qpn,
+                .ResponderResources = resp_res,
+                .MinRnrTimer        = UsecToRnrNakTimer(2000), /* 20 ms */
+                .Attrs              = (IB_QP_ATTR_RECVPSN |
+                                       IB_QP_ATTR_DESTQPNUMBER | 
+                                       IB_QP_ATTR_RESPONDERRESOURCES | 
+                                       IB_QP_ATTR_DESTAV | 
+                                       IB_QP_ATTR_PATHMTU | 
+                                       IB_QP_ATTR_MINRNRTIMER),
+        };
+        GetAVFromPath(0, path, &modify_attr.PathMTU, NULL, 
+                      &modify_attr.DestAV);
+
+        frc = iibt_qp_modify(qp, &modify_attr, NULL);
+        if (frc != FSUCCESS) {
+                CERROR("Can't set QP %s ready to receive: %d\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+                return -EIO;
+        }
+
+        rc = kibnal_post_receives(conn);
+        if (rc != 0) {
+                CERROR("Can't post receives for %s: %d\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+                return rc;
+        }
+
+        modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+                .RequestState           = QPStateReadyToSend,
+                .FlowControl            = TRUE,
+                .InitiatorDepth         = init_depth,
+                .SendPSN                = psn,
+                .LocalAckTimeout        = path->PktLifeTime + 2, /* 2 or 1? */
+                .RetryCount             = IBNAL_RETRY,
+                .RnrRetryCount          = IBNAL_RNR_RETRY,
+                .Attrs                  = (IB_QP_ATTR_FLOWCONTROL | 
+                                           IB_QP_ATTR_INITIATORDEPTH | 
+                                           IB_QP_ATTR_SENDPSN | 
+                                           IB_QP_ATTR_LOCALACKTIMEOUT | 
+                                           IB_QP_ATTR_RETRYCOUNT | 
+                                           IB_QP_ATTR_RNRRETRYCOUNT),
+        };
+
+        frc = iibt_qp_modify(qp, &modify_attr, NULL);
+        if (frc != FSUCCESS) {
+                CERROR("Can't set QP %s ready to send: %d\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+                return -EIO;
+        }
+
+        frc = iibt_qp_query(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't query QP %s attributes: %d\n",
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+                return -EIO;
+        }
+        
+        return 0;
+}
+
 kib_conn_t *
-kibnal_create_conn (void)
+kibnal_create_conn (lnet_nid_t nid)
 {
         kib_conn_t  *conn;
         int          i;
-        __u64        vaddr = 0;
-        __u64        vaddr_base;
         int          page_offset;
         int          ipage;
         int          rc;
@@ -642,13 +1036,16 @@ kibnal_create_conn (void)
         
         LIBCFS_ALLOC (conn, sizeof (*conn));
         if (conn == NULL) {
-                CERROR ("Can't allocate connection\n");
+                CERROR ("Can't allocate connection for %s\n",
+                        libcfs_nid2str(nid));
                 return (NULL);
         }
 
         /* zero flags, NULL pointers etc... */
         memset (conn, 0, sizeof (*conn));
+        conn->ibc_state = IBNAL_CONN_INIT_NOTHING;
 
+        INIT_LIST_HEAD (&conn->ibc_early_rxs);
         INIT_LIST_HEAD (&conn->ibc_tx_queue);
         INIT_LIST_HEAD (&conn->ibc_active_txs);
         spin_lock_init (&conn->ibc_lock);
@@ -656,34 +1053,39 @@ kibnal_create_conn (void)
         atomic_inc (&kibnal_data.kib_nconns);
         /* well not really, but I call destroy() on failure, which decrements */
 
-        LIBCFS_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
-        if (conn->ibc_rxs == NULL)
+        LIBCFS_ALLOC (conn->ibc_cvars, sizeof (*conn->ibc_cvars));
+        if (conn->ibc_cvars == NULL) {
+                CERROR ("Can't allocate connvars for %s\n", 
+                        libcfs_nid2str(nid));
                 goto failed;
-        memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
+        }
+        memset(conn->ibc_cvars, 0, sizeof (*conn->ibc_cvars));
 
-        rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1);
-        if (rc != 0)
+        LIBCFS_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
+        if (conn->ibc_rxs == NULL) {
+                CERROR("Cannot allocate RX descriptors for %s\n",
+                       libcfs_nid2str(nid));
                 goto failed;
+        }
+        memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
 
-        vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
-
+        rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES);
+        if (rc != 0) {
+                CERROR("Can't allocate RX buffers for %s\n",
+                       libcfs_nid2str(nid));
+                goto failed;
+        }
+        
         for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
                 struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
-                kib_rx_t   *rx = &conn->ibc_rxs[i];
+                kib_rx_t    *rx = &conn->ibc_rxs[i];
 
                 rx->rx_conn = conn;
                 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
                              page_offset);
 
-                if (kibnal_whole_mem()) 
-                        rx->rx_vaddr = kibnal_page2phys(page) + 
-                                       page_offset + 
-                                       kibnal_data.kib_md.md_addr;
-                else
-                        rx->rx_vaddr = vaddr;
-                
-                vaddr += IBNAL_MSG_SIZE;
-                LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
+                rx->rx_hca_msg = kibnal_data.kib_whole_mem.md_addr +
+                                 kibnal_page2phys(page) + page_offset;
                 
                 page_offset += IBNAL_MSG_SIZE;
                 LASSERT (page_offset <= PAGE_SIZE);
@@ -697,7 +1099,7 @@ kibnal_create_conn (void)
 
         params.qp_create = (IB_QP_ATTRIBUTES_CREATE) {
                 .Type                    = QPTypeReliableConnected,
-                .SendQDepth              = IBNAL_TX_MAX_SG * 
+                .SendQDepth              = (IBNAL_MAX_RDMA_FRAGS + 1) * 
                                            IBNAL_MSG_QUEUE_SIZE,
                 .RecvQDepth              = IBNAL_MSG_QUEUE_SIZE,
                 .SendDSListDepth         = 1,
@@ -708,14 +1110,14 @@ kibnal_create_conn (void)
                 .SendSignaledCompletions = TRUE,
         };
         frc = iibt_qp_create(kibnal_data.kib_hca, &params.qp_create, NULL,
-                             &conn->ibc_qp, &conn->ibc_qp_attrs);
-        if (rc != 0) {
-                CERROR ("Failed to create queue pair: %d\n", rc);
+                             &conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs);
+        if (frc != 0) {
+                CERROR ("Can't create QP %s: %d\n", libcfs_nid2str(nid), frc);
                 goto failed;
         }
 
         /* Mark QP created */
-        conn->ibc_state = IBNAL_CONN_INIT_QP;
+        kibnal_set_conn_state(conn, IBNAL_CONN_INIT_QP);
 
         params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) {
                 .RequestState             = QPStateInit,
@@ -724,16 +1126,24 @@ kibnal_create_conn (void)
                                              IB_QP_ATTR_ACCESSCONTROL),
                 .PortGUID                 = kibnal_data.kib_port_guid,
                 .PkeyIndex                = 0,
-                .AccessControl = {
+                .AccessControl = { 
                         .s = {
                                 .RdmaWrite = 1,
                                 .RdmaRead  = 1,
                         },
                 },
         };
-        rc = iibt_qp_modify(conn->ibc_qp, &params.qp_attr, NULL);
-        if (rc != 0) {
-                CERROR ("Failed to modify queue pair: %d\n", rc);
+        frc = iibt_qp_modify(conn->ibc_qp, &params.qp_attr, NULL);
+        if (frc != 0) {
+                CERROR ("Can't set QP %s state to INIT: %d\n",
+                        libcfs_nid2str(nid), frc);
+                goto failed;
+        }
+
+        frc = iibt_qp_query(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't query QP %s attributes: %d\n",
+                        libcfs_nid2str(nid), frc);
                 goto failed;
         }
 
@@ -749,42 +1159,46 @@ kibnal_create_conn (void)
 void
 kibnal_destroy_conn (kib_conn_t *conn)
 {
-        int    rc;
+        int     rc;
         FSTATUS frc;
+
+        LASSERT (!in_interrupt());
         
         CDEBUG (D_NET, "connection %p\n", conn);
 
         LASSERT (atomic_read (&conn->ibc_refcount) == 0);
+        LASSERT (list_empty(&conn->ibc_early_rxs));
         LASSERT (list_empty(&conn->ibc_tx_queue));
         LASSERT (list_empty(&conn->ibc_active_txs));
         LASSERT (conn->ibc_nsends_posted == 0);
-        LASSERT (conn->ibc_connreq == NULL);
 
         switch (conn->ibc_state) {
-        case IBNAL_CONN_DISCONNECTED:
-                /* called after connection sequence initiated */
-                /* fall through */
-
-        case IBNAL_CONN_INIT_QP:
-                /* _destroy includes an implicit Reset of the QP which 
-                 * discards posted work */
-                rc = iibt_qp_destroy(conn->ibc_qp);
-                if (rc != 0)
-                        CERROR("Can't destroy QP: %d\n", rc);
-                /* fall through */
-                
         case IBNAL_CONN_INIT_NOTHING:
+        case IBNAL_CONN_INIT_QP:
+        case IBNAL_CONN_DISCONNECTED:
                 break;
 
         default:
-                LASSERT (0);
+                /* conn must either have never engaged with the CM, or have
+                 * completely disengaged from it */
+                CERROR("Bad conn %s state %d\n",
+                       (conn->ibc_peer) == NULL ? "<anon>" :
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), conn->ibc_state);
+                LBUG();
         }
 
         if (conn->ibc_cep != NULL) {
                 frc = iibt_cm_destroy_cep(conn->ibc_cep);
-                if (frc != 0)
-                        CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep, 
-                               frc);
+                if (frc != FSUCCESS)
+                        CERROR("Error destroying CEP %p: %d\n",
+                               conn->ibc_cep, frc);
+        }
+
+        if (conn->ibc_qp != NULL) {
+                frc = iibt_qp_destroy(conn->ibc_qp);
+                if (frc != FSUCCESS)
+                        CERROR("Error destroying QP %p: %d\n",
+                               conn->ibc_qp, frc);
         }
 
         if (conn->ibc_rx_pages != NULL) 
@@ -794,48 +1208,18 @@ kibnal_destroy_conn (kib_conn_t *conn)
                 LIBCFS_FREE(conn->ibc_rxs, 
                             IBNAL_RX_MSGS * sizeof(kib_rx_t));
 
+        if (conn->ibc_cvars != NULL)
+                LIBCFS_FREE(conn->ibc_cvars, sizeof(*conn->ibc_cvars));
+
         if (conn->ibc_peer != NULL)
-                kib_peer_decref(conn->ibc_peer);
+                kibnal_peer_decref(conn->ibc_peer);
 
         LIBCFS_FREE(conn, sizeof (*conn));
 
         atomic_dec(&kibnal_data.kib_nconns);
-        
-        if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
-            kibnal_data.kib_shutdown) {
-                /* I just nuked the last connection on shutdown; wake up
-                 * everyone so they can exit. */
-                wake_up_all(&kibnal_data.kib_sched_waitq);
-                wake_up_all(&kibnal_data.kib_connd_waitq);
-        }
-}
-
-void
-kibnal_put_conn (kib_conn_t *conn)
-{
-        unsigned long flags;
-
-        CDEBUG (D_NET, "putting conn[%p] state %d -> %s (%d)\n",
-                conn, conn->ibc_state, 
-                libcfs_nid2str(conn->ibc_peer->ibp_nid),
-                atomic_read (&conn->ibc_refcount));
-
-        LASSERT (atomic_read (&conn->ibc_refcount) > 0);
-        if (!atomic_dec_and_test (&conn->ibc_refcount))
-                return;
-
-        /* must disconnect before dropping the final ref */
-        LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED);
-
-        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
-
-        list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
-        wake_up (&kibnal_data.kib_connd_waitq);
-
-        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 }
 
-static int
+int
 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
 {
         kib_conn_t         *conn;
@@ -878,7 +1262,7 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
         return (count);
 }
 
-static int
+int
 kibnal_close_matching_conns (lnet_nid_t nid)
 {
         unsigned long       flags;
@@ -935,7 +1319,7 @@ kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
         switch(cmd) {
         case IOC_LIBCFS_GET_PEER: {
                 lnet_nid_t   nid = 0;
-                int         share_count = 0;
+                int          share_count = 0;
 
                 rc = kibnal_get_peer_info(data->ioc_count,
                                           &nid, &share_count);
@@ -959,7 +1343,7 @@ kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
                 else {
                         rc = 0;
                         data->ioc_nid = conn->ibc_peer->ibp_nid;
-                        kibnal_put_conn (conn);
+                        kibnal_conn_decref(conn);
                 }
                 break;
         }
@@ -968,10 +1352,14 @@ kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
                 break;
         }
         case IOC_LIBCFS_REGISTER_MYNID: {
-                if (data->ioc_nid == LNET_NID_ANY)
+                if (ni->ni_nid == data->ioc_nid) {
+                        rc = 0;
+                } else {
+                        CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+                               libcfs_nid2str(data->ioc_nid),
+                               libcfs_nid2str(ni->ni_nid));
                         rc = -EINVAL;
-                else
-                        rc = kibnal_set_mynid (data->ioc_nid);
+                }
                 break;
         }
         }
@@ -983,15 +1371,8 @@ void
 kibnal_free_pages (kib_pages_t *p)
 {
         int     npages = p->ibp_npages;
-        int     rc;
         int     i;
         
-        if (p->ibp_mapped) {
-                rc = iibt_deregister_memory(p->ibp_handle);
-                if (rc != 0)
-                        CERROR ("Deregister error: %d\n", rc);
-        }
-        
         for (i = 0; i < npages; i++)
                 if (p->ibp_pages[i] != NULL)
                         __free_page(p->ibp_pages[i]);
@@ -1000,19 +1381,10 @@ kibnal_free_pages (kib_pages_t *p)
 }
 
 int
-kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
+kibnal_alloc_pages (kib_pages_t **pp, int npages)
 {
-        kib_pages_t                *p;
-        __u64                      *phys_pages;
-        int                         i;
-        FSTATUS                     frc;
-        IB_ACCESS_CONTROL           access;
-
-        memset(&access, 0, sizeof(access));
-        access.s.MWBindable = 1;
-        access.s.LocalWrite = 1;
-        access.s.RdmaRead = 1;
-        access.s.RdmaWrite = 1;
+        kib_pages_t   *p;
+        int            i;
 
         LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
         if (p == NULL) {
@@ -1032,123 +1404,215 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
                 }
         }
 
-        if (kibnal_whole_mem())
-                goto out;
-
-        LIBCFS_ALLOC(phys_pages, npages * sizeof(*phys_pages));
-        if (phys_pages == NULL) {
-                CERROR ("Can't allocate physarray for %d pages\n", npages);
-                /* XXX free ibp_pages? */
-                kibnal_free_pages(p);
-                return (-ENOMEM);
-        }
+        *pp = p;
+        return (0);
+}
 
-        /* if we were using the _contig_ registration variant we would have
-         * an array of PhysAddr/Length pairs, but the discontiguous variant
-         * just takes the PhysAddr */
-        for (i = 0; i < npages; i++)
-                phys_pages[i] = kibnal_page2phys(p->ibp_pages[i]);
-
-        frc = iibt_register_physical_memory(kibnal_data.kib_hca,
-                                            0,          /* requested vaddr */
-                                            phys_pages, npages,
-                                            0,          /* offset */
-                                            kibnal_data.kib_pd,
-                                            access,
-                                            &p->ibp_handle, &p->ibp_vaddr,
-                                            &p->ibp_lkey, &p->ibp_rkey);
+int
+kibnal_alloc_tx_descs (void) 
+{
+        int    i;
         
-        LIBCFS_FREE(phys_pages, npages * sizeof(*phys_pages));
+        LIBCFS_ALLOC (kibnal_data.kib_tx_descs,
+                      IBNAL_TX_MSGS() * sizeof(kib_tx_t));
+        if (kibnal_data.kib_tx_descs == NULL)
+                return -ENOMEM;
         
-        if (frc != FSUCCESS) {
-                CERROR ("Error %d mapping %d pages\n", frc, npages);
-                kibnal_free_pages(p);
-                return (-ENOMEM);
+        memset(kibnal_data.kib_tx_descs, 0,
+               IBNAL_TX_MSGS() * sizeof(kib_tx_t));
+
+        for (i = 0; i < IBNAL_TX_MSGS(); i++) {
+                kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
+
+#if IBNAL_USE_FMR
+                LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV *
+                             sizeof(*tx->tx_pages));
+                if (tx->tx_pages == NULL)
+                        return -ENOMEM;
+#else
+                LIBCFS_ALLOC(tx->tx_wrq, 
+                             (1 + IBNAL_MAX_RDMA_FRAGS) * 
+                             sizeof(*tx->tx_wrq));
+                if (tx->tx_wrq == NULL)
+                        return -ENOMEM;
+                
+                LIBCFS_ALLOC(tx->tx_gl, 
+                             (1 + IBNAL_MAX_RDMA_FRAGS) * 
+                             sizeof(*tx->tx_gl));
+                if (tx->tx_gl == NULL)
+                        return -ENOMEM;
+                
+                LIBCFS_ALLOC(tx->tx_rd, 
+                             offsetof(kib_rdma_desc_t, 
+                                      rd_frags[IBNAL_MAX_RDMA_FRAGS]));
+                if (tx->tx_rd == NULL)
+                        return -ENOMEM;
+#endif
         }
 
-        CDEBUG(D_NET, "registered %d pages; handle: %p vaddr "LPX64" "
-                      "lkey %x rkey %x\n", npages, p->ibp_handle,
-                      p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
-        
-        p->ibp_mapped = 1;
-out:
-        *pp = p;
-        return (0);
+        return 0;
+}
+
+void
+kibnal_free_tx_descs (void) 
+{
+        int    i;
+
+        if (kibnal_data.kib_tx_descs == NULL)
+                return;
+
+        for (i = 0; i < IBNAL_TX_MSGS(); i++) {
+                kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
+
+#if IBNAL_USE_FMR
+                if (tx->tx_pages != NULL)
+                        LIBCFS_FREE(tx->tx_pages, LNET_MAX_IOV *
+                                    sizeof(*tx->tx_pages));
+#else
+                if (tx->tx_wrq != NULL)
+                        LIBCFS_FREE(tx->tx_wrq, 
+                                    (1 + IBNAL_MAX_RDMA_FRAGS) * 
+                                    sizeof(*tx->tx_wrq));
+
+                if (tx->tx_gl != NULL)
+                        LIBCFS_FREE(tx->tx_gl, 
+                                    (1 + IBNAL_MAX_RDMA_FRAGS) * 
+                                    sizeof(*tx->tx_gl));
+
+                if (tx->tx_rd != NULL)
+                        LIBCFS_FREE(tx->tx_rd, 
+                                    offsetof(kib_rdma_desc_t, 
+                                             rd_frags[IBNAL_MAX_RDMA_FRAGS]));
+#endif
+        }
+
+        LIBCFS_FREE(kibnal_data.kib_tx_descs,
+                    IBNAL_TX_MSGS() * sizeof(kib_tx_t));
 }
 
-static int
+int
 kibnal_setup_tx_descs (void)
 {
         int           ipage = 0;
         int           page_offset = 0;
-        __u64         vaddr;
-        __u64         vaddr_base;
         struct page  *page;
         kib_tx_t     *tx;
         int           i;
         int           rc;
 
         /* pre-mapped messages are not bigger than 1 page */
-        LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
+        CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
 
         /* No fancy arithmetic when we do the buffer calculations */
-        LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
+        CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
 
-        rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, 
-                                0);
+        rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
+                                IBNAL_TX_MSG_PAGES());
         if (rc != 0)
                 return (rc);
 
-        /* ignored for the whole_mem case */
-        vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
-
-        for (i = 0; i < IBNAL_TX_MSGS; i++) {
+        for (i = 0; i < IBNAL_TX_MSGS(); i++) {
                 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
                 tx = &kibnal_data.kib_tx_descs[i];
 
-                memset (tx, 0, sizeof(*tx));    /* zero flags etc */
-                
+#if IBNAL_USE_FMR
+                /* Allocate an FMR for this TX so it can map src/sink buffers
+                 * for large transfers */
+#endif
                 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
                                             page_offset);
 
-                if (kibnal_whole_mem()) 
-                        tx->tx_vaddr = kibnal_page2phys(page) + 
-                                       page_offset + 
-                                       kibnal_data.kib_md.md_addr;
-                else
-                        tx->tx_vaddr = vaddr;
-
-                tx->tx_mapped = KIB_TX_UNMAPPED;
+                tx->tx_hca_msg = kibnal_data.kib_whole_mem.md_addr +
+                                 kibnal_page2phys(page) + page_offset;
 
                 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", 
-                       i, tx, tx->tx_msg, tx->tx_vaddr);
+                       i, tx, tx->tx_msg, tx->tx_hca_msg);
 
                 list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
 
-                vaddr += IBNAL_MSG_SIZE;
-                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
-
                 page_offset += IBNAL_MSG_SIZE;
                 LASSERT (page_offset <= PAGE_SIZE);
 
                 if (page_offset == PAGE_SIZE) {
                         page_offset = 0;
                         ipage++;
-                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
+                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES());
                 }
         }
         
         return (0);
 }
 
+int
+kibnal_register_all_memory(void)
+{
+        /* CAVEAT EMPTOR: this assumes all physical memory is in 1 contiguous
+         * chunk starting at 0 */
+        struct sysinfo     si;
+        __u64              total;
+        __u64              roundup = (128<<20);     /* round up in big chunks */
+        IB_MR_PHYS_BUFFER  phys;
+        IB_ACCESS_CONTROL  access;
+        FSTATUS            frc;
+        int                rc;
+
+        memset(&access, 0, sizeof(access));
+        access.s.MWBindable = 1;
+        access.s.LocalWrite = 1;
+        access.s.RdmaRead = 1;
+        access.s.RdmaWrite = 1;
+
+        /* XXX we don't bother with first-gen cards */
+        if (kibnal_data.kib_hca_attrs.VendorId == 0xd0b7 && 
+            kibnal_data.kib_hca_attrs.DeviceId == 0x3101) {
+                CERROR("Can't register all memory on first generation HCAs\n");
+                return -EINVAL;
+        }
+
+        si_meminfo(&si);
+        total = ((__u64)si.totalram) * si.mem_unit;
+
+        if (total < ((__u64)max_mapnr) * PAGE_SIZE)
+                total = ((__u64)max_mapnr) * PAGE_SIZE;
+
+        if (total == 0) {
+                CERROR("Can't determine memory size\n");
+                return -ENOMEM;
+        }
+        
+        roundup = (128<<20);
+        total = (total + (roundup - 1)) & ~(roundup - 1);
+
+        phys.PhysAddr = 0;
+        phys.Length = total;
+
+        frc = iibt_register_contig_physical_memory(
+                kibnal_data.kib_hca, 0, &phys, 1, 0,
+                kibnal_data.kib_pd, access,
+                &kibnal_data.kib_whole_mem.md_handle,
+                &kibnal_data.kib_whole_mem.md_addr,
+                &kibnal_data.kib_whole_mem.md_lkey,
+                &kibnal_data.kib_whole_mem.md_rkey);
+
+        if (frc != FSUCCESS) {
+                CERROR("registering physical memory failed: %d\n", frc);
+                return -EIO;
+        }
+
+        CDEBUG(D_WARNING, "registered phys mem from "LPX64" for "LPU64"\n", 
+               phys.PhysAddr, phys.Length);
+
+        return 0;
+}
+
 void
 kibnal_shutdown (lnet_ni_t *ni)
 {
         int   i;
         int   rc;
 
-        LASSERT (ni->ni_data == &kibnal_data);
         LASSERT (ni == kibnal_data.kib_ni);
+        LASSERT (ni->ni_data == &kibnal_data);
        
         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
                atomic_read (&libcfs_kmemory));
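
The kibnal_register_all_memory() code added above rounds the detected RAM size up to the next 128 MB boundary with the usual power-of-two mask. A standalone sketch of that arithmetic (not part of the patch; plain C, with stdint types standing in for the kernel's __u64):

    #include <stdint.h>
    #include <stdio.h>

    /* Round 'total' up to the next multiple of 'chunk'; 'chunk' must be a
     * power of two, as 128 << 20 is in kibnal_register_all_memory(). */
    static uint64_t
    round_up_pow2(uint64_t total, uint64_t chunk)
    {
            return (total + (chunk - 1)) & ~(chunk - 1);
    }

    int
    main(void)
    {
            uint64_t chunk = 128ULL << 20;          /* 128 MB */
            uint64_t ram   = 1023ULL << 20;         /* e.g. 1023 MB reported */

            /* 1023 MB rounds up to 1024 MB; exact multiples pass unchanged */
            printf("%llu -> %llu\n",
                   (unsigned long long)ram,
                   (unsigned long long)round_up_pow2(ram, chunk));
            return 0;
    }

An exact multiple of the chunk size passes through unchanged, and the mask form avoids a 64-bit division.
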
@@ -1159,16 +1623,16 @@ kibnal_shutdown (lnet_ni_t *ni)
                 LBUG();
 
         case IBNAL_INIT_ALL:
-                /* resetting my NID to unadvertises me, removes my
-                 * listener and nukes all current peers */
-                kibnal_set_mynid (LNET_NID_ANY);
+                /* stop accepting connections, prevent new peers and start to
+                 * tear down all existing ones... */
+                kibnal_stop_listener(1);
 
-                /* Wait for all peer state to clean up (crazy) */
+                /* Wait for all peer state to clean up */
                 i = 2;
                 while (atomic_read (&kibnal_data.kib_npeers) != 0) {
                         i++;
                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
-                               "waiting for %d peers to disconnect (can take a few seconds)\n",
+                               "waiting for %d peers to disconnect\n",
                                atomic_read (&kibnal_data.kib_npeers));
                         set_current_state (TASK_UNINTERRUPTIBLE);
                         schedule_timeout (HZ);
@@ -1185,21 +1649,12 @@ kibnal_shutdown (lnet_ni_t *ni)
                 kibnal_free_pages (kibnal_data.kib_tx_pages);
                 /* fall through */
 
-        case IBNAL_INIT_MR:
-                if (kibnal_data.kib_md.md_handle != NULL) {
-                        rc = iibt_deregister_memory(kibnal_data.kib_md.md_handle);
-                        if (rc != FSUCCESS)
-                                CERROR ("Deregister memory: %d\n", rc);
-                }
+        case IBNAL_INIT_MD:
+                rc = iibt_deregister_memory(kibnal_data.kib_whole_mem.md_handle);
+                if (rc != FSUCCESS)
+                        CERROR ("Deregister memory: %d\n", rc);
                 /* fall through */
 
-#if IBNAL_FMR
-        case IBNAL_INIT_FMR:
-                rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
-                if (rc != 0)
-                        CERROR ("Destroy FMR pool error: %d\n", rc);
-                /* fall through */
-#endif
         case IBNAL_INIT_PD:
                 rc = iibt_pd_free(kibnal_data.kib_pd);
                 if (rc != 0)
@@ -1212,10 +1667,6 @@ kibnal_shutdown (lnet_ni_t *ni)
                         CERROR ("Deregister SD error: %d\n", rc);
                 /* fall through */
 
-        case IBNAL_INIT_PORT:
-                /* XXX ??? */
-                /* fall through */
-
         case IBNAL_INIT_PORTATTRS:
                 LIBCFS_FREE(kibnal_data.kib_hca_attrs.PortAttributesList,
                             kibnal_data.kib_hca_attrs.PortAttributesListSize);
@@ -1228,16 +1679,13 @@ kibnal_shutdown (lnet_ni_t *ni)
                 /* fall through */
 
         case IBNAL_INIT_DATA:
-                /* Module refcount only gets to zero when all peers
-                 * have been closed so all lists must be empty */
                 LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
                 LASSERT (kibnal_data.kib_peers != NULL);
                 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
                         LASSERT (list_empty (&kibnal_data.kib_peers[i]));
                 }
                 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
-                LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
-                LASSERT (list_empty (&kibnal_data.kib_sched_txq));
+                LASSERT (list_empty (&kibnal_data.kib_connd_zombies));
                 LASSERT (list_empty (&kibnal_data.kib_connd_conns));
                 LASSERT (list_empty (&kibnal_data.kib_connd_peers));
 
@@ -1261,9 +1709,7 @@ kibnal_shutdown (lnet_ni_t *ni)
                 break;
         }
 
-        if (kibnal_data.kib_tx_descs != NULL)
-                LIBCFS_FREE (kibnal_data.kib_tx_descs,
-                             IBNAL_TX_MSGS * sizeof(kib_tx_t));
+        kibnal_free_tx_descs();
 
         if (kibnal_data.kib_peers != NULL)
                 LIBCFS_FREE (kibnal_data.kib_peers,
@@ -1277,32 +1723,19 @@ kibnal_shutdown (lnet_ni_t *ni)
         PORTAL_MODULE_UNUSE;
 }
 
-#define roundup_power(val, power) \
-        ( (val + (__u64)(power - 1)) & ~((__u64)(power - 1)) )
-
-/* this isn't very portable or sturdy in the face of funny mem/bus configs */
-static __u64 max_phys_mem(IB_CA_ATTRIBUTES *ca_attr)
-{
-        struct sysinfo si;
-        __u64 ret;
-
-        /* XXX we don't bother with first-gen cards */
-        if (ca_attr->VendorId == 0xd0b7 && ca_attr->DeviceId == 0x3101)
-                return 0ULL;
-
-        si_meminfo(&si);
-        ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit;
-        return roundup_power(ret, 128 * 1024 * 1024);
-} 
-#undef roundup_power
-
 int
 kibnal_startup (lnet_ni_t *ni)
 {
+        char                ipif_name[32];
+        __u32               ip;
+        __u32               netmask;
+        int                 up;
+        int                 nob;
+        struct timeval      tv;
         IB_PORT_ATTRIBUTES *pattr;
         FSTATUS             frc;
         int                 rc;
-        int                 n;
+        __u32               n;
         int                 i;
 
         LASSERT (ni->ni_lnd == &the_kiblnd);
@@ -1313,37 +1746,81 @@ kibnal_startup (lnet_ni_t *ni)
                 return -EPERM;
         }
 
-        if (ni->ni_interfaces[0] != NULL) {
-                CERROR("Explicit interface config not supported\n");
-                return -EPERM;
-        }
-        
         if (IBNAL_CREDITS > IBNAL_NTX) {
                 CERROR ("Can't set credits(%d) > ntx(%d)\n",
                         IBNAL_CREDITS, IBNAL_NTX);
                 return -EINVAL;
         }
         
-        ni->ni_maxtxcredits = IBNAL_CREDITS;
-        ni->ni_peertxcredits = IBNAL_PEERCREDITS;
+        if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) {
+                CERROR ("Can't set credits(%d) > ntx(%d)\n",
+                        *kibnal_tunables.kib_credits,
+                        *kibnal_tunables.kib_ntx);
+                return -EINVAL;
+        }
+
+        ni->ni_maxtxcredits = *kibnal_tunables.kib_credits;
+        ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits;
+
+        CLASSERT (LNET_MAX_INTERFACES > 1);
+
+        if (ni->ni_interfaces[0] == NULL) {
+                kibnal_data.kib_hca_idx = 0;
+        } else {
+                /* Use the HCA specified in 'networks=' */
+                if (ni->ni_interfaces[1] != NULL) {
+                        CERROR("Multiple interfaces not supported\n");
+                        return -EPERM;
+                }
+                
+                /* Parse <number> into kib_hca_idx */
+                nob = strlen(ni->ni_interfaces[0]);
+                if (sscanf(ni->ni_interfaces[0], "%d%n", 
+                           &kibnal_data.kib_hca_idx, &nob) < 1 ||
+                    nob != strlen(ni->ni_interfaces[0])) {
+                        CERROR("Can't parse interface '%s'\n",
+                               ni->ni_interfaces[0]);
+                        return -EINVAL;
+                }
+        }
+        
+        /* Find IP address from <ipif base name><number> */
+        snprintf(ipif_name, sizeof(ipif_name), "%s%d",
+                 *kibnal_tunables.kib_ipif_basename, kibnal_data.kib_hca_idx);
+        if (strlen(ipif_name) == sizeof(ipif_name) - 1) {
+                CERROR("IPoIB interface name %s truncated\n", ipif_name);
+                return -EINVAL;
+        }
+        
+        rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask);
+        if (rc != 0) {
+                CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc);
+                return -ENETDOWN;
+        }
+        
+        if (!up) {
+                CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name);
+                return -ENETDOWN;
+        }
+        
+        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip);
 
         ni->ni_data = &kibnal_data;
         kibnal_data.kib_ni = ni;
 
+        do_gettimeofday(&tv);
+        kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
         frc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2, 
                                        &kibnal_data.kib_interfaces);
         if (frc != FSUCCESS) {
                 CERROR("IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2) = %d\n",
-                        frc);
+                       frc);
                 return -ENOSYS;
         }
 
         PORTAL_MODULE_USE;
 
-        init_MUTEX (&kibnal_data.kib_nid_mutex);
-        init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal);
-        kibnal_data.kib_ni->ni_nid = LNET_NID_ANY;
-
         rwlock_init(&kibnal_data.kib_global_lock);
 
         kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
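
The 'networks=' parsing added above uses sscanf()'s %n conversion count to make sure the interface string is nothing but a decimal HCA index. The same idiom in isolation (hypothetical helper name, plain C):

    #include <stdio.h>
    #include <string.h>

    /* Accept only a string that is entirely a decimal number, as
     * kibnal_startup() requires of ni->ni_interfaces[0]. */
    static int
    parse_hca_index(const char *s, int *idx)
    {
            int nob = 0;

            if (sscanf(s, "%d%n", idx, &nob) < 1 ||
                nob != (int)strlen(s))
                    return -1;          /* empty string or trailing junk */
            return 0;
    }

    int
    main(void)
    {
            int idx;

            printf("\"2\"  -> %d\n", parse_hca_index("2", &idx));   /* ok:  0 */
            printf("\"2x\" -> %d\n", parse_hca_index("2x", &idx));  /* bad: -1 */
            return 0;
    }

Without the %n/strlen comparison, sscanf() would happily accept strings such as "2foo".
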
@@ -1358,20 +1835,18 @@ kibnal_startup (lnet_ni_t *ni)
         spin_lock_init (&kibnal_data.kib_connd_lock);
         INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
         INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
+        INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies);
         init_waitqueue_head (&kibnal_data.kib_connd_waitq);
 
         spin_lock_init (&kibnal_data.kib_sched_lock);
-        INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
-        INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
         init_waitqueue_head (&kibnal_data.kib_sched_waitq);
 
         spin_lock_init (&kibnal_data.kib_tx_lock);
         INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
 
-        LIBCFS_ALLOC (kibnal_data.kib_tx_descs,
-                      IBNAL_TX_MSGS * sizeof(kib_tx_t));
-        if (kibnal_data.kib_tx_descs == NULL) {
-                CERROR ("Can't allocate tx descs\n");
+        rc = kibnal_alloc_tx_descs();
+        if (rc != 0) {
+                CERROR("Can't allocate tx descs\n");
                 goto failed;
         }
 
@@ -1379,6 +1854,10 @@ kibnal_startup (lnet_ni_t *ni)
         kibnal_data.kib_init = IBNAL_INIT_DATA;
         /*****************************************************/
 
+        kibnal_data.kib_sdretry.RetryCount = *kibnal_tunables.kib_sd_retries;
+        kibnal_data.kib_sdretry.Timeout = (*kibnal_tunables.kib_timeout * 1000)/
+                                          *kibnal_tunables.kib_sd_retries;
+
         for (i = 0; i < IBNAL_N_SCHED; i++) {
                 rc = kibnal_thread_start (kibnal_scheduler, (void *)i);
                 if (rc != 0) {
@@ -1398,22 +1877,30 @@ kibnal_startup (lnet_ni_t *ni)
             sizeof(kibnal_data.kib_hca_guids[0]);
         frc = iibt_get_hca_guids(&n, kibnal_data.kib_hca_guids);
         if (frc != FSUCCESS) {
-                CERROR ("Can't get channel adapter guids: %d\n", frc);
+                CERROR ("Can't get HCA guids: %d\n", frc);
                 goto failed;
         }
+
         if (n == 0) {
-                CERROR ("No channel adapters found\n");
+                CERROR ("No HCAs found\n");
                 goto failed;
         }
 
-        /* Infinicon has per-HCA rather than per CQ completion handlers */
-        frc = iibt_open_hca(kibnal_data.kib_hca_guids[0],
-                            kibnal_ca_callback,
-                            kibnal_ca_async_callback,
-                            &kibnal_data.kib_hca,
+        if (n <= kibnal_data.kib_hca_idx) {
+                CERROR("Invalid HCA %d requested: (must be 0 - %d inclusive)\n",
+                       kibnal_data.kib_hca_idx, n - 1);
+                goto failed;
+        }
+        
+        /* Infinicon has per-HCA notification callbacks */
+        frc = iibt_open_hca(kibnal_data.kib_hca_guids[kibnal_data.kib_hca_idx],
+                            kibnal_hca_callback,
+                            kibnal_hca_async_callback,
+                            NULL,
                             &kibnal_data.kib_hca);
         if (frc != FSUCCESS) {
-                CERROR ("Can't open CA[0]: %d\n", frc);
+                CERROR ("Can't open HCA[%d]: %d\n", 
+                        kibnal_data.kib_hca_idx, frc);
                 goto failed;
         }
         
@@ -1442,7 +1929,8 @@ kibnal_startup (lnet_ni_t *ni)
         frc = iibt_query_hca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs,
                              NULL);
         if (frc != FSUCCESS) {
-                CERROR ("Can't get port attrs for CA 0: %d\n", frc);
+                CERROR ("Can't get port attrs for HCA %d: %d\n",
+                        kibnal_data.kib_hca_idx, frc);
                 goto failed;
         }
 
@@ -1481,10 +1969,6 @@ kibnal_startup (lnet_ni_t *ni)
 
         CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid);
         
-        /* Active port found */
-        kibnal_data.kib_init = IBNAL_INIT_PORT;
-        /*****************************************************/
-
         frc = iibt_sd_register(&kibnal_data.kib_sd, NULL);
         if (frc != FSUCCESS) {
                 CERROR ("Can't register with SD: %d\n", frc);
@@ -1505,73 +1989,14 @@ kibnal_startup (lnet_ni_t *ni)
         kibnal_data.kib_init = IBNAL_INIT_PD;
         /*****************************************************/
 
-#if IBNAL_FMR
-        {
-                const int pool_size = IBNAL_NTX;
-                struct ib_fmr_pool_param params = {
-                        .max_pages_per_fmr = LNET_MTU/PAGE_SIZE,
-                        .access            = (IB_ACCESS_LOCAL_WRITE |
-                                              IB_ACCESS_REMOTE_WRITE |
-                                              IB_ACCESS_REMOTE_READ),
-                        .pool_size         = pool_size,
-                        .dirty_watermark   = (pool_size * 3)/4,
-                        .flush_function    = NULL,
-                        .flush_arg         = NULL,
-                        .cache             = 1,
-                };
-                rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params,
-                                        &kibnal_data.kib_fmr_pool);
-                if (rc != 0) {
-                        CERROR ("Can't create FMR pool size %d: %d\n", 
-                                pool_size, rc);
-                        goto failed;
-                }
-        }
-
-        /* flag FMR pool initialised */
-        kibnal_data.kib_init = IBNAL_INIT_FMR;
-#endif
-        /*****************************************************/
-        if (IBNAL_WHOLE_MEM) {
-                IB_MR_PHYS_BUFFER phys;
-                IB_ACCESS_CONTROL access;
-                kib_md_t *md = &kibnal_data.kib_md;
-
-                memset(&access, 0, sizeof(access));
-                access.s.MWBindable = 1;
-                access.s.LocalWrite = 1;
-                access.s.RdmaRead = 1;
-                access.s.RdmaWrite = 1;
-
-                phys.PhysAddr = 0;
-                phys.Length = max_phys_mem(&kibnal_data.kib_hca_attrs);
-                if (phys.Length == 0) {
-                        CERROR ("couldn't determine the end of phys mem\n");
-                        goto failed;
-                }
-       
-                rc = iibt_register_contig_physical_memory(kibnal_data.kib_hca,
-                                                          0,
-                                                          &phys, 1,
-                                                          0,
-                                                          kibnal_data.kib_pd,
-                                                          access,
-                                                          &md->md_handle,
-                                                          &md->md_addr,
-                                                          &md->md_lkey,
-                                                          &md->md_rkey);
-                if (rc != FSUCCESS) {
-                        CERROR("registering physical memory failed: %d\n", 
-                               rc);
-                        CERROR("falling back to registration per-rdma\n");
-                        md->md_handle = NULL;
-                } else {
-                        CDEBUG(D_NET, "registered "LPU64" bytes of mem\n",
-                               phys.Length);
-                        kibnal_data.kib_init = IBNAL_INIT_MR;
-                }
+        rc = kibnal_register_all_memory();
+        if (rc != 0) {
+                CERROR ("Can't register all memory\n");
+                goto failed;
         }
-
+        
+        /* flag whole memory MD initialised */
+        kibnal_data.kib_init = IBNAL_INIT_MD;
         /*****************************************************/
 
         rc = kibnal_setup_tx_descs();
@@ -1584,33 +2009,36 @@ kibnal_startup (lnet_ni_t *ni)
         kibnal_data.kib_init = IBNAL_INIT_TXD;
         /*****************************************************/
         
-        {
-                uint32 nentries;
-
-                frc = iibt_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
-                                     &kibnal_data.kib_cq, &kibnal_data.kib_cq,
-                                     &nentries);
-                if (frc != FSUCCESS) {
-                        CERROR ("Can't create RX CQ: %d\n", frc);
-                        goto failed;
-                }
-
-                /* flag CQ initialised */
-                kibnal_data.kib_init = IBNAL_INIT_CQ;
+        frc = iibt_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(),
+                             &kibnal_data.kib_cq, &kibnal_data.kib_cq,
+                             &n);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't create RX CQ: %d\n", frc);
+                goto failed;
+        }
 
-                if (nentries < IBNAL_CQ_ENTRIES) {
-                        CERROR ("CQ only has %d entries, need %d\n", 
-                                nentries, IBNAL_CQ_ENTRIES);
-                        goto failed;
-                }
+        /* flag CQ initialised */
+        kibnal_data.kib_init = IBNAL_INIT_CQ;
+        /*****************************************************/
+        
+        if (n < IBNAL_CQ_ENTRIES()) {
+                CERROR ("CQ only has %d entries: %d needed\n", 
+                        n, IBNAL_CQ_ENTRIES());
+                goto failed;
+        }
 
-                rc = iibt_cq_rearm(kibnal_data.kib_cq, CQEventSelNextWC);
-                if (rc != 0) {
-                        CERROR ("Failed to re-arm completion queue: %d\n", rc);
-                        goto failed;
-                }
+        rc = iibt_cq_rearm(kibnal_data.kib_cq, CQEventSelNextWC);
+        if (rc != 0) {
+                CERROR ("Failed to re-arm completion queue: %d\n", rc);
+                goto failed;
         }
         
+        rc = kibnal_start_listener();
+        if (rc != 0) {
+                CERROR("Can't start listener: %d\n", rc);
+                goto failed;
+        }
+
         /* flag everything initialised */
         kibnal_data.kib_init = IBNAL_INIT_ALL;
         /*****************************************************/
@@ -1625,11 +2053,8 @@ kibnal_startup (lnet_ni_t *ni)
 void __exit
 kibnal_module_fini (void)
 {
-#ifdef CONFIG_SYSCTL
-        if (kibnal_tunables.kib_sysctl != NULL)
-                unregister_sysctl_table (kibnal_tunables.kib_sysctl);
-#endif
         lnet_unregister_lnd(&the_kiblnd);
+        kibnal_tunables_fini();
 }
 
 int __init
@@ -1642,32 +2067,17 @@ kibnal_module_init (void)
                 return -ENODEV;
         }
         
-        if (sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN) {
-                CERROR("sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN\n");
-                return -EINVAL;
-        }
-
-        /* the following must be sizeof(int) for proc_dointvec() */
-        if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) {
-                CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n");
-                return -EINVAL;
-        }
-
-        /* Initialise dynamic tunables to defaults once only */
-        kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
+        rc = kibnal_tunables_init();
+        if (rc != 0)
+                return rc;
 
         lnet_register_lnd(&the_kiblnd);
-        
-#ifdef CONFIG_SYSCTL
-        /* Press on regardless even if registering sysctl doesn't work */
-        kibnal_tunables.kib_sysctl = 
-                register_sysctl_table (kibnal_top_ctl_table, 0);
-#endif
-        return (0);
+
+        return 0;
 }
 
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Kernel Infinicon IB NAL v0.01");
+MODULE_DESCRIPTION("Kernel Infinicon IB LND v1.00");
 MODULE_LICENSE("GPL");
 
 module_init(kibnal_module_init);
index 6585f51..6d796b9 100644 (file)
 #error Invalid GCC version. Must use GCC >= 3.2.3
 #endif
 
-#define IBNAL_SERVICE_NAME   "iibnal"
-#define IBNAL_SERVICE_NUMBER 0x11b9a1
-
 #if CONFIG_SMP
 # define IBNAL_N_SCHED      num_online_cpus()   /* # schedulers */
 #else
 # define IBNAL_N_SCHED      1                   /* # schedulers */
 #endif
 
-#define IBNAL_MIN_RECONNECT_INTERVAL HZ         /* first failed connection retry... */
-#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ)    /* ...exponentially increasing to this */
-
-#define IBNAL_MSG_SIZE       (4<<10)            /* max size of queued messages (inc hdr) */
-
-#define IBNAL_MSG_QUEUE_SIZE   8                /* # messages/RDMAs in-flight */
-#define IBNAL_CREDIT_HIGHWATER 7                /* when to eagerly return credits */
+#define IBNAL_FMR                    0          /* map on demand v. use whole mem mapping */
+
+/* defaults for modparams/tunables */
+#define IBNAL_IPIF_BASENAME          "ib"       /* IPoIB interface basename */
+#define IBNAL_SERVICE_NAME           "iibnal"   /* global service name */
+#define IBNAL_SERVICE_NUMBER         0x11b9a2   /* global service number */
+#define IBNAL_MIN_RECONNECT_INTERVAL 1          /* first failed connection retry... */
+#define IBNAL_MAX_RECONNECT_INTERVAL 60         /* ...exponentially increasing to this */
+#define IBNAL_CONCURRENT_PEERS       1152       /* # nodes all talking at once to me */
+#define IBNAL_CKSUM                  0          /* checksum kib_msg_t? */
+#define IBNAL_TIMEOUT                50         /* default comms timeout (seconds) */
+#define IBNAL_NTX                    256        /* # tx descs */
+#define IBNAL_CREDITS                128        /* # concurrent sends */
+#define IBNAL_PEERCREDITS            8          /* # concurrent sends to 1 peer */
+#define IBNAL_SD_RETRIES             8          /* # times to retry SD reqs */
+#define IBNAL_LISTEN_BACKLOG         64         /* listener backlog */
+
+/* tunables fixed at compile time */
+#define IBNAL_PEER_HASH_SIZE         101        /* # peer lists */
+#define IBNAL_RESCHED                100        /* # scheduler loops before reschedule */
+#define IBNAL_MSG_QUEUE_SIZE         8          /* # messages/RDMAs in-flight */
+#define IBNAL_CREDIT_HIGHWATER       7          /* when to eagerly return credits */
+#define IBNAL_MSG_SIZE              (4<<10)     /* max size of queued messages (inc hdr) */
+#define IBNAL_RDMA_BASE              0x0eeb0000
+#define IBNAL_STARTING_PSN           1
+
+/* QP tunables */
 /* 7 indicates infinite retry attempts, Infinicon recommended 5 */
-#define IBNAL_RETRY            5                /* # times to retry */
-#define IBNAL_RNR_RETRY        5                /*  */
-#define IBNAL_CM_RETRY         5                /* # times to retry connection */
-#define IBNAL_FLOW_CONTROL     1
-#define IBNAL_ACK_TIMEOUT       20              /* supposedly 4 secs */
-
-#define IBNAL_NTX             64                /* # tx descs */
-/* this had to be dropped down so that we only register < 255 pages per
- * region.  this will change if we register all memory. */
-#define IBNAL_CREDITS         32
-#define IBNAL_PEERCREDITS     8
-
-#define IBNAL_PEER_HASH_SIZE  101               /* # peer lists */
-
-#define IBNAL_RESCHED         100               /* # scheduler loops before reschedule */
-
-#define IBNAL_CONCURRENT_PEERS 1000             /* # nodes all talking at once to me */
-
-/* default vals for runtime tunables */
-#define IBNAL_IO_TIMEOUT      50                /* default comms timeout (seconds) */
+#define IBNAL_RETRY                  5          /* # times to retry */
+#define IBNAL_RNR_RETRY              5          /* # RNR (receiver-not-ready) retries */
+#define IBNAL_CM_RETRY               5          /* # times to retry connection */
+#define IBNAL_FLOW_CONTROL           1
+#define IBNAL_ACK_TIMEOUT            20         /* supposedly 4 secs */
+#define IBNAL_EE_FLOW                1
+#define IBNAL_LOCAL_SUB              1
+#define IBNAL_FAILOVER_ACCEPTED      0
 
 /************************/
 /* derived constants... */
 
 /* TX messages (shared by all connections) */
-#define IBNAL_TX_MSGS       IBNAL_NTX
-#define IBNAL_TX_MSG_BYTES  (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
-#define IBNAL_TX_MSG_PAGES  ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+#define IBNAL_TX_MSGS()       (*kibnal_tunables.kib_ntx)
+#define IBNAL_TX_MSG_BYTES()  (IBNAL_TX_MSGS() * IBNAL_MSG_SIZE)
+#define IBNAL_TX_MSG_PAGES()  ((IBNAL_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE)
 
-#define IBNAL_TX_MAX_SG (LNET_MAX_IOV + 1)
+#if IBNAL_USE_FMR
+# define IBNAL_MAX_RDMA_FRAGS 1
+#else
+# define IBNAL_MAX_RDMA_FRAGS LNET_MAX_IOV
+#endif
 
 /* RX messages (per connection) */
-#define IBNAL_RX_MSGS       IBNAL_MSG_QUEUE_SIZE
-#define IBNAL_RX_MSG_BYTES  (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
-#define IBNAL_RX_MSG_PAGES  ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+#define IBNAL_RX_MSGS         IBNAL_MSG_QUEUE_SIZE
+#define IBNAL_RX_MSG_BYTES    (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_RX_MSG_PAGES    ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
 
-
-/* we may have up to 2 completions per transmit +
-   1 completion per receive, per connection */
-#define IBNAL_CQ_ENTRIES  ((2*IBNAL_TX_MSGS) +                          \
-                           (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS))
-
-#define IBNAL_RDMA_BASE  0x0eeb0000
-#define IBNAL_FMR        0
-#define IBNAL_WHOLE_MEM  1
-#define IBNAL_CKSUM      0
-//#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_PROCESS
-#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_INTERRUPT
-
-/* XXX I have no idea. */
-#define IBNAL_STARTING_PSN 1
+#define IBNAL_CQ_ENTRIES()  (IBNAL_TX_MSGS() * (1 + IBNAL_MAX_RDMA_FRAGS) +             \
+                             (IBNAL_RX_MSGS * *kibnal_tunables.kib_concurrent_peers))
 
 typedef struct
 {
-        int               kib_io_timeout;       /* comms timeout (seconds) */
-        int              *kib_min_reconnect_interval; /* first failed connection retry... */
-        int              *kib_max_reconnect_interval; /* ...exponentially increasing to this */
+        char            **kib_hca_basename;     /* HCA base name */
+        char            **kib_ipif_basename;    /* IPoIB interface base name */
+        char            **kib_service_name;     /* global service name */
+        unsigned int     *kib_service_number;   /* global service number */
+        int              *kib_min_reconnect_interval; /* min connect retry seconds... */
+        int              *kib_max_reconnect_interval; /* max connect retry seconds */
+        int              *kib_concurrent_peers; /* max # peers */
+        int              *kib_cksum;            /* checksum kib_msg_t? */
+        int              *kib_timeout;          /* comms timeout (seconds) */
+        int              *kib_ntx;              /* # tx descs */
+        int              *kib_credits;          /* # concurrent sends */
+        int              *kib_peercredits;      /* # concurrent sends to 1 peer */
+        int              *kib_sd_retries;       /* # times to retry SD reqs */
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
         struct ctl_table_header *kib_sysctl;    /* sysctl interface */
+#endif
 } kib_tunables_t;
 
-/* some of these have specific types in the stack that just map back
- * to the uFOO types, like IB_{L,R}_KEY. */
+/* NB The Infinicon stack has specific typedefs for some things
+ * (e.g. IB_{L,R}_KEY), that just map back to __u32 etc */
 typedef struct
 {
         int               ibp_npages;           /* # pages */
-        int               ibp_mapped;           /* mapped? */
-        __u64             ibp_vaddr;            /* mapped region vaddr */
-        __u32             ibp_lkey;             /* mapped region lkey */
-        __u32             ibp_rkey;             /* mapped region rkey */
-        IB_HANDLE         ibp_handle;           /* mapped region handle */
         struct page      *ibp_pages[0];
 } kib_pages_t;
 
@@ -172,31 +175,29 @@ typedef struct
         __u64             kib_incarnation;      /* which one am I */
         int               kib_shutdown;         /* shut down? */
         atomic_t          kib_nthreads;         /* # live threads */
-        lnet_ni_t        *kib_ni;               /* _the_ nal instance */
+        lnet_ni_t        *kib_ni;               /* _the_ iib instance */
 
-        __u64             kib_service_id;       /* service number I listen on */
         __u64             kib_port_guid;        /* my GUID (lo 64 of GID)*/
         __u16             kib_port_pkey;        /* my pkey, whatever that is */
-        struct semaphore  kib_nid_mutex;        /* serialise NID ops */
-        struct semaphore  kib_nid_signal;       /* signal completion */
-        IB_HANDLE         kib_cep;              /* connection end point */
+        struct semaphore  kib_listener_signal;  /* signal completion */
+        IB_HANDLE         kib_listener_cep;     /* connection end point */
 
         rwlock_t          kib_global_lock;      /* stabilize peer/conn ops */
+        int               kib_ready;            /* CQ callback fired */
+        int               kib_checking_cq;      /* a scheduler is checking the CQ */
 
         struct list_head *kib_peers;            /* hash table of all my known peers */
         int               kib_peer_hash_size;   /* size of kib_peers */
         atomic_t          kib_npeers;           /* # peers extant */
         atomic_t          kib_nconns;           /* # connections extant */
 
+        struct list_head  kib_connd_zombies;    /* connections to free */
         struct list_head  kib_connd_conns;      /* connections to progress */
         struct list_head  kib_connd_peers;      /* peers waiting for a connection */
-        wait_queue_head_t kib_connd_waitq;      /* connection daemons sleep here */
-        unsigned long     kib_connd_waketime;   /* when connd will wake */
+        wait_queue_head_t kib_connd_waitq;      /* connection daemon sleeps here */
         spinlock_t        kib_connd_lock;       /* serialise */
 
         wait_queue_head_t kib_sched_waitq;      /* schedulers sleep here */
-        struct list_head  kib_sched_txq;        /* tx requiring attention */
-        struct list_head  kib_sched_rxq;        /* rx requiring attention */
         spinlock_t        kib_sched_lock;       /* serialise */
 
         struct kib_tx    *kib_tx_descs;         /* all the tx descriptors */
@@ -211,15 +212,15 @@ typedef struct
         IB_HANDLE         kib_pd;               /* protection domain */
         IB_HANDLE         kib_sd;               /* SD handle */
         IB_HANDLE         kib_cq;               /* completion queue */
-        kib_md_t          kib_md;               /* full-mem registration */
-
-        void             *kib_listen_handle;    /* where I listen for connections */
+        kib_md_t          kib_whole_mem;        /* whole-mem registration */
 
         IBT_INTERFACE_UNION kib_interfaces;     /* The Infinicon IBT interface */
 
+        int                 kib_hca_idx;        /* my HCA number */
         uint64              kib_hca_guids[8];   /* all the HCA guids */
         IB_CA_ATTRIBUTES    kib_hca_attrs;      /* where to get HCA attrs */
-        FABRIC_OPERATION_DATA kib_fabopdata;    /* (un)advertise service record */
+
+        COMMAND_CONTROL_PARAMETERS kib_sdretry; /* control SD query retries */
 } kib_data_t;
 
 #define IBNAL_INIT_NOTHING         0
@@ -227,14 +228,12 @@ typedef struct
 #define IBNAL_INIT_LIB             2
 #define IBNAL_INIT_HCA             3
 #define IBNAL_INIT_PORTATTRS       4
-#define IBNAL_INIT_PORT            5
-#define IBNAL_INIT_SD              6
-#define IBNAL_INIT_PD              7
-#define IBNAL_INIT_FMR             8
-#define IBNAL_INIT_MR              9
-#define IBNAL_INIT_TXD             10
-#define IBNAL_INIT_CQ              11
-#define IBNAL_INIT_ALL             12
+#define IBNAL_INIT_SD              5
+#define IBNAL_INIT_PD              6
+#define IBNAL_INIT_MD              7
+#define IBNAL_INIT_TXD             8
+#define IBNAL_INIT_CQ              9
+#define IBNAL_INIT_ALL             10
 
 /************************************************************************
  * Wire message structs.
@@ -243,13 +242,12 @@ typedef struct
  * private data and SM service info), is LE on the wire.
  */
 
-/* also kib_md_t above */
-
-typedef struct
+typedef struct kib_connparams
 {
-        __u32                 rd_nob;           /* # of bytes */
-        __u64                 rd_addr;          /* remote io vaddr */
-} WIRE_ATTR kib_rdma_desc_t;
+        __u32             ibcp_queue_depth;
+        __u32             ibcp_max_msg_size;
+        __u32             ibcp_max_frags;
+} WIRE_ATTR kib_connparams_t;
 
 typedef struct
 {
@@ -257,21 +255,47 @@ typedef struct
         char              ibim_payload[0];      /* piggy-backed payload */
 } WIRE_ATTR kib_immediate_msg_t;
 
-/* these arrays serve two purposes during rdma.  they are built on the passive
- * side and sent to the active side as remote arguments.  On the active side
- * the descs are used as a data structure on the way to local gather items.
- * the different roles result in split local/remote meaning of desc->rd_key */
+#if IBNAL_USE_FMR
 typedef struct
 {
-        lnet_hdr_t        ibrm_hdr;             /* portals header */
-        __u64             ibrm_cookie;          /* opaque completion cookie */
-        __u32             ibrm_num_descs;       /* how many descs */
-        __u32             rd_key;               /* remote key */
-        kib_rdma_desc_t   ibrm_desc[0];         /* where to suck/blow */
-} WIRE_ATTR kib_rdma_msg_t;
+       __u64             rd_addr;              /* IO VMA address */
+       __u32             rd_nob;               /* # of bytes */
+       __u32             rd_key;               /* remote key */
+} WIRE_ATTR kib_rdma_desc_t;
+#else
+typedef struct
+{
+        __u32             rf_nob;               /* # of bytes */
+        __u64             rf_addr;              /* remote io vaddr */
+} WIRE_ATTR kib_rdma_frag_t;
+
+typedef struct
+{
+        __u32             rd_key;               /* local/remote key */
+        __u32             rd_nfrag;             /* # fragments */
+        kib_rdma_frag_t   rd_frags[0];          /* buffer frags */
+} WIRE_ATTR kib_rdma_desc_t;
+#endif
 
-#define kib_rdma_msg_len(num_descs) \
-        offsetof(kib_msg_t, ibm_u.rdma.ibrm_desc[num_descs])
+typedef struct
+{
+        lnet_hdr_t        ibprm_hdr;            /* LNET header */
+        __u64             ibprm_cookie;         /* opaque completion cookie */
+} WIRE_ATTR kib_putreq_msg_t;
+
+typedef struct
+{
+        __u64             ibpam_src_cookie;     /* reflected completion cookie */
+        __u64             ibpam_dst_cookie;     /* opaque completion cookie */
+        kib_rdma_desc_t   ibpam_rd;             /* sender's sink buffer */
+} WIRE_ATTR kib_putack_msg_t;
+
+typedef struct
+{
+        lnet_hdr_t        ibgm_hdr;             /* LNET header */
+        __u64             ibgm_cookie;          /* opaque completion cookie */
+        kib_rdma_desc_t   ibgm_rd;              /* sender's sink buffer */
+} WIRE_ATTR kib_get_msg_t;
 
 typedef struct
 {
@@ -281,17 +305,26 @@ typedef struct
 
 typedef struct
 {
-        __u32              ibm_magic;           /* I'm an openibnal message */
-        __u16              ibm_version;         /* this is my version number */
-        __u8               ibm_type;            /* msg type */
-        __u8               ibm_credits;         /* returned credits */
-#if IBNAL_CKSUM
-        __u32              ibm_nob;
-        __u32              ibm_cksum;
-#endif
+        /* First 2 fields fixed FOR ALL TIME */
+        __u32             ibm_magic;            /* I'm an openibnal message */
+        __u16             ibm_version;          /* this is my version number */
+
+        __u8              ibm_type;             /* msg type */
+        __u8              ibm_credits;          /* returned credits */
+        __u32             ibm_nob;              /* # bytes in whole message */
+        __u32             ibm_cksum;            /* checksum (0 == no checksum) */
+        __u64             ibm_srcnid;           /* sender's NID */
+        __u64             ibm_srcstamp;         /* sender's incarnation */
+        __u64             ibm_dstnid;           /* destination's NID */
+        __u64             ibm_dststamp;         /* destination's incarnation */
+        __u64             ibm_seq;              /* sequence number */
+
         union {
+                kib_connparams_t      connparams;
                 kib_immediate_msg_t   immediate;
-                kib_rdma_msg_t        rdma;
+                kib_putreq_msg_t      putreq;
+                kib_putack_msg_t      putack;
+                kib_get_msg_t         get;
                 kib_completion_msg_t  completion;
         } WIRE_ATTR ibm_u;
 } WIRE_ATTR kib_msg_t;
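
The reworked kib_msg_t above pins ibm_magic and ibm_version as the first two fields so a receiver can sanity-check a message before interpreting the rest. A minimal sketch of that kind of check (hypothetical helper, host byte order only, stdint types in place of __u32/__u16):

    #include <stdint.h>
    #include <stdio.h>

    #define IBNAL_MSG_MAGIC    0x0be91b91
    #define IBNAL_MSG_VERSION  1

    struct msg_hdr {                    /* leading fields of kib_msg_t */
            uint32_t magic;
            uint16_t version;
    };

    /* Return 0 if the header looks like one of ours.  A real unpack routine
     * would typically also recognise the byte-swapped magic and flip the
     * remaining fields accordingly. */
    static int
    msg_hdr_ok(const struct msg_hdr *h)
    {
            return (h->magic == IBNAL_MSG_MAGIC &&
                    h->version == IBNAL_MSG_VERSION) ? 0 : -1;
    }

    int
    main(void)
    {
            struct msg_hdr good = { IBNAL_MSG_MAGIC, IBNAL_MSG_VERSION };
            struct msg_hdr junk = { 0xdeadbeef, 99 };

            printf("good: %d, junk: %d\n", msg_hdr_ok(&good), msg_hdr_ok(&junk));
            return 0;
    }
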
@@ -299,12 +332,16 @@ typedef struct
 #define IBNAL_MSG_MAGIC       0x0be91b91        /* unique magic */
 #define IBNAL_MSG_VERSION              1        /* current protocol version */
 
+#define IBNAL_MSG_CONNREQ           0xc0        /* connection request */
+#define IBNAL_MSG_CONNACK           0xc1        /* connection acknowledge */
 #define IBNAL_MSG_NOOP              0xd0        /* nothing (just credits) */
-#define IBNAL_MSG_IMMEDIATE         0xd1        /* portals hdr + payload */
-#define IBNAL_MSG_PUT_RDMA          0xd2        /* portals PUT hdr + source rdma desc */
-#define IBNAL_MSG_PUT_DONE          0xd3        /* signal PUT rdma completion */
-#define IBNAL_MSG_GET_RDMA          0xd4        /* portals GET hdr + sink rdma desc */
-#define IBNAL_MSG_GET_DONE          0xd5        /* signal GET rdma completion */
+#define IBNAL_MSG_IMMEDIATE         0xd1        /* immediate */
+#define IBNAL_MSG_PUT_REQ           0xd2        /* putreq (src->sink) */
+#define IBNAL_MSG_PUT_NAK           0xd3        /* completion (sink->src) */
+#define IBNAL_MSG_PUT_ACK           0xd4        /* putack (sink->src) */
+#define IBNAL_MSG_PUT_DONE          0xd5        /* completion (src->sink) */
+#define IBNAL_MSG_GET_REQ           0xd6        /* getreq (sink->src) */
+#define IBNAL_MSG_GET_DONE          0xd7        /* completion (src->sink: all OK) */
 
 /***********************************************************************/
 
@@ -312,9 +349,9 @@ typedef struct kib_rx                           /* receive message */
 {
         struct list_head          rx_list;      /* queue for attention */
         struct kib_conn          *rx_conn;      /* owning conn */
-        int                       rx_rdma;      /* RDMA completion posted? */
+        int                       rx_responded; /* responded to peer? */
         int                       rx_nob;       /* # bytes received (-1 while posted) */
-        __u64                     rx_vaddr;     /* pre-mapped buffer (hca vaddr) */
+        __u64                     rx_hca_msg;   /* pre-mapped buffer (hca vaddr) */
         kib_msg_t                *rx_msg;       /* pre-mapped buffer (host vaddr) */
         IB_WORK_REQ               rx_wrq;
         IB_LOCAL_DATASEGMENT      rx_gl;        /* and its memory */
@@ -326,93 +363,68 @@ typedef struct kib_tx                           /* transmit message */
         struct kib_conn          *tx_conn;      /* owning conn */
         int                       tx_mapped;    /* mapped for RDMA? */
         int                       tx_sending;   /* # tx callbacks outstanding */
+        int                       tx_queued;    /* queued for sending */
+        int                       tx_waiting;   /* waiting for peer */
         int                       tx_status;    /* completion status */
         unsigned long             tx_deadline;  /* completion deadline */
-        int                       tx_passive_rdma; /* peer sucks/blows */
-        int                       tx_passive_rdma_wait; /* waiting for peer to complete */
-        __u64                     tx_passive_rdma_cookie; /* completion cookie */
+        __u64                     tx_cookie;    /* completion cookie */
         lnet_msg_t               *tx_lntmsg[2]; /* lnet msgs to finalize on completion */
-        kib_md_t                  tx_md;        /* RDMA mapping (active/passive) */
-        __u64                     tx_vaddr;     /* pre-mapped buffer (hca vaddr) */
         kib_msg_t                *tx_msg;       /* pre-mapped buffer (host vaddr) */
-        int                       tx_nsp;       /* # send work items */
-        IB_WORK_REQ               tx_wrq[IBNAL_TX_MAX_SG];    /* send work items... */
-        IB_LOCAL_DATASEGMENT      tx_gl[IBNAL_TX_MAX_SG];     /* ...and their memory */
+        __u64                     tx_hca_msg;   /* pre-mapped buffer (HCA vaddr) */
+        int                       tx_nwrq;      /* # send work items */
+#if IBNAL_USE_FMR
+        IB_WORK_REQ               tx_wrq[2];    /* send work items... */
+        IB_LOCAL_DATASEGMENT      tx_gl[2];     /* ...and their memory */
+        kib_rdma_desc_t           tx_rd[1];     /* rdma descriptor */
+        kib_md_t                  tx_md;        /* mapping */
+        __u64                    *tx_pages;     /* page phys addrs */
+#else
+        IB_WORK_REQ              *tx_wrq;       /* send work items... */
+        IB_LOCAL_DATASEGMENT     *tx_gl;        /* ...and their memory */
+        kib_rdma_desc_t          *tx_rd;        /* rdma descriptor (src buffers) */
+#endif
 } kib_tx_t;
 
-#define KIB_TX_UNMAPPED       0
-#define KIB_TX_MAPPED         1
-#define KIB_TX_MAPPED_FMR     2
-
-typedef struct kib_wire_connreq
-{
-        __u32        wcr_magic;                 /* I'm an openibnal connreq */
-        __u16        wcr_version;               /* this is my version number */
-        __u16        wcr_queue_depth;           /* this is my receive queue size */
-        __u64        wcr_nid;                   /* peer's NID */
-        __u64        wcr_incarnation;           /* peer's incarnation */
-} kib_wire_connreq_t;
-
-typedef struct kib_gid
-{
-        __u64   hi, lo;
-} kib_gid_t;
-
-typedef struct kib_connreq
+typedef struct
 {
-        /* connection-in-progress */
-        struct kib_conn                    *cr_conn;
-        kib_wire_connreq_t                  cr_wcr;
-        __u64                               cr_tid;
-        IB_SERVICE_RECORD                   cr_service;
-        kib_gid_t                           cr_gid;
-        IB_PATH_RECORD                      cr_path;
-        CM_REQUEST_INFO                     cr_cmreq;
-        CM_CONN_INFO                        cr_discarded;
-} kib_connreq_t;
+        /* scratchpad during connection establishment */
+        IB_QP_ATTRIBUTES_QUERY cv_qpattrs;
+        QUERY                  cv_query;
+        IB_SERVICE_RECORD      cv_svcrec;
+        IB_PATH_RECORD         cv_path;
+        CM_CONN_INFO           cv_cmci;
+} kib_connvars_t;
 
 typedef struct kib_conn
 {
         struct kib_peer    *ibc_peer;           /* owning peer */
         struct list_head    ibc_list;           /* stash on peer's conn list */
         __u64               ibc_incarnation;    /* which instance of the peer */
+        __u64               ibc_txseq;          /* tx sequence number */
+        __u64               ibc_rxseq;          /* rx sequence number */
         atomic_t            ibc_refcount;       /* # users */
         int                 ibc_state;          /* what's happening */
         atomic_t            ibc_nob;            /* # bytes buffered */
         int                 ibc_nsends_posted;  /* # uncompleted sends */
         int                 ibc_credits;        /* # credits I have */
         int                 ibc_outstanding_credits; /* # credits to return */
-        int                 ibc_rcvd_disconnect;/* received discon request */
-        int                 ibc_sent_disconnect;/* sent discon request */
+        struct list_head    ibc_early_rxs;      /* rxs completed before ESTABLISHED */
         struct list_head    ibc_tx_queue;       /* send queue */
         struct list_head    ibc_active_txs;     /* active tx awaiting completion */
         spinlock_t          ibc_lock;           /* serialise */
         kib_rx_t           *ibc_rxs;            /* the rx descs */
         kib_pages_t        *ibc_rx_pages;       /* premapped rx msg pages */
         IB_HANDLE           ibc_qp;             /* queue pair */
-        IB_HANDLE           ibc_cep;            /* connection ID? */
-        IB_QP_ATTRIBUTES_QUERY ibc_qp_attrs;    /* QP attrs */
-        kib_connreq_t      *ibc_connreq;        /* connection request state */
+        IB_HANDLE           ibc_cep;            /* CM endpoint */
+        kib_connvars_t     *ibc_cvars;          /* connection scratchpad */
 } kib_conn_t;
 
 #define IBNAL_CONN_INIT_NOTHING      0          /* initial state */
 #define IBNAL_CONN_INIT_QP           1          /* ibc_qp set up */
 #define IBNAL_CONN_CONNECTING        2          /* started to connect */
 #define IBNAL_CONN_ESTABLISHED       3          /* connection established */
-#define IBNAL_CONN_SEND_DREQ         4          /* to send disconnect req */
-#define IBNAL_CONN_DREQ              5          /* sent disconnect req */
-#define IBNAL_CONN_DREP              6          /* sent disconnect rep */
-#define IBNAL_CONN_DISCONNECTED      7          /* no more QP or CM traffic */
-
-#define KIB_ASSERT_CONN_STATE(conn, state) do {                         \
-        LASSERTF((conn)->ibc_state == state, "%d\n", conn->ibc_state);  \
-} while (0)
-
-#define KIB_ASSERT_CONN_STATE_RANGE(conn, low, high) do {               \
-        LASSERTF(low <= high, "%d %d\n", low, high);                    \
-        LASSERTF((conn)->ibc_state >= low && (conn)->ibc_state <= high, \
-                 "%d\n", conn->ibc_state);                              \
-} while (0)
+#define IBNAL_CONN_DISCONNECTING     4          /* to send disconnect req */
+#define IBNAL_CONN_DISCONNECTED      5          /* no more QP or CM traffic */
 
 typedef struct kib_peer
 {
@@ -711,26 +723,47 @@ static inline int wrq_signals_completion(IB_WORK_REQ *wrq)
 /* these are purposely avoiding using local vars so they don't increase
  * stack consumption. */
 
-#define kib_peer_addref(peer) do {                              \
-        LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n",  \
-                 atomic_read(&peer->ibp_refcount));             \
-        CDEBUG(D_NET, "++peer[%p] -> %s (%d)\n",                \
-               peer, libcfs_nid2str(peer->ibp_nid),             \
-               atomic_read (&peer->ibp_refcount));              \
-        atomic_inc(&peer->ibp_refcount);                        \
+#define kibnal_conn_addref(conn)                                \
+do {                                                            \
+        CDEBUG(D_NET, "conn[%p] (%d)++\n",                      \
+               (conn), atomic_read(&(conn)->ibc_refcount));     \
+        LASSERT(atomic_read(&(conn)->ibc_refcount) > 0);        \
+        atomic_inc(&(conn)->ibc_refcount);                      \
+} while (0)
+
+#define kibnal_conn_decref(conn)                                              \
+do {                                                                          \
+        unsigned long   flags;                                                \
+                                                                              \
+        CDEBUG(D_NET, "conn[%p] (%d)--\n",                                    \
+               (conn), atomic_read(&(conn)->ibc_refcount));                   \
+        LASSERT(atomic_read(&(conn)->ibc_refcount) > 0);                      \
+        if (atomic_dec_and_test(&(conn)->ibc_refcount)) {                     \
+                spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);        \
+                list_add_tail(&(conn)->ibc_list,                              \
+                              &kibnal_data.kib_connd_zombies);                \
+                wake_up(&kibnal_data.kib_connd_waitq);                        \
+                spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);   \
+        }                                                                     \
+} while (0)
+
+#define kibnal_peer_addref(peer)                                \
+do {                                                            \
+        CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n",                \
+               (peer), libcfs_nid2str((peer)->ibp_nid),         \
+               atomic_read (&(peer)->ibp_refcount));            \
+        LASSERT(atomic_read(&(peer)->ibp_refcount) > 0);        \
+        atomic_inc(&(peer)->ibp_refcount);                      \
 } while (0)
 
-#define kib_peer_decref(peer) do {                              \
-        LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n",  \
-                 atomic_read(&peer->ibp_refcount));             \
-        CDEBUG(D_NET, "--peer[%p] -> %s (%d)\n",                \
-               peer, libcfs_nid2str(peer->ibp_nid),             \
-               atomic_read (&peer->ibp_refcount));              \
-        if (atomic_dec_and_test (&peer->ibp_refcount)) {        \
-                CDEBUG (D_NET, "destroying peer %s %p\n",       \
-                        libcfs_nid2str(peer->ibp_nid), peer);   \
-                kibnal_destroy_peer (peer);                     \
-        }                                                       \
+#define kibnal_peer_decref(peer)                                \
+do {                                                            \
+        CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n",                \
+               (peer), libcfs_nid2str((peer)->ibp_nid),         \
+               atomic_read (&(peer)->ibp_refcount));            \
+        LASSERT(atomic_read(&(peer)->ibp_refcount) > 0);        \
+        if (atomic_dec_and_test(&(peer)->ibp_refcount))         \
+                kibnal_destroy_peer(peer);                      \
 } while (0)
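
Illustrative note (not part of the patch): the two final-reference paths above behave differently, which the comment sketch below only restates.

/* Refcount conventions defined above (restated, nothing new):
 *
 *   kibnal_conn_decref(conn): the final ref does not free the conn inline;
 *     it queues the conn on kib_connd_zombies and wakes the connd thread,
 *     so teardown happens in process context.
 *
 *   kibnal_peer_decref(peer): the final ref calls kibnal_destroy_peer()
 *     directly in the caller's context.
 */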
 
 /******************************************************************************/
@@ -755,11 +788,18 @@ kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
 {
         /* CAVEAT EMPTOR: tx takes caller's ref on conn */
 
-        LASSERT (tx->tx_nsp > 0);               /* work items set up */
-        LASSERT (tx->tx_conn == NULL);          /* only set here */
+        LASSERT (tx->tx_nwrq > 0);              /* work items set up */
+        LASSERT (!tx->tx_queued);               /* not queued for sending already */
 
-        tx->tx_conn = conn;
-        tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
+        if (tx->tx_conn == NULL) {
+                kibnal_conn_addref(conn);
+                tx->tx_conn = conn;
+        } else {
+                LASSERT (tx->tx_conn == conn);
+                LASSERT (tx->tx_msg->ibm_type == IBNAL_MSG_PUT_DONE);
+        }
+        tx->tx_queued = 1;
+        tx->tx_deadline = jiffies + (*kibnal_tunables.kib_timeout * HZ);
         list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
 }
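For illustration, a hedged sketch of the expected caller of kibnal_queue_tx_locked(); kibnal_queue_tx() is only declared later in this header, so the body shown here is an assumption, not the patch's implementation.

/* Hypothetical caller sketch: queue under ibc_lock, then kick the sender.
 * Note the conn ref is now taken inside kibnal_queue_tx_locked() when
 * tx->tx_conn is still NULL. */
void
kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
{
        spin_lock(&conn->ibc_lock);
        kibnal_queue_tx_locked(tx, conn);
        spin_unlock(&conn->ibc_lock);

        kibnal_check_sends(conn);
}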
 
@@ -780,13 +820,14 @@ kibnal_service_nid_field(IB_SERVICE_RECORD *srv)
         return (__u64 *)srv->ServiceData8;
 }
 
-
 static inline void
 kibnal_set_service_keys(IB_SERVICE_RECORD *srv, lnet_nid_t nid)
 {
-        LASSERT (strlen(IBNAL_SERVICE_NAME) < sizeof(srv->ServiceName));
+        char *svc_name = *kibnal_tunables.kib_service_name;
+
+        LASSERT (strlen(svc_name) < sizeof(srv->ServiceName));
         memset (srv->ServiceName, 0, sizeof(srv->ServiceName));
-        strcpy (srv->ServiceName, IBNAL_SERVICE_NAME);
+        strcpy (srv->ServiceName, svc_name);
 
         *kibnal_service_nid_field(srv) = cpu_to_le64(nid);
 }
@@ -813,88 +854,120 @@ kibnal_show_rdma_attr (kib_conn_t *conn)
 }
 #endif
 
-#if CONFIG_X86
 static inline __u64
 kibnal_page2phys (struct page *p)
 {
-        __u64 page_number = p - mem_map;
-
-        return (page_number << PAGE_SHIFT);
+        return page_to_phys(p);
 }
-#else
-# error "no page->phys"
-#endif
 
-/* CAVEAT EMPTOR:
- * We rely on tx/rx descriptor alignment to allow us to use the lowest bit
- * of the work request id as a flag to determine if the completion is for a
- * transmit or a receive.  It seems that that the CQ entry's 'op' field
- * isn't always set correctly on completions that occur after QP teardown. */
+
+/* CAVEAT EMPTOR: We rely on tx/rx descriptor alignment to allow us to use the
+ * lowest 2 bits of the work request id to stash the work item type (the op
+ * field is not valid when the wc completes in error). */
+
+#define IBNAL_WID_TX    0
+#define IBNAL_WID_RX    1
+#define IBNAL_WID_RDMA  2
+#define IBNAL_WID_MASK  3UL
 
 static inline __u64
-kibnal_ptr2wreqid (void *ptr, int isrx)
+kibnal_ptr2wreqid (void *ptr, int type)
 {
         unsigned long lptr = (unsigned long)ptr;
 
-        LASSERT ((lptr & 1) == 0);
-        return (__u64)(lptr | (isrx ? 1 : 0));
+        LASSERT ((lptr & IBNAL_WID_MASK) == 0);
+        LASSERT ((type & ~IBNAL_WID_MASK) == 0);
+        return (__u64)(lptr | type);
 }
 
 static inline void *
 kibnal_wreqid2ptr (__u64 wreqid)
 {
-        return (void *)(((unsigned long)wreqid) & ~1UL);
+        return (void *)(((unsigned long)wreqid) & ~IBNAL_WID_MASK);
 }
 
 static inline int
-kibnal_wreqid_is_rx (__u64 wreqid)
+kibnal_wreqid2type (__u64 wreqid)
+{
+        return (wreqid & IBNAL_WID_MASK);
+}
+
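Illustrative only: how a completion handler can recover a descriptor from the tagged work request id. The helper name below is hypothetical; only the macros and inlines above come from the patch.

/* Hypothetical helper, for illustration only */
static void
example_decode_wc (IB_WORK_COMPLETION *wc)
{
        switch (kibnal_wreqid2type(wc->WorkReqId)) {
        case IBNAL_WID_RX:
                /* kibnal_wreqid2ptr(wc->WorkReqId) is the kib_rx_t */
                break;
        case IBNAL_WID_TX:
        case IBNAL_WID_RDMA:
                /* kibnal_wreqid2ptr(wc->WorkReqId) is the kib_tx_t */
                break;
        default:
                LBUG();                         /* corrupt/unknown tag */
        }
}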
+static inline void
+kibnal_set_conn_state (kib_conn_t *conn, int state)
 {
-        return (wreqid & 1) != 0;
+        conn->ibc_state = state;
+        mb();
 }
 
+#if IBNAL_USE_FMR
+
 static inline int
-kibnal_whole_mem(void)
-{
-        return kibnal_data.kib_md.md_handle != NULL;
-}
-
-extern int kibnal_startup (lnet_ni_t *ni);
-extern void kibnal_shutdown (lnet_ni_t *ni);
-extern int kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
-int kibnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
-extern int kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *msg,
-                        int delayed, unsigned int niov, 
-                        struct iovec *iov, lnet_kiov_t *kiov,
-                        unsigned int offset, unsigned int mlen, unsigned int rlen);
-
-extern kib_peer_t *kibnal_create_peer (lnet_nid_t nid);
-extern void kibnal_destroy_peer (kib_peer_t *peer);
-extern int kibnal_del_peer (lnet_nid_t nid);
-extern kib_peer_t *kibnal_find_peer_locked (lnet_nid_t nid);
-extern void kibnal_unlink_peer_locked (kib_peer_t *peer);
-extern int  kibnal_close_stale_conns_locked (kib_peer_t *peer,
-                                              __u64 incarnation);
-extern kib_conn_t *kibnal_create_conn (void);
-extern void kibnal_put_conn (kib_conn_t *conn);
-extern void kibnal_destroy_conn (kib_conn_t *conn);
-void kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg);
+kibnal_rd_size (kib_rdma_desc_t *rd) 
+{
+        return rd->rd_nob;
+}
 
-extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access);
-extern void kibnal_free_pages (kib_pages_t *p);
-
-extern void kibnal_check_sends (kib_conn_t *conn);
-extern void kibnal_close_conn_locked (kib_conn_t *conn, int error);
-extern void kibnal_destroy_conn (kib_conn_t *conn);
-extern int  kibnal_thread_start (int (*fn)(void *arg), void *arg);
-extern int  kibnal_scheduler(void *arg);
-extern int  kibnal_connd (void *arg);
-extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob);
-extern void kibnal_close_conn (kib_conn_t *conn, int why);
-extern void kibnal_start_active_rdma (int type, int status,
-                                      kib_rx_t *rx, lnet_msg_t *lntmsg,
-                                      unsigned int niov,
-                                      struct iovec *iov, lnet_kiov_t *kiov,
-                                      unsigned int offset, unsigned int nob);
-
-void kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev);
-void kibnal_ca_callback (void *ca_arg, void *cq_arg);
+#else
+static inline int
+kibnal_rd_size (kib_rdma_desc_t *rd)
+{
+        int   i;
+        int   size;
+        
+        for (i = size = 0; i < rd->rd_nfrag; i++)
+                size += rd->rd_frags[i].rf_nob;
+        
+        return size;
+}
+#endif
+
+int  kibnal_startup (lnet_ni_t *ni);
+void kibnal_shutdown (lnet_ni_t *ni);
+int  kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+int  kibnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int  kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *msg,
+                  int delayed, unsigned int niov,
+                  struct iovec *iov, lnet_kiov_t *kiov,
+                  unsigned int offset, unsigned int mlen, unsigned int rlen);
+void kibnal_init_msg(kib_msg_t *msg, int type, int body_nob);
+void kibnal_pack_msg(kib_msg_t *msg, int credits, lnet_nid_t dstnid,
+                     __u64 dststamp, __u64 seq);
+void kibnal_pack_connmsg(kib_msg_t *msg, int nob, int type,
+                         lnet_nid_t dstnid, __u64 dststamp);
+int  kibnal_unpack_msg(kib_msg_t *msg, int nob);
+IB_HANDLE kibnal_create_cep(lnet_nid_t nid);
+int  kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid);
+void kibnal_destroy_peer (kib_peer_t *peer);
+kib_peer_t *kibnal_find_peer_locked (lnet_nid_t nid);
+int  kibnal_del_peer (lnet_nid_t nid);
+void kibnal_unlink_peer_locked (kib_peer_t *peer);
+int  kibnal_add_persistent_peer (lnet_nid_t nid);
+int  kibnal_close_stale_conns_locked (kib_peer_t *peer,
+                                      __u64 incarnation);
+int  kibnal_conn_rts(kib_conn_t *conn,
+                     __u32 qpn, __u8 resp_res, __u8 init_depth, __u32 psn);
+kib_conn_t *kibnal_create_conn (lnet_nid_t nid);
+void kibnal_destroy_conn (kib_conn_t *conn);
+void kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg);
+int  kibnal_alloc_pages (kib_pages_t **pp, int npages);
+void kibnal_free_pages (kib_pages_t *p);
+void kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn);
+int  kibnal_post_receives (kib_conn_t *conn);
+int  kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
+                       kib_rdma_desc_t *dstrd, __u64 dstcookie);
+void kibnal_check_sends (kib_conn_t *conn);
+void kibnal_close_conn_locked (kib_conn_t *conn, int error);
+int  kibnal_thread_start (int (*fn)(void *arg), void *arg);
+int  kibnal_scheduler(void *arg);
+int  kibnal_connd (void *arg);
+void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob);
+void kibnal_close_conn (kib_conn_t *conn, int why);
+void kibnal_start_active_rdma (int type, int status,
+                               kib_rx_t *rx, lnet_msg_t *lntmsg,
+                               unsigned int niov,
+                               struct iovec *iov, lnet_kiov_t *kiov,
+                               unsigned int offset, unsigned int nob);
+void kibnal_hca_async_callback (void *hca_arg, IB_EVENT_RECORD *ev);
+void kibnal_hca_callback (void *hca_arg, void *cq_arg);
+int  kibnal_tunables_init (void);
+void kibnal_tunables_fini (void);
index 4ed6bd3..472840a 100644 (file)
 
 #include "iiblnd.h"
 
-/*
- *  LIB functions follow
- *
- */
-static void
-kibnal_schedule_tx_done (kib_tx_t *tx)
+void
+hexdump(char *string, void *ptr, int len)
 {
-        unsigned long flags;
+        unsigned char *c = ptr;
+        int i;
 
-        spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);
+        return;                 /* XXX hexdump disabled for now */
 
-        list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
-        wake_up (&kibnal_data.kib_sched_waitq);
+        if (len < 0 || len > 2048) {
+                printk("hexdump: bad length %d\n", len);
+                return;
+        }
 
-        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+        printk("%d bytes of '%s' from 0x%p\n", len, string, ptr);
+
+        for (i = 0; i < len;) {
+                printk("%02x",*(c++));
+                i++;
+                if (!(i & 15)) {
+                        printk("\n");
+                } else if (!(i&1)) {
+                        printk(" ");
+                }
+        }
+
+        if(len & 15) {
+                printk("\n");
+        }
 }
 
-static void
+void
 kibnal_tx_done (kib_tx_t *tx)
 {
-        unsigned long    flags;
-        int              i;
-        FSTATUS          frc;
+        int        rc = tx->tx_status;
+        int        i;
+        FSTATUS    frc;
 
-        LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting callback */
-        LASSERT (!tx->tx_passive_rdma_wait);    /* mustn't be awaiting RDMA */
+        LASSERT (!in_interrupt());
+        LASSERT (!tx->tx_queued);               /* mustn't be queued for sending */
+        LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting sent callback */
+        LASSERT (!tx->tx_waiting);              /* mustn't be awaiting peer response */
 
-        switch (tx->tx_mapped) {
-        default:
-                LBUG();
-
-        case KIB_TX_UNMAPPED:
-                break;
-
-        case KIB_TX_MAPPED:
-                if (in_interrupt()) {
-                        /* can't deregister memory in IRQ context... */
-                        kibnal_schedule_tx_done(tx);
-                        return;
-                }
-                frc = iibt_deregister_memory(tx->tx_md.md_handle);
-                LASSERT (frc == FSUCCESS);
-                tx->tx_mapped = KIB_TX_UNMAPPED;
-                break;
-
-#if IBNAL_FMR
-        case KIB_TX_MAPPED_FMR:
-                if (in_interrupt() && tx->tx_status != 0) {
-                        /* can't flush FMRs in IRQ context... */
-                        kibnal_schedule_tx_done(tx);
-                        return;
-                }              
-
-                rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr);
-                LASSERT (rc == 0);
-
-                if (tx->tx_status != 0)
-                        ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
-                tx->tx_mapped = KIB_TX_UNMAPPED;
-                break;
+#if IBNAL_USE_FMR
+        /* XXX FMR unmapping not implemented yet */
 #endif
-        }
-
         for (i = 0; i < 2; i++) {
                 /* tx may have up to 2 ptlmsgs to finalise */
                 if (tx->tx_lntmsg[i] == NULL)
                         continue;
 
-                lnet_finalize (kibnal_data.kib_ni, tx->tx_lntmsg[i],
-                               tx->tx_status);
+                lnet_finalize (kibnal_data.kib_ni, tx->tx_lntmsg[i], rc);
                 tx->tx_lntmsg[i] = NULL;
         }
         
         if (tx->tx_conn != NULL) {
-                kibnal_put_conn (tx->tx_conn);
+                kibnal_conn_decref(tx->tx_conn);
                 tx->tx_conn = NULL;
         }
 
-        tx->tx_nsp = 0;
-        tx->tx_passive_rdma = 0;
+        tx->tx_nwrq = 0;
         tx->tx_status = 0;
 
-        spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+        spin_lock(&kibnal_data.kib_tx_lock);
 
-        list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
+        list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
 
-        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+        spin_unlock(&kibnal_data.kib_tx_lock);
 }
 
-static kib_tx_t *
+kib_tx_t *
 kibnal_get_idle_tx (void) 
 {
-        unsigned long  flags;
-        kib_tx_t      *tx = NULL;
+        kib_tx_t      *tx;
         
-        spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+        spin_lock(&kibnal_data.kib_tx_lock);
 
         if (list_empty (&kibnal_data.kib_idle_txs)) {
-                spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+                spin_unlock(&kibnal_data.kib_tx_lock);
                 return NULL;
         }
-        
+
         tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list);
         list_del (&tx->tx_list);
 
-        /* Allocate a new passive RDMA completion cookie.  It might not be
-         * needed, but we've got a lock right now and we're unlikely to
+        /* Allocate a new completion cookie.  It might not be needed,
+         * but we've got a lock right now and we're unlikely to
          * wrap... */
-        tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
+        tx->tx_cookie = kibnal_data.kib_next_tx_cookie++;
 
-        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+        spin_unlock(&kibnal_data.kib_tx_lock);
 
-        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
-        LASSERT (tx->tx_nsp == 0);
+        LASSERT (tx->tx_nwrq == 0);
+        LASSERT (!tx->tx_queued);
         LASSERT (tx->tx_sending == 0);
+        LASSERT (!tx->tx_waiting);
         LASSERT (tx->tx_status == 0);
         LASSERT (tx->tx_conn == NULL);
-        LASSERT (!tx->tx_passive_rdma);
-        LASSERT (!tx->tx_passive_rdma_wait);
         LASSERT (tx->tx_lntmsg[0] == NULL);
         LASSERT (tx->tx_lntmsg[1] == NULL);
         
-        RETURN(tx);
+        return tx;
 }
 
-static void
-kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
+int
+kibnal_post_rx (kib_rx_t *rx, int credit)
 {
-        struct list_head *ttmp;
-        unsigned long     flags;
-        int               idle;
-
-        spin_lock_irqsave (&conn->ibc_lock, flags);
+        kib_conn_t   *conn = rx->rx_conn;
+        int           rc = 0;
+        unsigned long flags;
+        FSTATUS       frc;
 
-        list_for_each (ttmp, &conn->ibc_active_txs) {
-                kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);
+        LASSERT (!in_interrupt());
+        
+        rx->rx_gl = (IB_LOCAL_DATASEGMENT) {
+                .Address = rx->rx_hca_msg,
+                .Lkey    = kibnal_data.kib_whole_mem.md_lkey,
+                .Length  = IBNAL_MSG_SIZE,
+        };
 
-                LASSERT (tx->tx_passive_rdma ||
-                         !tx->tx_passive_rdma_wait);
+        rx->rx_wrq = (IB_WORK_REQ) {
+                .WorkReqId     = kibnal_ptr2wreqid(rx, IBNAL_WID_RX),
+                .MessageLen    = IBNAL_MSG_SIZE,
+                .DSList        = &rx->rx_gl,
+                .DSListDepth   = 1,
+                .Operation     = WROpRecv,
+        };
 
-                LASSERT (tx->tx_passive_rdma_wait ||
-                         tx->tx_sending != 0);
+        LASSERT (conn->ibc_state >= IBNAL_CONN_CONNECTING);
+        LASSERT (rx->rx_nob >= 0);              /* not posted */
 
-                if (!tx->tx_passive_rdma_wait ||
-                    tx->tx_passive_rdma_cookie != cookie)
-                        continue;
+        CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n", 
+               rx->rx_wrq.DSList->Length,
+               rx->rx_wrq.DSList->Lkey,
+               rx->rx_wrq.DSList->Address);
 
-                CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);
+        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) {
+                /* No more posts for this rx; so lose its ref */
+                kibnal_conn_decref(conn);
+                return 0;
+        }
+        
+        rx->rx_nob = -1;                        /* flag posted */
+        mb();
 
-                /* XXX Set mlength of REPLY here */
+        frc = iibt_postrecv(conn->ibc_qp, &rx->rx_wrq);
+        if (frc == FSUCCESS) {
+                if (credit) {
+                        spin_lock(&conn->ibc_lock);
+                        conn->ibc_outstanding_credits++;
+                        spin_unlock(&conn->ibc_lock);
 
-                tx->tx_status = status;
-                tx->tx_passive_rdma_wait = 0;
-                idle = (tx->tx_sending == 0);
+                        kibnal_check_sends(conn);
+                }
+                return 0;
+        }
+        
+        CERROR ("post rx -> %s failed %d\n", 
+                libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+        rc = -EIO;
+        kibnal_close_conn(rx->rx_conn, rc);
+        /* No more posts for this rx; so lose its ref */
+        kibnal_conn_decref(conn);
+        return rc;
+}
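
A comment-only restatement of the credit handling around kibnal_post_rx(); it adds no behaviour, and the kibnal_pack_msg() detail is an inference from its prototype.

/* Credit flow sketch:
 *
 *   message arrives        -> kibnal_handle_rx():
 *                               conn->ibc_credits += msg->ibm_credits
 *   its buffer is reposted -> kibnal_post_rx(rx, 1):
 *                               conn->ibc_outstanding_credits++
 *   next outgoing send     -> kibnal_check_sends(), presumably via
 *                             kibnal_pack_msg()'s credits argument,
 *                             returns those credits to the peer
 */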
 
-                if (idle)
-                        list_del (&tx->tx_list);
+int
+kibnal_post_receives (kib_conn_t *conn)
+{
+        int    i;
+        int    rc;
 
-                spin_unlock_irqrestore (&conn->ibc_lock, flags);
+        LASSERT (conn->ibc_state == IBNAL_CONN_CONNECTING);
 
-                /* I could be racing with tx callbacks.  It's whoever
-                 * _makes_ tx idle that frees it */
-                if (idle)
-                        kibnal_tx_done (tx);
-                return;
+        for (i = 0; i < IBNAL_RX_MSGS; i++) {
+                /* +1 ref for rx desc.  This ref remains until kibnal_post_rx
+                 * fails (i.e. actual failure or we're disconnecting) */
+                kibnal_conn_addref(conn);
+                rc = kibnal_post_rx (&conn->ibc_rxs[i], 0);
+                if (rc != 0)
+                        return rc;
         }
-                
-        spin_unlock_irqrestore (&conn->ibc_lock, flags);
 
-        CERROR ("Unmatched (late?) RDMA completion "LPX64" from %s\n",
-                cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+        return 0;
 }
 
-static __u32
-kibnal_lkey(kib_pages_t *ibp)
+kib_tx_t *
+kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
 {
-        if (kibnal_whole_mem())
-                return kibnal_data.kib_md.md_lkey;
+        struct list_head   *tmp;
+        
+        list_for_each(tmp, &conn->ibc_active_txs) {
+                kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
+                
+                LASSERT (!tx->tx_queued);
+                LASSERT (tx->tx_sending != 0 || tx->tx_waiting);
+
+                if (tx->tx_cookie != cookie)
+                        continue;
 
-        return ibp->ibp_lkey;
+                if (tx->tx_waiting &&
+                    tx->tx_msg->ibm_type == txtype)
+                        return tx;
+
+                CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
+                      tx->tx_waiting ? "" : "NOT ",
+                      tx->tx_msg->ibm_type, txtype);
+        }
+        return NULL;
 }
 
-static void
-kibnal_post_rx (kib_rx_t *rx, int do_credits)
+void
+kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
 {
-        kib_conn_t   *conn = rx->rx_conn;
-        int           rc = 0;
-        unsigned long flags;
-        FSTATUS       frc;
-        ENTRY;
+        kib_tx_t    *tx;
+        int          idle;
 
-        rx->rx_gl = (IB_LOCAL_DATASEGMENT) {
-                .Address = rx->rx_vaddr,
-                .Length  = IBNAL_MSG_SIZE,
-                .Lkey    = kibnal_lkey(conn->ibc_rx_pages),
-        };
+        spin_lock(&conn->ibc_lock);
 
-        rx->rx_wrq = (IB_WORK_REQ) {
-                .Operation              = WROpRecv,
-                .DSListDepth            = 1,
-                .MessageLen             = IBNAL_MSG_SIZE,
-                .WorkReqId              = kibnal_ptr2wreqid(rx, 1),
-                .DSList                 = &rx->rx_gl,
-        };
-
-        KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED,
-                                    IBNAL_CONN_DREP);
-        LASSERT (rx->rx_nob >= 0);
-        rx->rx_nob = -1;                        /* flag posted */
-        mb();
+        tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie);
+        if (tx == NULL) {
+                spin_unlock(&conn->ibc_lock);
 
-        if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
-                rc = -ECONNABORTED;
-        else {
-                frc = iibt_postrecv(conn->ibc_qp, &rx->rx_wrq);
-                if (frc != FSUCCESS) {
-                        CDEBUG(D_NET, "post failed %d\n", frc);
-                        rc = -EINVAL;
-                }
-                CDEBUG(D_NET, "posted rx %p\n", &rx->rx_wrq);
+                CWARN("Unmatched completion type %x cookie "LPX64" from %s\n",
+                      txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                kibnal_close_conn (conn, -EPROTO);
+                return;
         }
 
-        if (rc == 0) {
-                if (do_credits) {
-                        spin_lock_irqsave(&conn->ibc_lock, flags);
-                        conn->ibc_outstanding_credits++;
-                        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+        if (tx->tx_status == 0) {               /* success so far */
+                if (status < 0) {               /* failed? */
+                        tx->tx_status = status;
+                } else if (txtype == IBNAL_MSG_GET_REQ) { 
+                        /* XXX layering violation: set REPLY data length */
+                        LASSERT (tx->tx_lntmsg[1] != NULL);
+                        LASSERT (tx->tx_lntmsg[1]->msg_ev.type == 
+                                 LNET_EVENT_REPLY);
 
-                        kibnal_check_sends(conn);
+                        tx->tx_lntmsg[1]->msg_ev.mlength = status;
                 }
-                EXIT;
-                return;
         }
+        
+        tx->tx_waiting = 0;
 
-        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
-                CERROR ("Error posting receive -> %s: %d\n",
-                        libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
-                kibnal_close_conn (rx->rx_conn, rc);
-        } else {
-                CDEBUG (D_NET, "Error posting receive -> %s: %d\n",
-                        libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
-        }
+        idle = !tx->tx_queued && (tx->tx_sending == 0);
+        if (idle)
+                list_del(&tx->tx_list);
 
-        /* Drop rx's ref */
-        kibnal_put_conn (conn);
-        EXIT;
+        spin_unlock(&conn->ibc_lock);
+        
+        if (idle)
+                kibnal_tx_done(tx);
 }
 
-#if IBNAL_CKSUM
-static inline __u32 kibnal_cksum (void *ptr, int nob)
+void
+kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie) 
 {
-        char  *c  = ptr;
-        __u32  sum = 0;
-
-        while (nob-- > 0)
-                sum = ((sum << 1) | (sum >> 31)) + *c++;
+        kib_tx_t    *tx = kibnal_get_idle_tx();
+        
+        if (tx == NULL) {
+                CERROR("Can't get tx for completion %x for %s\n",
+                       type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                return;
+        }
+        
+        tx->tx_msg->ibm_u.completion.ibcm_status = status;
+        tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
+        kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t));
         
-        return (sum);
+        kibnal_queue_tx(tx, conn);
 }
-#endif
 
-static void hexdump(char *string, void *ptr, int len)
+void
+kibnal_handle_rx (kib_rx_t *rx)
 {
-        unsigned char *c = ptr;
-        int i;
+        kib_msg_t    *msg = rx->rx_msg;
+        kib_conn_t   *conn = rx->rx_conn;
+        int           credits = msg->ibm_credits;
+        kib_tx_t     *tx;
+        int           rc = 0;
+        int           repost = 1;
+        int           rc2;
 
-        return;
+        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
 
-        if (len < 0 || len > 2048)  {
-                printk("XXX what the hell? %d\n",len);
-                return;
+        CDEBUG (D_NET, "Received %x[%d] from %s\n",
+                msg->ibm_type, credits, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+        
+        if (credits != 0) {
+                /* Have I received credits that will let me send? */
+                spin_lock(&conn->ibc_lock);
+                conn->ibc_credits += credits;
+                spin_unlock(&conn->ibc_lock);
+
+                kibnal_check_sends(conn);
         }
 
-        printk("%d bytes of '%s' from 0x%p\n", len, string, ptr);
+        /* clear flag so GET_REQ can see if it caused a REPLY */
+        rx->rx_responded = 0;
 
-        for (i = 0; i < len;) {
-                printk("%02x",*(c++));
-                i++;
-                if (!(i & 15)) {
-                        printk("\n");
-                } else if (!(i&1)) {
-                        printk(" ");
+        switch (msg->ibm_type) {
+        default:
+                CERROR("Bad IBNAL message type %x from %s\n",
+                       msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                rc = -EPROTO;
+                break;
+
+        case IBNAL_MSG_NOOP:
+                break;
+
+        case IBNAL_MSG_IMMEDIATE:
+                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr,
+                                msg->ibm_srcnid, rx);
+                repost = rc < 0;                /* repost on error */
+                break;
+                
+        case IBNAL_MSG_PUT_REQ:
+                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.putreq.ibprm_hdr,
+                                msg->ibm_srcnid, rx);
+                repost = rc < 0;                /* repost on error */
+                break;
+
+        case IBNAL_MSG_PUT_NAK:
+                CWARN ("PUT_NAK from %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ, 
+                                         msg->ibm_u.completion.ibcm_status,
+                                         msg->ibm_u.completion.ibcm_cookie);
+                break;
+
+        case IBNAL_MSG_PUT_ACK:
+                spin_lock(&conn->ibc_lock);
+                tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ,
+                                                   msg->ibm_u.putack.ibpam_src_cookie);
+                if (tx != NULL)
+                        list_del(&tx->tx_list);
+                spin_unlock(&conn->ibc_lock);
+
+                if (tx == NULL) {
+                        CERROR("Unmatched PUT_ACK from %s\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        rc = -EPROTO;
+                        break;
                 }
-        }
 
-        if(len & 15) {
-                printk("\n");
+                LASSERT (tx->tx_waiting);
+                /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
+                 * (a) I can overwrite tx_msg since my peer has received it!
+                 * (b) tx_waiting set tells tx_complete() it's not done. */
+
+                tx->tx_nwrq = 0;                /* overwrite PUT_REQ */
+
+                rc2 = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE, 
+                                       kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd),
+                                       &msg->ibm_u.putack.ibpam_rd,
+                                       msg->ibm_u.putack.ibpam_dst_cookie);
+                if (rc2 < 0)
+                        CERROR("Can't setup rdma for PUT to %s: %d\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);
+
+                spin_lock(&conn->ibc_lock);
+                if (tx->tx_status == 0 && rc2 < 0)
+                        tx->tx_status = rc2;
+                tx->tx_waiting = 0;             /* clear waiting and queue atomically */
+                kibnal_queue_tx_locked(tx, conn);
+                spin_unlock(&conn->ibc_lock);
+                break;
+                
+        case IBNAL_MSG_PUT_DONE:
+                kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK,
+                                         msg->ibm_u.completion.ibcm_status,
+                                         msg->ibm_u.completion.ibcm_cookie);
+                break;
+
+        case IBNAL_MSG_GET_REQ:
+                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.get.ibgm_hdr,
+                                msg->ibm_srcnid, rx);
+                repost = rc < 0;                /* repost on error */
+                break;
+
+        case IBNAL_MSG_GET_DONE:
+                kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ,
+                                         msg->ibm_u.completion.ibcm_status,
+                                         msg->ibm_u.completion.ibcm_cookie);
+                break;
         }
+
+        if (rc < 0)                             /* protocol error */
+                kibnal_close_conn(conn, rc);
+
+        if (repost)
+                kibnal_post_rx(rx, 1);
 }
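
For orientation, a hedged summary of the exchanges implied by the switch above; the RDMA direction and the GET responder path are inferences, not taken verbatim from this file.

/* Protocol sketch, as implied by kibnal_handle_rx():
 *
 *  PUT:  A sends PUT_REQ -> B replies PUT_ACK (sink descriptor + cookies),
 *        or PUT_NAK on refusal; A then RDMAs the payload into B's
 *        descriptor and completes with PUT_DONE.
 *
 *  GET:  A sends GET_REQ (its sink descriptor); B transfers the REPLY by
 *        RDMA (presumably via kibnal_start_active_rdma()) and completes
 *        with GET_DONE, whose ibcm_status carries the REPLY length
 *        (see kibnal_handle_completion() above).
 */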
 
-static void
-kibnal_rx_callback (IB_WORK_COMPLETION *wc)
+void
+kibnal_rx_complete (IB_WORK_COMPLETION *wc, __u64 rxseq)
 {
         kib_rx_t     *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->WorkReqId);
+        int           nob = wc->Length;
         kib_msg_t    *msg = rx->rx_msg;
         kib_conn_t   *conn = rx->rx_conn;
-        int           nob = wc->Length;
-        const int     base_nob = offsetof(kib_msg_t, ibm_u);
-        int           credits;
-        int           flipped;
         unsigned long flags;
-        __u32         i;
-#if IBNAL_CKSUM
-        __u32         msg_cksum;
-        __u32         computed_cksum;
-#endif
-
-        /* we set the QP to erroring after we've finished disconnecting, 
-         * maybe we should do so sooner. */
-        KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, 
-                                    IBNAL_CONN_DISCONNECTED);
+        int           rc;
 
         CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
         LASSERT (rx->rx_nob < 0);               /* was posted */
@@ -343,7 +433,7 @@ kibnal_rx_callback (IB_WORK_COMPLETION *wc)
         /* receives complete with error in any case after we've started
          * disconnecting */
         if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
-                goto failed;
+                goto ignore;
 
         if (wc->Status != WRStatusSuccess) {
                 CERROR("Rx from %s failed: %d\n", 
@@ -351,277 +441,257 @@ kibnal_rx_callback (IB_WORK_COMPLETION *wc)
                 goto failed;
         }
 
-        if (nob < base_nob) {
-                CERROR ("Short rx from %s: %d < expected %d\n",
-                        libcfs_nid2str(conn->ibc_peer->ibp_nid), 
-                        nob, base_nob);
+        rc = kibnal_unpack_msg(msg, nob);
+        if (rc != 0) {
+                CERROR ("Error %d unpacking rx from %s\n",
+                        rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                 goto failed;
         }
 
-        rx->rx_nob = nob;
+        rx->rx_nob = nob;                       /* Now I know nob > 0 */
         mb();
 
-        hexdump("rx", rx->rx_msg, sizeof(kib_msg_t));
-
-        /* Receiver does any byte flipping if necessary... */
-
-        if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
-                flipped = 0;
-        } else {
-                if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
-                        CERROR ("Unrecognised magic: %08x from %s\n", 
-                                msg->ibm_magic, 
-                                libcfs_nid2str(conn->ibc_peer->ibp_nid));
-                        goto failed;
-                }
-                flipped = 1;
-                __swab16s (&msg->ibm_version);
-                LASSERT (sizeof(msg->ibm_type) == 1);
-                LASSERT (sizeof(msg->ibm_credits) == 1);
-        }
-
-        if (msg->ibm_version != IBNAL_MSG_VERSION) {
-                CERROR ("Incompatible msg version %d (%d expected)\n",
-                        msg->ibm_version, IBNAL_MSG_VERSION);
+        if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
+            msg->ibm_dstnid != kibnal_data.kib_ni->ni_nid ||
+            msg->ibm_srcstamp != conn->ibc_incarnation ||
+            msg->ibm_dststamp != kibnal_data.kib_incarnation) {
+                CERROR ("Stale rx from %s\n",
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
                 goto failed;
         }
 
-#if IBNAL_CKSUM
-        if (nob != msg->ibm_nob) {
-                CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob);
+        if (msg->ibm_seq != rxseq) {
+                CERROR ("Out-of-sequence rx from %s"
+                        ": got "LPD64" but expected "LPD64"\n",
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                        msg->ibm_seq, rxseq);
                 goto failed;
         }
 
-        msg_cksum = le32_to_cpu(msg->ibm_cksum);
-        msg->ibm_cksum = 0;
-        computed_cksum = kibnal_cksum (msg, nob);
-        
-        if (msg_cksum != computed_cksum) {
-                CERROR ("Checksum failure %d: (%d expected)\n",
-                        computed_cksum, msg_cksum);
-//                goto failed;
-        }
-        CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob);
-#endif
-
-        /* Have I received credits that will let me send? */
-        credits = msg->ibm_credits;
-        if (credits != 0) {
-                spin_lock_irqsave(&conn->ibc_lock, flags);
-                conn->ibc_credits += credits;
-                spin_unlock_irqrestore(&conn->ibc_lock, flags);
-                
-                kibnal_check_sends(conn);
-        }
-
-        switch (msg->ibm_type) {
-        case IBNAL_MSG_NOOP:
-                kibnal_post_rx (rx, 1);
-                return;
+        /* racing with connection establishment/teardown! */
 
-        case IBNAL_MSG_IMMEDIATE:
-                if (nob < base_nob + sizeof (kib_immediate_msg_t)) {
-                        CERROR ("Short IMMEDIATE from %s: %d\n",
-                                libcfs_nid2str(conn->ibc_peer->ibp_nid), 
-                                nob);
-                        goto failed;
-                }
-                break;
-                
-        case IBNAL_MSG_PUT_RDMA:
-        case IBNAL_MSG_GET_RDMA:
-                if (nob < base_nob + sizeof (kib_rdma_msg_t)) {
-                        CERROR ("Short RDMA msg from %s: %d\n",
-                                libcfs_nid2str(conn->ibc_peer->ibp_nid), nob);
-                        goto failed;
-                }
-                if (flipped) 
-                        __swab32(msg->ibm_u.rdma.ibrm_num_descs);
-
-                CDEBUG(D_NET, "%d RDMA: cookie "LPX64":\n",
-                       msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie);
-
-                if ((msg->ibm_u.rdma.ibrm_num_descs > LNET_MAX_IOV) ||
-                    (kib_rdma_msg_len(msg->ibm_u.rdma.ibrm_num_descs) > 
-                     min(nob, IBNAL_MSG_SIZE))) {
-                        CERROR ("num_descs %d too large\n", 
-                                msg->ibm_u.rdma.ibrm_num_descs);
-                        goto failed;
-                }
-
-                if (flipped) {
-                        __swab32(msg->ibm_u.rdma.rd_key);
-                }
-
-                for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) {
-                        kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i];
-
-                        if (flipped) {
-                                __swab32(desc->rd_nob);
-                                __swab64(desc->rd_addr);
-                        }
-
-                        CDEBUG(D_NET, "  key %x, " "addr "LPX64", nob %u\n",
-                               msg->ibm_u.rdma.rd_key, desc->rd_addr, desc->rd_nob);
-                }
-                break;
-                        
-        case IBNAL_MSG_PUT_DONE:
-        case IBNAL_MSG_GET_DONE:
-                if (nob < base_nob + sizeof (kib_completion_msg_t)) {
-                        CERROR ("Short COMPLETION msg from %s: %d\n",
-                                libcfs_nid2str(conn->ibc_peer->ibp_nid), nob);
-                        goto failed;
+        if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
+                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+                /* must check holding global lock to eliminate race */
+                if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
+                        list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
+                        write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
+                                                flags);
+                        return;
                 }
-                if (flipped)
-                        __swab32s(&msg->ibm_u.completion.ibcm_status);
-                
-                CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
-                       msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
-                       msg->ibm_u.completion.ibcm_status);
-
-                kibnal_complete_passive_rdma (conn, 
-                                              msg->ibm_u.completion.ibcm_cookie,
-                                              msg->ibm_u.completion.ibcm_status);
-                kibnal_post_rx (rx, 1);
-                return;
-                        
-        default:
-                CERROR ("Can't parse type from %s: %d\n",
-                        libcfs_nid2str(conn->ibc_peer->ibp_nid), msg->ibm_type);
-                goto failed;
+                write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
+                                        flags);
         }
-
-        /* schedule for kibnal_rx() in thread context */
-        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
-        
-        list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
-        wake_up (&kibnal_data.kib_sched_waitq);
-        
-        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+        kibnal_handle_rx(rx);
         return;
         
  failed:
         CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
-        kibnal_close_conn(conn, -ECONNABORTED);
-
+        kibnal_close_conn(conn, -EIO);
+ ignore:
         /* Don't re-post rx & drop its ref on conn */
-        kibnal_put_conn(conn);
-}
-
-void
-kibnal_rx (kib_rx_t *rx)
-{
-        int          rc = 0;
-        kib_msg_t   *msg = rx->rx_msg;
-
-        /* Clear flag so I can detect if I've sent an RDMA completion */
-        rx->rx_rdma = 0;
-
-        switch (msg->ibm_type) {
-        case IBNAL_MSG_GET_RDMA:
-                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.rdma.ibrm_hdr, 
-                                rx->rx_conn->ibc_peer->ibp_nid, rx);
-                break;
-                
-        case IBNAL_MSG_PUT_RDMA:
-                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.rdma.ibrm_hdr, 
-                                rx->rx_conn->ibc_peer->ibp_nid, rx);
-                break;
-
-        case IBNAL_MSG_IMMEDIATE:
-                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr, 
-                                rx->rx_conn->ibc_peer->ibp_nid, rx);
-                break;
-
-        default:
-                LBUG();
-                break;
-        }
-
-        if (rc < 0) {
-                kibnal_close_conn(rx->rx_conn, rc);
-                kibnal_post_rx (rx, 1);
-        }
+        kibnal_conn_decref(conn);
 }
 
-static struct page *
+struct page *
 kibnal_kvaddr_to_page (unsigned long vaddr)
 {
         struct page *page;
 
         if (vaddr >= VMALLOC_START &&
-            vaddr < VMALLOC_END)
+            vaddr < VMALLOC_END) {
                 page = vmalloc_to_page ((void *)vaddr);
+                LASSERT (page != NULL);
+                return page;
+        }
 #if CONFIG_HIGHMEM
-        else if (vaddr >= PKMAP_BASE &&
-                 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
-                page = vmalloc_to_page ((void *)vaddr);
-        /* in 2.4 ^ just walks the page tables */
+        if (vaddr >= PKMAP_BASE &&
+            vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
+                /* No highmem kernel virtual addresses here: highmem pages
+                 * are only used for bulk (kiov) I/O */
+                CERROR("Can't find page for address in highmem\n");
+                LBUG();
+        }
 #endif
-        else
-                page = virt_to_page (vaddr);
-
-        if (!VALID_PAGE (page))
-                page = NULL;
-
+        page = virt_to_page (vaddr);
+        LASSERT (page != NULL);
         return page;
 }
 
-static void
-kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset,
-                 unsigned long len, int active)
+#if !IBNAL_USE_FMR
+int
+kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page, 
+                     unsigned long page_offset, unsigned long len)
 {
-        kib_rdma_msg_t *ibrm = &tx->tx_msg->ibm_u.rdma;
-        kib_rdma_desc_t *desc;
+        kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag];
 
-        LASSERTF(ibrm->ibrm_num_descs < LNET_MAX_IOV, "%u\n", 
-                 ibrm->ibrm_num_descs);
+        if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) {
+                CERROR ("Too many RDMA fragments\n");
+                return -EMSGSIZE;
+        }
 
-        desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs];
-        if (active)
-                ibrm->rd_key = kibnal_data.kib_md.md_lkey;
-        else
-                ibrm->rd_key = kibnal_data.kib_md.md_rkey;
-        desc->rd_nob = len; /*PAGE_SIZE - kiov->kiov_offset; */
-        desc->rd_addr = kibnal_page2phys(page) + page_offset +
-                        kibnal_data.kib_md.md_addr;
+        if (active) {
+                if (rd->rd_nfrag == 0)
+                        rd->rd_key = kibnal_data.kib_whole_mem.md_lkey;
+        } else {
+                if (rd->rd_nfrag == 0)
+                        rd->rd_key = kibnal_data.kib_whole_mem.md_rkey;
+        }
+
+        frag->rf_nob  = len;
+        frag->rf_addr = kibnal_data.kib_whole_mem.md_addr +
+                        kibnal_page2phys(page) + page_offset;
 
-        ibrm->ibrm_num_descs++;
+        CDEBUG(D_NET,"map key %x frag [%d]["LPX64" for %d]\n", 
+               rd->rd_key, rd->rd_nfrag, frag->rf_addr, frag->rf_nob);
+
+        rd->rd_nfrag++;
+        return 0;
 }
 
-static int
-kibnal_map_rdma_iov(kib_tx_t *tx, unsigned long vaddr, int nob, int active)
+int
+kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
+                    unsigned int niov, struct iovec *iov, int offset, int nob)
+                 
 {
-        struct page *page;
-        int page_offset, len;
+        int           fragnob;
+        int           rc;
+        unsigned long vaddr;
+        struct page  *page;
+        int           page_offset;
+
+        LASSERT (nob > 0);
+        LASSERT (niov > 0);
+        LASSERT ((rd != tx->tx_rd) == !active);
+
+        while (offset >= iov->iov_len) {
+                offset -= iov->iov_len;
+                niov--;
+                iov++;
+                LASSERT (niov > 0);
+        }
 
-        while (nob > 0) {
+        rd->rd_nfrag = 0;
+        do {
+                LASSERT (niov > 0);
+
+                vaddr = ((unsigned long)iov->iov_base) + offset;
+                page_offset = vaddr & (PAGE_SIZE - 1);
                 page = kibnal_kvaddr_to_page(vaddr);
-                if (page == NULL)
+                if (page == NULL) {
+                        CERROR ("Can't find page\n");
                         return -EFAULT;
+                }
 
-                page_offset = vaddr & (PAGE_SIZE - 1);
-                len = min(nob, (int)PAGE_SIZE - page_offset);
-                
-                kibnal_fill_ibrm(tx, page, page_offset, len, active);
-                nob -= len;
-                vaddr += len;
+                fragnob = min((int)(iov->iov_len - offset), nob);
+                fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
+
+                rc = kibnal_append_rdfrag(rd, active, page, 
+                                          page_offset, fragnob);
+                if (rc != 0)
+                        return rc;
+
+                if (offset + fragnob < iov->iov_len) {
+                        offset += fragnob;
+                } else {
+                        offset = 0;
+                        iov++;
+                        niov--;
+                }
+                nob -= fragnob;
+        } while (nob > 0);
+        
+        return 0;
+}
+
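A worked example with hypothetical numbers (assuming PAGE_SIZE == 4096) of how kibnal_setup_rd_iov() above fragments a virtually contiguous region:

/* Example: iov = { .iov_base = (void *)0x10000F00, .iov_len = 8192 },
 * offset = 0, nob = 6000; the first byte's page_offset is 0xF00 (3840), so:
 *
 *   frag[0]: 4096 - 3840       =  256 bytes  (tail of the first page)
 *   frag[1]: min(5744, 4096)   = 4096 bytes  (the whole second page)
 *   frag[2]: 6000 - 256 - 4096 = 1648 bytes  (head of the third page)
 *
 * leaving rd->rd_nfrag == 3 with the fragment lengths summing to nob. */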
+int
+kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
+                      int nkiov, lnet_kiov_t *kiov, int offset, int nob)
+{
+        int            fragnob;
+        int            rc;
+
+        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+
+        LASSERT (nob > 0);
+        LASSERT (nkiov > 0);
+        LASSERT ((rd != tx->tx_rd) == !active);
+
+        while (offset >= kiov->kiov_len) {
+                offset -= kiov->kiov_len;
+                nkiov--;
+                kiov++;
+                LASSERT (nkiov > 0);
         }
+
+        rd->rd_nfrag = 0;
+        do {
+                LASSERT (nkiov > 0);
+                fragnob = min((int)(kiov->kiov_len - offset), nob);
+                
+                rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page,
+                                          kiov->kiov_offset + offset,
+                                          fragnob);
+                if (rc != 0)
+                        return rc;
+
+                offset = 0;
+                kiov++;
+                nkiov--;
+                nob -= fragnob;
+        } while (nob > 0);
+
         return 0;
 }
+#else
+int
+kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
+               int npages, unsigned long page_offset, int nob)
+{
+        IB_ACCESS_CONTROL access = {0,};
+        FSTATUS           frc;
 
-static int
-kibnal_map_iov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
-                unsigned int niov, struct iovec *iov, int offset, int nob, int active)
+        LASSERT ((rd != tx->tx_rd) == !active);
+        LASSERT (!tx->tx_md.md_active);
+        LASSERT (tx->tx_md.md_fmrcount > 0);
+        LASSERT (page_offset < PAGE_SIZE);
+        LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT)));
+        LASSERT (npages <= LNET_MAX_IOV);
+
+        if (!active) {
+                // access.s.MWBindable = 1;
+                access.s.LocalWrite = 1;
+                access.s.RdmaWrite = 1;
+        }
+
+        /* XXX The FMR mapping of tx->tx_pages isn't implemented yet; it
+         * would register the physical page list along these lines:
+         *
+         * frc = iibt_register_physical_memory(kibnal_data.kib_hca,
+         *                                     IBNAL_RDMA_BASE,
+         *                                     tx->tx_pages, npages,
+         *                                     page_offset,
+         *                                     kibnal_data.kib_pd,
+         *                                     access,
+         *                                     &tx->tx_md.md_handle,
+         *                                     &tx->tx_md.md_addr,
+         *                                     &tx->tx_md.md_lkey,
+         *                                     &tx->tx_md.md_rkey);
+         */
+        return -EINVAL;                         /* FMR path not supported yet */
+}
+
+int
+kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
+                     unsigned int niov, struct iovec *iov, int offset, int nob)
                  
 {
-        void   *vaddr;
-        FSTATUS frc;
+        int           resid;
+        int           fragnob;
+        struct page  *page;
+        int           npages;
+        unsigned long page_offset;
+        unsigned long vaddr;
 
         LASSERT (nob > 0);
         LASSERT (niov > 0);
-        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
 
         while (offset >= iov->iov_len) {
                 offset -= iov->iov_len;
@@ -635,54 +705,47 @@ kibnal_map_iov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
                 return (-EMSGSIZE);
         }
 
-        /* our large contiguous iov could be backed by multiple physical
-         * pages. */
-        if (kibnal_whole_mem()) {
-                int rc;
-                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
-                rc = kibnal_map_rdma_iov(tx, (unsigned long)iov->iov_base + 
-                                         offset, nob, active);
-                if (rc != 0) {
-                        CERROR ("Can't map iov: %d\n", rc);
-                        return rc;
+        vaddr = ((unsigned long)iov->iov_base) + offset;
+        
+        page_offset = vaddr & (PAGE_SIZE - 1);
+        resid = nob;
+        npages = 0;
+
+        do {
+                LASSERT (npages < LNET_MAX_IOV);
+
+                page = kibnal_kvaddr_to_page(vaddr);
+                if (page == NULL) {
+                        CERROR("Can't find page for %lu\n", vaddr);
+                        return -EFAULT;
                 }
-                return 0;
-        }
 
-        vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
-        tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
+                tx->tx_pages[npages++] = kibnal_page2phys(page);
 
-        frc = iibt_register_memory(kibnal_data.kib_hca, vaddr, nob,
-                                   kibnal_data.kib_pd, access,
-                                   &tx->tx_md.md_handle, &tx->tx_md.md_lkey,
-                                   &tx->tx_md.md_rkey);
-        if (frc != 0) {
-                CERROR ("Can't map vaddr %p: %d\n", vaddr, frc);
-                return -EINVAL;
-        }
+                fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1));
+                vaddr += fragnob;
+                resid -= fragnob;
 
-        tx->tx_mapped = KIB_TX_MAPPED;
-        return (0);
+        } while (resid > 0);
+
+        return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
 }
 
-static int
-kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
-                  int nkiov, lnet_kiov_t *kiov,
-                  int offset, int nob, int active)
+int
+kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
+                      int nkiov, lnet_kiov_t *kiov, int offset, int nob)
 {
-        __u64                      *phys = NULL;
-        int                         page_offset;
-        int                         nphys;
-        int                         resid;
-        int                         phys_size = 0;
-        FSTATUS                     frc;
-        int                         i, rc = 0;
-
+        int            resid;
+        int            npages;
+        unsigned long  page_offset;
+        
         CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
 
         LASSERT (nob > 0);
         LASSERT (nkiov > 0);
-        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+        LASSERT (nkiov <= LNET_MAX_IOV);
+        LASSERT (!tx->tx_md.md_active);
+        LASSERT ((rd != tx->tx_rd) == !active);
 
         while (offset >= kiov->kiov_len) {
                 offset -= kiov->kiov_len;
@@ -692,122 +755,36 @@ kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
         }
 
         page_offset = kiov->kiov_offset + offset;
-        nphys = 1;
-
-        if (!kibnal_whole_mem()) {
-                phys_size = nkiov * sizeof (*phys);
-                LIBCFS_ALLOC(phys, phys_size);
-                if (phys == NULL) {
-                        CERROR ("Can't allocate tmp phys\n");
-                        return (-ENOMEM);
-                }
-
-                phys[0] = kibnal_page2phys(kiov->kiov_page);
-        } else {
-                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
-                kibnal_fill_ibrm(tx, kiov->kiov_page, kiov->kiov_offset, 
-                                 kiov->kiov_len, active);
-        }
-
-        resid = nob - (kiov->kiov_len - offset);
+        
+        resid = offset + nob;
+        npages = 0;
 
-        while (resid > 0) {
-                kiov++;
-                nkiov--;
+        do {
+                LASSERT (npages < LNET_MAX_IOV);
                 LASSERT (nkiov > 0);
 
-                if (kiov->kiov_offset != 0 ||
-                    ((resid > PAGE_SIZE) && 
-                     kiov->kiov_len < PAGE_SIZE)) {
+                if ((npages > 0 && kiov->kiov_offset != 0) ||
+                    (resid > kiov->kiov_len && 
+                     (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) {
                         /* Can't have gaps */
                         CERROR ("Can't make payload contiguous in I/O VM:"
-                                "page %d, offset %d, len %d \n", nphys, 
-                                kiov->kiov_offset, kiov->kiov_len);
-
-                        for (i = -nphys; i < nkiov; i++) 
-                        {
-                                CERROR("kiov[%d] %p +%d for %d\n",
-                                       i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len);
-                        }
+                                "page %d, offset %d, len %d \n",
+                                npages, kiov->kiov_offset, kiov->kiov_len);
                         
-                        rc = -EINVAL;
-                        goto out;
+                        return -EINVAL;
                 }
 
-                if (nphys == LNET_MAX_IOV) {
-                        CERROR ("payload too big (%d)\n", nphys);
-                        rc = -EMSGSIZE;
-                        goto out;
-                }
-
-                if (!kibnal_whole_mem()) {
-                        LASSERT (nphys * sizeof (*phys) < phys_size);
-                        phys[nphys] = kibnal_page2phys(kiov->kiov_page);
-                } else {
-                        if (kib_rdma_msg_len(nphys) > IBNAL_MSG_SIZE) {
-                                CERROR ("payload too big (%d)\n", nphys);
-                                rc = -EMSGSIZE;
-                                goto out;
-                        }
-                        kibnal_fill_ibrm(tx, kiov->kiov_page, 
-                                         kiov->kiov_offset, kiov->kiov_len,
-                                         active);
-                }
-
-                nphys ++;
-                resid -= PAGE_SIZE;
-        }
-
-        if (kibnal_whole_mem())
-                goto out;
-
-#if 0
-        CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset);
-        for (i = 0; i < nphys; i++)
-                CWARN ("   [%d] "LPX64"\n", i, phys[i]);
-#endif
-
-#if IBNAL_FMR
-#error "iibnal hasn't learned about FMR yet"
-        rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
-                                       phys, nphys,
-                                       &tx->tx_md.md_addr,
-                                       page_offset,
-                                       &tx->tx_md.md_handle.fmr,
-                                       &tx->tx_md.md_lkey,
-                                       &tx->tx_md.md_rkey);
-#else
-        frc = iibt_register_physical_memory(kibnal_data.kib_hca,
-                                            IBNAL_RDMA_BASE,
-                                            phys, nphys,
-                                            0,          /* offset */
-                                            kibnal_data.kib_pd,
-                                            access,
-                                            &tx->tx_md.md_handle,
-                                            &tx->tx_md.md_addr,
-                                            &tx->tx_md.md_lkey,
-                                            &tx->tx_md.md_rkey);
-#endif
-        if (frc == FSUCCESS) {
-                CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n",
-                       nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey);
-#if IBNAL_FMR
-                tx->tx_mapped = KIB_TX_MAPPED_FMR;
-#else
-                tx->tx_mapped = KIB_TX_MAPPED;
-#endif
-        } else {
-                CERROR ("Can't map phys: %d\n", frc);
-                rc = -EFAULT;
-        }
+                tx->tx_pages[npages++] = kibnal_page2phys(kiov->kiov_page);
+                resid -= kiov->kiov_len;
+                kiov++;
+                nkiov--;
+        } while (resid > 0);
 
- out:
-        if (phys != NULL)
-                LIBCFS_FREE(phys, phys_size);
-        return (rc);
+        return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
 }
+#endif
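
The contiguity test in kibnal_setup_rd_kiov() above is easy to misread, so here is a minimal standalone sketch of the same rule, using made-up names (ex_kiov, EX_PAGE_SIZE) rather than the LND's types. A kiov chain can be mapped as a single RDMA region only when every fragment after the first starts at page offset 0 and every fragment that still has data following it runs to the end of its page.

/* Sketch (not driver code): does this fragment chain describe 'nob'
 * gapless bytes?  Simplified: the offset into the first fragment is
 * assumed to have been stripped already. */
#define EX_PAGE_SIZE 4096

struct ex_kiov {
        int offset;                     /* offset of data within page    */
        int len;                        /* bytes of data in this page    */
};

static int
ex_kiov_is_gapless(const struct ex_kiov *kiov, int nkiov, int nob)
{
        int resid = nob;
        int i;

        for (i = 0; i < nkiov && resid > 0; i++) {
                if (i > 0 && kiov[i].offset != 0)
                        return 0;       /* gap before this fragment      */
                if (resid > kiov[i].len &&
                    kiov[i].offset + kiov[i].len != EX_PAGE_SIZE)
                        return 0;       /* gap after this fragment       */
                resid -= kiov[i].len;
        }
        return resid <= 0;
}
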
 
-static kib_conn_t *
+kib_conn_t *
 kibnal_find_conn_locked (kib_peer_t *peer)
 {
         struct list_head *tmp;
@@ -823,111 +800,123 @@ kibnal_find_conn_locked (kib_peer_t *peer)
 void
 kibnal_check_sends (kib_conn_t *conn)
 {
-        unsigned long   flags;
         kib_tx_t       *tx;
+        FSTATUS         frc;
         int             rc;
-        int             i;
         int             done;
+        int             i;
         int             nwork;
-        ENTRY;
 
-        spin_lock_irqsave (&conn->ibc_lock, flags);
+        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
+        
+        spin_lock(&conn->ibc_lock);
 
         LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
 
         if (list_empty(&conn->ibc_tx_queue) &&
             conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
-                spin_unlock_irqrestore(&conn->ibc_lock, flags);
+                spin_unlock(&conn->ibc_lock);
                 
                 tx = kibnal_get_idle_tx();
                 if (tx != NULL)
                         kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
 
-                spin_lock_irqsave(&conn->ibc_lock, flags);
+                spin_lock(&conn->ibc_lock);
                 
-                if (tx != NULL) {
-                        atomic_inc(&conn->ibc_refcount);
+                if (tx != NULL)
                         kibnal_queue_tx_locked(tx, conn);
-                }
         }
 
         while (!list_empty (&conn->ibc_tx_queue)) {
                 tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
 
+                LASSERT (tx->tx_queued);
                 /* We rely on this for QP sizing */
-                LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= IBNAL_TX_MAX_SG);
+                LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS);
 
                 LASSERT (conn->ibc_outstanding_credits >= 0);
                 LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
                 LASSERT (conn->ibc_credits >= 0);
                 LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
 
-                /* Not on ibc_rdma_queue */
-                LASSERT (!tx->tx_passive_rdma_wait);
-
-                if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
-                        GOTO(out, 0);
-
-                if (conn->ibc_credits == 0)     /* no credits */
-                        GOTO(out, 1);
+                if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) {
+                        CDEBUG(D_NET, "%s: posted enough\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        break;
+                }
+                
+                if (conn->ibc_credits == 0) {   /* no credits */
+                        CDEBUG(D_NET, "%s: no credits\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        break;
+                }
                 
                 if (conn->ibc_credits == 1 &&   /* last credit reserved for */
-                    conn->ibc_outstanding_credits == 0) /* giving back credits */
-                        GOTO(out, 2);
-
+                    conn->ibc_outstanding_credits == 0) { /* giving back credits */
+                        CDEBUG(D_NET, "%s: not using last credit\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        break;
+                }
+                
                 list_del (&tx->tx_list);
+                tx->tx_queued = 0;
+
+                /* NB don't drop ibc_lock before bumping tx_sending */
 
                 if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
                     (!list_empty(&conn->ibc_tx_queue) ||
                      conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
                         /* redundant NOOP */
-                        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+                        spin_unlock(&conn->ibc_lock);
                         kibnal_tx_done(tx);
-                        spin_lock_irqsave(&conn->ibc_lock, flags);
+                        spin_lock(&conn->ibc_lock);
+                        CDEBUG(D_NET, "%s: redundant noop\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
                         continue;
                 }
 
-                tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits;
-                conn->ibc_outstanding_credits = 0;
+                kibnal_pack_msg(tx->tx_msg, conn->ibc_outstanding_credits,
+                                conn->ibc_peer->ibp_nid, conn->ibc_incarnation,
+                                conn->ibc_txseq);
 
+                conn->ibc_txseq++;
+                conn->ibc_outstanding_credits = 0;
                 conn->ibc_nsends_posted++;
                 conn->ibc_credits--;
 
-                /* we only get a tx completion for the final rdma op */ 
-                tx->tx_sending = min(tx->tx_nsp, 2);
-                tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
+                /* CAVEAT EMPTOR!  This tx could be the PUT_DONE of an RDMA
+                 * PUT.  If so, it was first queued here as a PUT_REQ, sent and
+                 * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
+                 * and then re-queued here.  It's (just) possible that
+                 * tx_sending is non-zero if we've not done the tx_complete() from
+                 * the first send; hence the ++ rather than = below. */
+                tx->tx_sending++;
+
                 list_add (&tx->tx_list, &conn->ibc_active_txs);
-#if IBNAL_CKSUM
-                tx->tx_msg->ibm_cksum = 0;
-                tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob);
-                CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob);
-#endif
-                spin_unlock_irqrestore (&conn->ibc_lock, flags);
 
-                /* NB the gap between removing tx from the queue and sending it
-                 * allows message re-ordering to occur */
+                /* Drop the lock while I send (this can re-order sends) */
+                spin_unlock(&conn->ibc_lock);
 
-                LASSERT (tx->tx_nsp > 0);
+                LASSERT (tx->tx_nwrq > 0);
 
                 rc = -ECONNABORTED;
+                frc = FSUCCESS;
                 nwork = 0;
                 if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
-                        tx->tx_status = 0;
                         /* Driver only accepts 1 item at a time */
-                        for (i = 0; i < tx->tx_nsp; i++) {
-                                hexdump("tx", tx->tx_msg, sizeof(kib_msg_t));
-                                rc = iibt_postsend(conn->ibc_qp, 
-                                                   &tx->tx_wrq[i]);
-                                if (rc != 0)
+                        for (i = 0; i < tx->tx_nwrq; i++) {
+                                frc = iibt_postsend(conn->ibc_qp, 
+                                                    &tx->tx_wrq[i]);
+                                if (frc != FSUCCESS) {
+                                        rc = -EIO;
                                         break;
-                                if (wrq_signals_completion(&tx->tx_wrq[i]))
-                                        nwork++;
+                                }
                                 CDEBUG(D_NET, "posted tx wrq %p\n", 
                                        &tx->tx_wrq[i]);
                         }
                 }
 
-                spin_lock_irqsave (&conn->ibc_lock, flags);
+                spin_lock(&conn->ibc_lock);
                 if (rc != 0) {
                         /* NB credits are transferred in the actual
                          * message, which can only be the last work item */
@@ -936,18 +925,18 @@ kibnal_check_sends (kib_conn_t *conn)
                         conn->ibc_nsends_posted--;
 
                         tx->tx_status = rc;
-                        tx->tx_passive_rdma_wait = 0;
-                        tx->tx_sending -= tx->tx_nsp - nwork;
-
+                        tx->tx_waiting = 0;
+                        tx->tx_sending--;
+                        
                         done = (tx->tx_sending == 0);
                         if (done)
                                 list_del (&tx->tx_list);
                         
-                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        spin_unlock(&conn->ibc_lock);
                         
                         if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
-                                CERROR ("Error %d posting transmit to %s\n", rc,
-                                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                                CERROR ("Error %d posting transmit to %s\n", 
+                                        frc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                         else
                                 CDEBUG (D_NET, "Error %d posting transmit to %s\n",
                                         rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
@@ -958,140 +947,87 @@ kibnal_check_sends (kib_conn_t *conn)
                                 kibnal_tx_done (tx);
                         return;
                 }
-                
         }
 
-        EXIT;
-out:
-        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+        spin_unlock(&conn->ibc_lock);
 }
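
As a reading aid for the flow-control checks above, the sketch below restates in isolation the conditions under which kibnal_check_sends() will post the tx at the head of the queue. The struct is an illustrative stand-in for the relevant kib_conn_t fields, not the LND's actual definition; roughly speaking, each credit corresponds to a receive buffer pre-posted by the peer.

/* Sketch: credit-gated posting.  The last credit is never spent on
 * payload, so that credits owed back to the peer can always be returned. */
struct ex_conn_credits {
        int nsends_posted;              /* sends outstanding on the QP   */
        int queue_size;                 /* == IBNAL_MSG_QUEUE_SIZE       */
        int credits;                    /* credits I may spend           */
        int outstanding_credits;        /* credits I owe the peer        */
};

static int
ex_may_post_next(const struct ex_conn_credits *c)
{
        if (c->nsends_posted == c->queue_size)
                return 0;               /* send queue full               */
        if (c->credits == 0)
                return 0;               /* no credit to spend            */
        if (c->credits == 1 && c->outstanding_credits == 0)
                return 0;               /* reserve last credit for
                                         * returning credits             */
        return 1;
}

A NOOP at the head of the queue is additionally dropped as redundant, as in the code above, when real traffic is queued behind it or the owed credits are still below the highwater mark.
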
 
-static void
-kibnal_tx_callback (IB_WORK_COMPLETION *wc)
+void
+kibnal_tx_complete (IB_WORK_COMPLETION *wc)
 {
         kib_tx_t     *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->WorkReqId);
-        kib_conn_t   *conn;
-        unsigned long flags;
+        kib_conn_t   *conn = tx->tx_conn;
+        int           failed = wc->Status != WRStatusSuccess;
         int           idle;
 
-        conn = tx->tx_conn;
-        LASSERT (conn != NULL);
-        LASSERT (tx->tx_sending != 0);
+        CDEBUG(D_NET, "tx %p conn %p sending %d nwrq %d status %d\n", 
+               tx, conn, tx->tx_sending, tx->tx_nwrq, wc->Status);
+
+        LASSERT (tx->tx_sending > 0);
 
-        spin_lock_irqsave(&conn->ibc_lock, flags);
+        if (failed &&
+            tx->tx_status == 0 &&
+            conn->ibc_state == IBNAL_CONN_ESTABLISHED)
+                CERROR("tx -> %s type %x cookie "LPX64
+                       " sending %d waiting %d: failed %d\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                       tx->tx_msg->ibm_type, tx->tx_cookie,
+                       tx->tx_sending, tx->tx_waiting, wc->Status);
 
-        CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx,
-               tx->tx_sending, tx->tx_nsp, wc->Status);
+        spin_lock(&conn->ibc_lock);
 
         /* I could be racing with rdma completion.  Whoever makes 'tx' idle
-         * gets to free it, which also drops its ref on 'conn'.  If it's
-         * not me, then I take an extra ref on conn so it can't disappear
-         * under me. */
+         * gets to free it, which also drops its ref on 'conn'. */
 
         tx->tx_sending--;
+        conn->ibc_nsends_posted--;
+
+        if (failed) {
+                tx->tx_waiting = 0;
+                tx->tx_status = -EIO;
+        }
+        
         idle = (tx->tx_sending == 0) &&         /* This is the final callback */
-               (!tx->tx_passive_rdma_wait);     /* Not waiting for RDMA completion */
+               !tx->tx_waiting &&               /* Not waiting for peer */
+               !tx->tx_queued;                  /* Not re-queued (PUT_DONE) */
         if (idle)
                 list_del(&tx->tx_list);
 
-        CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
-               conn, conn->ibc_state, 
-               libcfs_nid2str(conn->ibc_peer->ibp_nid),
-               atomic_read (&conn->ibc_refcount));
-        atomic_inc (&conn->ibc_refcount);
-
-        if (tx->tx_sending == 0)
-                conn->ibc_nsends_posted--;
-
-        if (wc->Status != WRStatusSuccess &&
-            tx->tx_status == 0)
-                tx->tx_status = -ECONNABORTED;
-                
-        spin_unlock_irqrestore(&conn->ibc_lock, flags);
-
-        if (idle)
-                kibnal_tx_done (tx);
+        kibnal_conn_addref(conn);               /* 1 ref for me.... */
 
-        if (wc->Status != WRStatusSuccess) {
-                CERROR ("Tx completion to %s failed: %d\n", 
-                        libcfs_nid2str(conn->ibc_peer->ibp_nid),
-                        wc->Status);
-                kibnal_close_conn (conn, -ENETDOWN);
-        } else {
-                /* can I shovel some more sends out the door? */
-                kibnal_check_sends(conn);
-        }
+        spin_unlock(&conn->ibc_lock);
 
-        kibnal_put_conn (conn);
-}
-
-void 
-kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev)
-{
-        /* XXX flesh out.  this seems largely for async errors */
-        CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode);
-}
-
-void
-kibnal_ca_callback (void *ca_arg, void *cq_arg)
-{
-        IB_HANDLE cq = *(IB_HANDLE *)cq_arg;
-        IB_HANDLE ca = *(IB_HANDLE *)ca_arg;
-        IB_WORK_COMPLETION wc;
-        int armed = 0;
-
-        CDEBUG(D_NET, "ca %p cq %p\n", ca, cq);
-
-        for(;;) {
-                while (iibt_cq_poll(cq, &wc) == FSUCCESS) {
-
-                        /* We will need to rearm the CQ to avoid a potential race. */
-                        armed = 0;
-                        
-                        if (kibnal_wreqid_is_rx(wc.WorkReqId))
-                                kibnal_rx_callback(&wc);
-                        else
-                                kibnal_tx_callback(&wc);
-                }
-                if (armed)
-                        return;
-                if (iibt_cq_rearm(cq, CQEventSelNextWC) != FSUCCESS) {
-                        CERROR("rearm failed?\n");
-                        return;
-                }
-                armed = 1;
-        }
+        if (idle)
+                kibnal_tx_done (tx);
+
+        if (failed)
+                kibnal_close_conn (conn, -EIO);
+        else
+                kibnal_check_sends(conn);
+
+        kibnal_conn_decref(conn);               /* ...until here */
 }
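
The ownership rule applied in kibnal_tx_complete() above, i.e. exactly which path may free a tx, reduces to one predicate; a hypothetical restatement with shortened parameter names follows.

/* Sketch: a tx becomes idle (and may be finalized) only when all three
 * conditions hold; whichever path observes the transition under ibc_lock
 * owns the tx and calls kibnal_tx_done(). */
static int
ex_tx_is_idle(int sending, int waiting, int queued)
{
        return sending == 0 &&          /* last send completion seen     */
               !waiting &&              /* no ACK/DONE expected from peer*/
               !queued;                 /* not re-queued (PUT_DONE case) */
}
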
 
 void
 kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
 {
-        IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nsp];
-        IB_WORK_REQ         *wrq = &tx->tx_wrq[tx->tx_nsp];
-        int                       fence;
-        int                       nob = offsetof (kib_msg_t, ibm_u) + body_nob;
+        IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nwrq];
+        IB_WORK_REQ          *wrq = &tx->tx_wrq[tx->tx_nwrq];
+        int                   nob = offsetof (kib_msg_t, ibm_u) + body_nob;
 
-        LASSERT (tx->tx_nsp >= 0 && 
-                 tx->tx_nsp < sizeof(tx->tx_wrq)/sizeof(tx->tx_wrq[0]));
+        LASSERT (tx->tx_nwrq >= 0 && 
+                 tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS));
         LASSERT (nob <= IBNAL_MSG_SIZE);
-        
-        tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC;
-        tx->tx_msg->ibm_version = IBNAL_MSG_VERSION;
-        tx->tx_msg->ibm_type = type;
-#if IBNAL_CKSUM
-        tx->tx_msg->ibm_nob = nob;
-#endif
-        /* Fence the message if it's bundled with an RDMA read */
-        fence = (tx->tx_nsp > 0) &&
-                (type == IBNAL_MSG_PUT_DONE);
+
+        kibnal_init_msg(tx->tx_msg, type, body_nob);
 
         *gl = (IB_LOCAL_DATASEGMENT) {
-                .Address = tx->tx_vaddr,
+                .Address = tx->tx_hca_msg,
                 .Length  = IBNAL_MSG_SIZE,
-                .Lkey    = kibnal_lkey(kibnal_data.kib_tx_pages),
+                .Lkey    = kibnal_data.kib_whole_mem.md_lkey,
         };
 
-        wrq->WorkReqId      = kibnal_ptr2wreqid(tx, 0);
+        wrq->WorkReqId      = kibnal_ptr2wreqid(tx, IBNAL_WID_TX);
         wrq->Operation      = WROpSend;
         wrq->DSList         = gl;
         wrq->DSListDepth    = 1;
@@ -1100,365 +1036,263 @@ kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
         wrq->Req.SendRC.Options.s.SolicitedEvent         = 1;
         wrq->Req.SendRC.Options.s.SignaledCompletion     = 1;
         wrq->Req.SendRC.Options.s.ImmediateData          = 0;
-        wrq->Req.SendRC.Options.s.Fence                  = fence;
-
-        tx->tx_nsp++;
+        wrq->Req.SendRC.Options.s.Fence                  = 0; 
+        /* fence only needed on RDMA reads */
+        
+        tx->tx_nwrq++;
 }
 
-static void
-kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+int
+kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
+                  kib_rdma_desc_t *dstrd, __u64 dstcookie)
 {
-        unsigned long         flags;
+        kib_msg_t            *ibmsg = tx->tx_msg;
+        kib_rdma_desc_t      *srcrd = tx->tx_rd;
+        IB_LOCAL_DATASEGMENT *gl;
+        IB_WORK_REQ          *wrq;
+        int                   rc;
 
-        spin_lock_irqsave(&conn->ibc_lock, flags);
+#if IBNAL_USE_FMR
+        LASSERT (tx->tx_nwrq == 0);
 
-        kibnal_queue_tx_locked (tx, conn);
-        
-        spin_unlock_irqrestore(&conn->ibc_lock, flags);
-        
-        kibnal_check_sends(conn);
-}
+        gl = &tx->tx_gl[0];
+        gl->Length  = nob;
+        gl->Address = srcrd->rd_addr;
+        gl->Lkey    = srcrd->rd_key;
 
-static void
-kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid)
-{
-        unsigned long    flags;
-        kib_peer_t      *peer;
-        kib_conn_t      *conn;
-        rwlock_t        *g_lock = &kibnal_data.kib_global_lock;
+        wrq = &tx->tx_wrq[0];
 
-        /* If I get here, I've committed to send, so I complete the tx with
-         * failure on any problems */
-        
-        LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
-        LASSERT (tx->tx_nsp > 0);               /* work items have been set up */
+        wrq->WorkReqId      = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
+        wrq->Operation      = WROpRdmaWrite;
+        wrq->DSList         = gl;
+        wrq->DSListDepth    = 1;
+        wrq->MessageLen     = nob;
 
-        read_lock_irqsave(g_lock, flags);
-        
-        peer = kibnal_find_peer_locked (nid);
-        if (peer == NULL) {
-                read_unlock_irqrestore(g_lock, flags);
-                tx->tx_status = -EHOSTUNREACH;
-                kibnal_tx_done (tx);
-                return;
-        }
+        wrq->Req.SendRC.ImmediateData                = 0;
+        wrq->Req.SendRC.Options.s.SolicitedEvent     = 0;
+        wrq->Req.SendRC.Options.s.SignaledCompletion = 0;
+        wrq->Req.SendRC.Options.s.ImmediateData      = 0;
+        wrq->Req.SendRC.Options.s.Fence              = 0; 
 
-        conn = kibnal_find_conn_locked (peer);
-        if (conn != NULL) {
-                CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
-                       conn, conn->ibc_state, 
-                       libcfs_nid2str(conn->ibc_peer->ibp_nid),
-                       atomic_read (&conn->ibc_refcount));
-                atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
-                read_unlock_irqrestore(g_lock, flags);
-                
-                kibnal_queue_tx (tx, conn);
-                return;
-        }
-        
-        /* Making one or more connections; I'll need a write lock... */
-        read_unlock(g_lock);
-        write_lock(g_lock);
+        wrq->Req.SendRC.RemoteDS.Address = dstrd->rd_addr;
+        wrq->Req.SendRC.RemoteDS.Rkey    = dstrd->rd_key;
 
-        peer = kibnal_find_peer_locked (nid);
-        if (peer == NULL) {
-                write_unlock_irqrestore (g_lock, flags);
-                tx->tx_status = -EHOSTUNREACH;
-                kibnal_tx_done (tx);
-                return;
-        }
+        tx->tx_nwrq = 1;
+        rc = nob;
+#else
+        /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */
+        int              resid = nob;
+        kib_rdma_frag_t *srcfrag;
+        int              srcidx;
+        kib_rdma_frag_t *dstfrag;
+        int              dstidx;
+        int              wrknob;
 
-        conn = kibnal_find_conn_locked (peer);
-        if (conn != NULL) {
-                /* Connection exists; queue message on it */
-                CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
-                       conn, conn->ibc_state, 
-                       libcfs_nid2str(conn->ibc_peer->ibp_nid),
-                       atomic_read (&conn->ibc_refcount));
-                atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
-                write_unlock_irqrestore (g_lock, flags);
+        /* Called by scheduler */
+        LASSERT (!in_interrupt());
+
+        LASSERT (type == IBNAL_MSG_GET_DONE ||
+                 type == IBNAL_MSG_PUT_DONE);
+
+        srcidx = dstidx = 0;
+        srcfrag = &srcrd->rd_frags[0];
+        dstfrag = &dstrd->rd_frags[0];
+        rc = resid;
+
+        while (resid > 0) {
+                if (srcidx >= srcrd->rd_nfrag) {
+                        CERROR("Src buffer exhausted: %d frags\n", srcidx);
+                        rc = -EPROTO;
+                        break;
+                }
                 
-                kibnal_queue_tx (tx, conn);
-                return;
-        }
+                if (dstidx == dstrd->rd_nfrag) {
+                        CERROR("Dst buffer exhausted: %d frags\n", dstidx);
+                        rc = -EPROTO;
+                        break;
+                }
 
-        if (peer->ibp_connecting == 0) {
-                if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */
-                      time_after_eq(jiffies, peer->ibp_reconnect_time))) {
-                        write_unlock_irqrestore (g_lock, flags);
-                        tx->tx_status = -EHOSTUNREACH;
-                        kibnal_tx_done (tx);
-                        return;
+                if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) {
+                        CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n",
+                               srcidx, srcrd->rd_nfrag,
+                               dstidx, dstrd->rd_nfrag);
+                        rc = -EMSGSIZE;
+                        break;
                 }
-        
-                peer->ibp_connecting = 1;
-                kib_peer_addref(peer); /* extra ref for connd */
-        
-                spin_lock (&kibnal_data.kib_connd_lock);
-        
-                list_add_tail (&peer->ibp_connd_list,
-                               &kibnal_data.kib_connd_peers);
-                wake_up (&kibnal_data.kib_connd_waitq);
-        
-                spin_unlock (&kibnal_data.kib_connd_lock);
-        }
-        
-        /* A connection is being established; queue the message... */
-        list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
 
-        write_unlock_irqrestore (g_lock, flags);
-}
+                wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid);
 
-static int
-kibnal_start_passive_rdma (int type, int may_block, lnet_msg_t *lntmsg)
-{
-        lnet_nid_t  nid = lntmsg->msg_target.nid;
-        int         nob = lntmsg->msg_md->md_length;
-        kib_tx_t   *tx;
-        kib_msg_t  *ibmsg;
-        int         rc;
-        IB_ACCESS_CONTROL         access = {0,};
-        
-        LASSERT (type == IBNAL_MSG_PUT_RDMA || type == IBNAL_MSG_GET_RDMA);
-        LASSERT (nob > 0);
-        LASSERT (!in_interrupt());              /* Mapping could block */
+                gl = &tx->tx_gl[tx->tx_nwrq];
+                gl->Length  = wrknob;
+                gl->Address = srcfrag->rf_addr;
+                gl->Lkey    = srcrd->rd_key;
 
-        access.s.MWBindable = 1;
-        access.s.LocalWrite = 1;
-        access.s.RdmaRead = 1;
-        access.s.RdmaWrite = 1;
+                wrq = &tx->tx_wrq[tx->tx_nwrq];
 
-        tx = kibnal_get_idle_tx ();
-        if (tx == NULL) {
-                CERROR("Can't allocate %s txd for %s\n",
-                       (type == IBNAL_MSG_PUT_RDMA) ? "PUT/REPLY" : "GET",
-                       libcfs_nid2str(nid));
-                return -ENOMEM;
-        }
+                wrq->WorkReqId      = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
+                wrq->Operation      = WROpRdmaWrite;
+                wrq->DSList         = gl;
+                wrq->DSListDepth    = 1;
+                wrq->MessageLen     = wrknob;
 
-        if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) 
-                rc = kibnal_map_iov (tx, access,
-                                     lntmsg->msg_md->md_niov,
-                                     lntmsg->msg_md->md_iov.iov,
-                                     0, nob, 0);
-        else
-                rc = kibnal_map_kiov (tx, access,
-                                      lntmsg->msg_md->md_niov, 
-                                      lntmsg->msg_md->md_iov.kiov,
-                                      0, nob, 0);
+                wrq->Req.SendRC.ImmediateData                = 0;
+                wrq->Req.SendRC.Options.s.SolicitedEvent     = 0;
+                wrq->Req.SendRC.Options.s.SignaledCompletion = 0;
+                wrq->Req.SendRC.Options.s.ImmediateData      = 0;
+                wrq->Req.SendRC.Options.s.Fence              = 0; 
 
-        if (rc != 0) {
-                CERROR ("Can't map RDMA for %s: %d\n", 
-                        libcfs_nid2str(nid), rc);
-                goto failed;
-        }
-        
-        if (type == IBNAL_MSG_GET_RDMA) {
-                /* reply gets finalized when tx completes */
-                tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni, 
-                                                         lntmsg);
-                if (tx->tx_lntmsg[1] == NULL) {
-                        CERROR ("Can't create reply for GET -> %s\n",
-                                libcfs_nid2str(nid));
-                        rc = -ENOMEM;
-                        goto failed;
+                wrq->Req.SendRC.RemoteDS.Address = dstfrag->rf_addr;
+                wrq->Req.SendRC.RemoteDS.Rkey    = dstrd->rd_key;
+
+                resid -= wrknob;
+                if (wrknob < srcfrag->rf_nob) {
+                        srcfrag->rf_addr += wrknob;
+                        srcfrag->rf_nob -= wrknob;
+                } else {
+                        srcfrag++;
+                        srcidx++;
+                }
+                
+                if (wrknob < dstfrag->rf_nob) {
+                        dstfrag->rf_addr += wrknob;
+                        dstfrag->rf_nob -= wrknob;
+                } else {
+                        dstfrag++;
+                        dstidx++;
                 }
+                
+                tx->tx_nwrq++;
         }
+
+        if (rc < 0)                             /* no RDMA if completing with failure */
+                tx->tx_nwrq = 0;
+#endif
         
-        tx->tx_passive_rdma = 1;
+        ibmsg->ibm_u.completion.ibcm_status = rc;
+        ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
+        kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
 
-        ibmsg = tx->tx_msg;
+        return rc;
+}
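
The non-FMR branch of kibnal_init_rdma() above emits one RDMA-write work request per run of bytes that is contiguous on both the source and the destination. The sketch below reproduces just that splitting arithmetic with a made-up fragment type, to show how the two descriptor lists are consumed in lock-step; it is an illustration, not the driver's code.

/* Sketch: count the work requests an RDMA needs when src and dst are
 * fragmented differently.  Each WR covers min(src left, dst left, resid). */
struct ex_frag { unsigned long addr; int nob; };

static int
ex_count_rdma_wrqs(struct ex_frag *src, int nsrc,
                   struct ex_frag *dst, int ndst, int nob)
{
        int si = 0, di = 0, nwrq = 0;

        while (nob > 0 && si < nsrc && di < ndst) {
                int wrknob = src[si].nob < dst[di].nob ? src[si].nob
                                                       : dst[di].nob;
                if (wrknob > nob)
                        wrknob = nob;

                nob -= wrknob;
                nwrq++;

                src[si].nob  -= wrknob;         /* consume source run    */
                src[si].addr += wrknob;
                if (src[si].nob == 0)
                        si++;

                dst[di].nob  -= wrknob;         /* consume sink run      */
                dst[di].addr += wrknob;
                if (dst[di].nob == 0)
                        di++;
        }
        return nwrq;
}

For example, source fragments of {4096, 4096} against destination fragments of {2048, 4096, 2048} for an 8192-byte transfer yield four work requests, one per boundary on either side; this is why IBNAL_MAX_RDMA_FRAGS has to bound the worst case of both sides fragmenting independently.
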
 
-        ibmsg->ibm_u.rdma.ibrm_hdr = lntmsg->msg_hdr;
-        ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
-        /* map_kiov alrady filled the rdma descs for the whole_mem case */
-        if (!kibnal_whole_mem()) {
-                ibmsg->ibm_u.rdma.rd_key = tx->tx_md.md_rkey;
-                ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
-                ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
-                ibmsg->ibm_u.rdma.ibrm_num_descs = 1;
-        }
+void
+kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+{
+        spin_lock(&conn->ibc_lock);
+        kibnal_queue_tx_locked (tx, conn);
+        spin_unlock(&conn->ibc_lock);
+        
+        kibnal_check_sends(conn);
+}
 
-        kibnal_init_tx_msg (tx, type, 
-                            kib_rdma_msg_len(ibmsg->ibm_u.rdma.ibrm_num_descs));
+void
+kibnal_schedule_active_connect_locked (kib_peer_t *peer)
+{
+        /* Called holding kib_global_lock exclusive */
+        unsigned long flags;
 
-        CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
-               LPX64", nob %d\n",
-               tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey,
-               tx->tx_md.md_addr, nob);
-        
-        /* lntmsg gets finalized when tx completes. */
-        tx->tx_lntmsg[0] = lntmsg;
+        peer->ibp_connecting++;                 /* I'm connecting */
+        kibnal_peer_addref(peer);               /* extra ref for connd */
 
-        kibnal_launch_tx(tx, nid);
-        return (0);
+        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
 
- failed:
-        tx->tx_status = rc;
-        kibnal_tx_done (tx);
-        return (-EIO);
+        list_add_tail (&peer->ibp_connd_list, &kibnal_data.kib_connd_peers);
+        wake_up (&kibnal_data.kib_connd_waitq);
+
+        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
 }
 
 void
-kibnal_start_active_rdma (int type, int status,
-                          kib_rx_t *rx, lnet_msg_t *lntmsg, 
-                          unsigned int niov,
-                          struct iovec *iov, lnet_kiov_t *kiov,
-                          unsigned int offset, unsigned int nob)
+kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid)
 {
-        kib_msg_t    *rxmsg = rx->rx_msg;
-        kib_msg_t    *txmsg;
-        kib_tx_t     *tx;
-        IB_ACCESS_CONTROL access = {0,};
-        IB_WR_OP      rdma_op;
-        int           rc;
-        __u32         i;
+        kib_peer_t      *peer;
+        kib_conn_t      *conn;
+        unsigned long    flags;
+        rwlock_t        *g_lock = &kibnal_data.kib_global_lock;
+        int              retry;
+        int              rc;
 
-        CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n",
-               type, status, niov, offset, nob);
+        /* If I get here, I've committed to send, so I complete the tx with
+         * failure on any problems */
+        
+        LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
+        LASSERT (tx->tx_nwrq > 0);              /* work items have been set up */
 
-        /* Called by scheduler */
-        LASSERT (!in_interrupt ());
+        for (retry = 0; ; retry = 1) {
+                read_lock_irqsave(g_lock, flags);
+        
+                peer = kibnal_find_peer_locked (nid);
+                if (peer != NULL) {
+                        conn = kibnal_find_conn_locked (peer);
+                        if (conn != NULL) {
+                                kibnal_conn_addref(conn); /* 1 ref for me... */
+                                read_unlock_irqrestore(g_lock, flags);
+
+                                kibnal_queue_tx (tx, conn);
+                                kibnal_conn_decref(conn); /* ...to here */
+                                return;
+                        }
+                }
+                
+                /* Making one or more connections; I'll need a write lock... */
+                read_unlock(g_lock);
+                write_lock(g_lock);
 
-        /* Either all pages or all vaddrs */
-        LASSERT (!(kiov != NULL && iov != NULL));
+                peer = kibnal_find_peer_locked (nid);
+                if (peer != NULL)
+                        break;
 
-        /* No data if we're completing with failure */
-        LASSERT (status == 0 || nob == 0);
+                write_unlock_irqrestore(g_lock, flags);
 
-        LASSERT (type == IBNAL_MSG_GET_DONE ||
-                 type == IBNAL_MSG_PUT_DONE);
+                if (retry) {
+                        CERROR("Can't find peer %s\n", libcfs_nid2str(nid));
 
-        /* Flag I'm completing the RDMA.  Even if I fail to send the
-         * completion message, I will have tried my best so further
-         * attempts shouldn't be tried. */
-        LASSERT (!rx->rx_rdma);
-        rx->rx_rdma = 1;
+                        tx->tx_status = -EHOSTUNREACH;
+                        tx->tx_waiting = 0;
+                        kibnal_tx_done (tx);
+                        return;
+                }
 
-        if (type == IBNAL_MSG_GET_DONE) {
-                rdma_op  = WROpRdmaWrite;
-                LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);
-        } else {
-                access.s.LocalWrite = 1;
-                rdma_op  = WROpRdmaRead;
-                LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
+                rc = kibnal_add_persistent_peer(nid);
+                if (rc != 0) {
+                        CERROR("Can't add peer %s: %d\n",
+                               libcfs_nid2str(nid), rc);
+                        
+                        tx->tx_status = -EHOSTUNREACH;
+                        tx->tx_waiting = 0;
+                        kibnal_tx_done (tx);
+                        return;
+                }
         }
 
-        tx = kibnal_get_idle_tx ();
-        if (tx == NULL) {
-                CERROR ("tx descs exhausted on RDMA from %s"
-                        " completing locally with failure\n",
-                        libcfs_nid2str(rx->rx_conn->ibc_peer->ibp_nid));
-                lnet_finalize (kibnal_data.kib_ni, lntmsg, -ENOMEM);
+        conn = kibnal_find_conn_locked (peer);
+        if (conn != NULL) {
+                /* Connection exists; queue message on it */
+                kibnal_conn_addref(conn);       /* 1 ref for me... */
+                write_unlock_irqrestore(g_lock, flags);
+                
+                kibnal_queue_tx (tx, conn);
+                kibnal_conn_decref(conn);       /* ...until here */
                 return;
         }
-        LASSERT (tx->tx_nsp == 0);
-                        
-        if (nob == 0) 
-                GOTO(init_tx, 0);
-
-        /* We actually need to transfer some data (the transfer
-         * size could get truncated to zero when the incoming
-         * message is matched) */
-        if (kiov != NULL)
-                rc = kibnal_map_kiov (tx, access, niov, kiov, offset, nob, 1);
-        else
-                rc = kibnal_map_iov (tx, access, niov, iov, offset, nob, 1);
-        
-        if (rc != 0) {
-                CERROR ("Can't map RDMA -> %s: %d\n", 
-                        libcfs_nid2str(rx->rx_conn->ibc_peer->ibp_nid), rc);
-                /* We'll skip the RDMA and complete with failure. */
-                status = rc;
-                nob = 0;
-                GOTO(init_tx, rc);
-        } 
-
-        if (!kibnal_whole_mem()) {
-                tx->tx_msg->ibm_u.rdma.rd_key = tx->tx_md.md_lkey;
-                tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
-                tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
-                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1;
-        }
-
-        /* XXX ugh.  different page-sized hosts. */ 
-        if (tx->tx_msg->ibm_u.rdma.ibrm_num_descs !=
-            rxmsg->ibm_u.rdma.ibrm_num_descs) {
-                CERROR("tx descs (%u) != rx descs (%u)\n", 
-                       tx->tx_msg->ibm_u.rdma.ibrm_num_descs,
-                       rxmsg->ibm_u.rdma.ibrm_num_descs);
-                /* We'll skip the RDMA and complete with failure. */
-                status = rc;
-                nob = 0;
-                GOTO(init_tx, rc);
-        }
-
-        /* map_kiov filled in the rdma descs which describe our side of the
-         * rdma transfer. */
-        /* ibrm_num_descs was verified in rx_callback */
-        for(i = 0; i < rxmsg->ibm_u.rdma.ibrm_num_descs; i++) {
-                kib_rdma_desc_t *ldesc, *rdesc; /* local, remote */
-                IB_LOCAL_DATASEGMENT *ds = &tx->tx_gl[i];
-                IB_WORK_REQ  *wrq = &tx->tx_wrq[i];
-
-                ldesc = &tx->tx_msg->ibm_u.rdma.ibrm_desc[i];
-                rdesc = &rxmsg->ibm_u.rdma.ibrm_desc[i];
-
-                ds->Address = ldesc->rd_addr;
-                ds->Length  = ldesc->rd_nob;
-                ds->Lkey    = tx->tx_msg->ibm_u.rdma.rd_key;
-
-                memset(wrq, 0, sizeof(*wrq));
-                wrq->WorkReqId      = kibnal_ptr2wreqid(tx, 0);
-                wrq->Operation      = rdma_op;
-                wrq->DSList         = ds;
-                wrq->DSListDepth    = 1;
-                wrq->MessageLen     = ds->Length;
-                wrq->Req.SendRC.ImmediateData  = 0;
-                wrq->Req.SendRC.Options.s.SolicitedEvent         = 0;
-                wrq->Req.SendRC.Options.s.SignaledCompletion     = 0;
-                wrq->Req.SendRC.Options.s.ImmediateData          = 0;
-                wrq->Req.SendRC.Options.s.Fence                  = 0;
-                wrq->Req.SendRC.RemoteDS.Address = rdesc->rd_addr;
-                wrq->Req.SendRC.RemoteDS.Rkey = rxmsg->ibm_u.rdma.rd_key;
 
-                /* only the last rdma post triggers tx completion */
-                if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1)
-                        wrq->Req.SendRC.Options.s.SignaledCompletion = 1;
+        if (peer->ibp_connecting == 0) {
+                if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */
+                      time_after_eq(jiffies, peer->ibp_reconnect_time))) {
+                        write_unlock_irqrestore(g_lock, flags);
+                        tx->tx_status = -EHOSTUNREACH;
+                        tx->tx_waiting = 0;
+                        kibnal_tx_done (tx);
+                        return;
+                }
 
-                tx->tx_nsp++;
+                kibnal_schedule_active_connect_locked(peer);
         }
-
-init_tx:
-        txmsg = tx->tx_msg;
-
-        txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
-        txmsg->ibm_u.completion.ibcm_status = status;
         
-        kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
+        /* A connection is being established; queue the message... */
+        list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
 
-        if (status == 0 && nob != 0) {
-                LASSERT (tx->tx_nsp > 1);
-                /* RDMA: lntmsg gets finalized when the tx completes.  This
-                 * is after the completion message has been sent, which in
-                 * turn is after the RDMA has finished. */
-                tx->tx_lntmsg[0] = lntmsg;
-        } else {
-                LASSERT (tx->tx_nsp == 1);
-                /* No RDMA: local completion happens now! */
-                CDEBUG(D_WARNING,"No data: immediate completion\n");
-                lnet_finalize (kibnal_data.kib_ni, lntmsg,
-                              status == 0 ? 0 : -EIO);
-        }
-
-        /* +1 ref for this tx... */
-        CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
-               rx->rx_conn, rx->rx_conn->ibc_state, 
-               libcfs_nid2str(rx->rx_conn->ibc_peer->ibp_nid),
-               atomic_read (&rx->rx_conn->ibc_refcount));
-        atomic_inc (&rx->rx_conn->ibc_refcount);
-        /* ...and queue it up */
-        kibnal_queue_tx(tx, rx->rx_conn);
+        write_unlock_irqrestore(g_lock, flags);
 }
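
kibnal_launch_tx() above uses a common rwlock pattern: look the peer up under the read lock for the fast path, and only take the write lock (re-checking, since another thread may have won the race) when a peer has to be created. A simplified sketch of that pattern follows; ex_lookup_locked() and ex_create() are hypothetical helpers standing in for kibnal_find_peer_locked() and kibnal_add_persistent_peer().

/* Sketch: read-mostly lookup with create-under-write-lock and re-check. */
struct ex_peer;
struct ex_peer *ex_lookup_locked(lnet_nid_t nid);       /* hypothetical  */
struct ex_peer *ex_create(lnet_nid_t nid);              /* hypothetical  */

static struct ex_peer *
ex_find_or_create(rwlock_t *g_lock, lnet_nid_t nid)
{
        struct ex_peer *peer;
        unsigned long   flags;

        read_lock_irqsave(g_lock, flags);
        peer = ex_lookup_locked(nid);
        if (peer != NULL) {
                read_unlock_irqrestore(g_lock, flags);
                return peer;
        }
        read_unlock_irqrestore(g_lock, flags);

        write_lock_irqsave(g_lock, flags);
        peer = ex_lookup_locked(nid);           /* may have raced in     */
        if (peer == NULL)
                peer = ex_create(nid);
        write_unlock_irqrestore(g_lock, flags);

        return peer;
}
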
 
 int
@@ -1469,7 +1303,7 @@ kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
         lnet_process_id_t target = lntmsg->msg_target;
         int               target_is_router = lntmsg->msg_target_is_router;
         int               routing = lntmsg->msg_routing;
-        unsigned int      payload_niov = lntmsg->msg_niov;
+        unsigned int      payload_niov = lntmsg->msg_niov; 
         struct iovec     *payload_iov = lntmsg->msg_iov; 
         lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
         unsigned int      payload_offset = lntmsg->msg_offset;
@@ -1477,17 +1311,18 @@ kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
         kib_msg_t        *ibmsg;
         kib_tx_t         *tx;
         int               nob;
+        int               rc;
 
         /* NB 'private' is different depending on what we're sending.... */
 
-        CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", 
+        CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
                payload_nob, payload_niov, libcfs_id2str(target));
 
         LASSERT (payload_nob == 0 || payload_niov > 0);
         LASSERT (payload_niov <= LNET_MAX_IOV);
 
-        /* Thread context if we're sending payload */
-        LASSERT (!in_interrupt() || payload_nob == 0);
+        /* Thread context */
+        LASSERT (!in_interrupt());
         /* payload is either all vaddrs or all pages */
         LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
 
@@ -1503,37 +1338,132 @@ kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
         case LNET_MSG_GET:
                 if (routing || target_is_router)
                         break;                  /* send IMMEDIATE */
-
+                
                 /* is the REPLY message too small for RDMA? */
                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
                 if (nob <= IBNAL_MSG_SIZE)
                         break;                  /* send IMMEDIATE */
 
-                return kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, 1, lntmsg);
+                tx = kibnal_get_idle_tx();
+                if (tx == NULL) {
+                        CERROR("Can't allocate txd for GET to %s\n",
+                               libcfs_nid2str(target.nid));
+                        return -ENOMEM;
+                }
+                
+                ibmsg = tx->tx_msg;
+                ibmsg->ibm_u.get.ibgm_hdr = *hdr;
+                ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
+
+                if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
+                        rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd,
+                                                 0,
+                                                 lntmsg->msg_md->md_niov,
+                                                 lntmsg->msg_md->md_iov.iov,
+                                                 0, lntmsg->msg_md->md_length);
+                else
+                        rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd,
+                                                  0,
+                                                  lntmsg->msg_md->md_niov,
+                                                  lntmsg->msg_md->md_iov.kiov,
+                                                  0, lntmsg->msg_md->md_length);
+                if (rc != 0) {
+                        CERROR("Can't setup GET sink for %s: %d\n",
+                               libcfs_nid2str(target.nid), rc);
+                        kibnal_tx_done(tx);
+                        return -EIO;
+                }
+
+#if IBNAL_USE_FMR
+                nob = sizeof(kib_get_msg_t);
+#else
+                {
+                        int n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
+                        
+                        nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
+                }
+#endif
+                kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);
+
+                tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni,
+                                                         lntmsg);
+                if (tx->tx_lntmsg[1] == NULL) {
+                        CERROR("Can't create reply for GET -> %s\n",
+                               libcfs_nid2str(target.nid));
+                        kibnal_tx_done(tx);
+                        return -EIO;
+                }
+
+                tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg[0,1] on completion */
+                tx->tx_waiting = 1;             /* waiting for GET_DONE */
+                kibnal_launch_tx(tx, target.nid);
+                return 0;
 
         case LNET_MSG_REPLY: {
                 /* reply's 'private' is the incoming receive */
                 kib_rx_t *rx = private;
 
-                LASSERT (routing || rx != NULL);
+                LASSERT(routing || rx != NULL);
 
-                /* RDMA reply expected? */
                 if (!routing && rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
-                        /* Incoming message consistent with RDMA */
-                        if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_RDMA) {
-                                CERROR ("REPLY to %s bad ibm type %d!!!\n",
-                                        libcfs_nid2str(target.nid), 
-                                        rx->rx_msg->ibm_type);
-                                return (-EIO);
+                        /* Incoming message consistent with RDMA? */
+                        if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_REQ) {
+                                CERROR("REPLY to %s bad msg type %x!!!\n",
+                                       libcfs_nid2str(target.nid), 
+                                       rx->rx_msg->ibm_type);
+                                return -EIO;
                         }
 
-                        kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
-                                                 rx, lntmsg, payload_niov, 
-                                                 payload_iov, payload_kiov,
-                                                 payload_offset, payload_nob);
-                        return (0);
+                        /* NB handle_rx() will send GET_NAK when I return to
+                         * it from here, unless I set rx_responded! */
+
+                        tx = kibnal_get_idle_tx();
+                        if (tx == NULL) {
+                                CERROR("Can't get tx for REPLY to %s\n",
+                                       libcfs_nid2str(target.nid));
+                                return -ENOMEM;
+                        }
+
+                        if (payload_nob == 0)
+                                rc = 0;
+                        else if (payload_kiov == NULL)
+                                rc = kibnal_setup_rd_iov(
+                                        tx, tx->tx_rd, 1, 
+                                        payload_niov, payload_iov, 
+                                        payload_offset, payload_nob);
+                        else
+                                rc = kibnal_setup_rd_kiov(
+                                        tx, tx->tx_rd, 1,
+                                        payload_niov, payload_kiov,
+                                        payload_offset, payload_nob);
+                        if (rc != 0) {
+                                CERROR("Can't setup GET src for %s: %d\n",
+                                       libcfs_nid2str(target.nid), rc);
+                                kibnal_tx_done(tx);
+                                return -EIO;
+                        }
+                
+                        rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, 
+                                              payload_nob,
+                                              &rx->rx_msg->ibm_u.get.ibgm_rd,
+                                              rx->rx_msg->ibm_u.get.ibgm_cookie);
+                        if (rc < 0) {
+                                CERROR("Can't setup rdma for GET from %s: %d\n", 
+                                       libcfs_nid2str(target.nid), rc);
+                        } else if (rc == 0) {
+                                /* No RDMA: local completion may happen now! */
+                                lnet_finalize (kibnal_data.kib_ni, lntmsg, 0);
+                        } else {
+                                /* RDMA: lnet_finalize(lntmsg) when it
+                                 * completes */
+                                tx->tx_lntmsg[0] = lntmsg;
+                        }
+
+                        kibnal_queue_tx(tx, rx->rx_conn);
+                        rx->rx_responded = 1;
+                        return (rc >= 0) ? 0 : -EIO;
                 }
-                /* Fall through to handle like PUT */
+                /* fall through to handle like PUT */
         }
 
         case LNET_MSG_PUT:
@@ -1541,73 +1471,103 @@ kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
                 if (nob <= IBNAL_MSG_SIZE)
                         break;                  /* send IMMEDIATE */
-                
-                return kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
-                                                 !(routing || type == LNET_MSG_REPLY),
-                                                 lntmsg);
+
+                tx = kibnal_get_idle_tx();
+                if (tx == NULL) {
+                        CERROR("Can't allocate %s txd for %s\n",
+                               type == LNET_MSG_PUT ? "PUT" : "REPLY",
+                               libcfs_nid2str(target.nid));
+                        return -ENOMEM;
+                }
+
+                if (payload_kiov == NULL)
+                        rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 1,
+                                                 payload_niov, payload_iov,
+                                                 payload_offset, payload_nob);
+                else
+                        rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 1,
+                                                  payload_niov, payload_kiov,
+                                                  payload_offset, payload_nob);
+                if (rc != 0) {
+                        CERROR("Can't setup PUT src for %s: %d\n",
+                               libcfs_nid2str(target.nid), rc);
+                        kibnal_tx_done(tx);
+                        return -EIO;
+                }
+
+                ibmsg = tx->tx_msg;
+                ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
+                ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
+                kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
+
+                tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
+                tx->tx_waiting = 1;             /* waiting for PUT_{ACK,NAK} */
+                kibnal_launch_tx(tx, target.nid);
+                return 0;
         }
 
-        /* Send IMMEDIATE */
+        /* send IMMEDIATE */
+
+        LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
+                 <= IBNAL_MSG_SIZE);
 
         tx = kibnal_get_idle_tx();
         if (tx == NULL) {
-                CERROR ("Can't send %d to %s: tx descs exhausted%s\n", 
-                        type, libcfs_nid2str(target.nid), 
-                        in_interrupt() ? " (intr)" : "");
-                return (-ENOMEM);
+                CERROR ("Can't send %d to %s: tx descs exhausted\n",
+                        type, libcfs_nid2str(target.nid));
+                return -ENOMEM;
         }
 
         ibmsg = tx->tx_msg;
         ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
 
         if (payload_kiov != NULL)
-                lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg, 
+                lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg,
                                     offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
-                                    payload_niov, payload_kiov, 
+                                    payload_niov, payload_kiov,
                                     payload_offset, payload_nob);
         else
-                lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg, 
+                lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg,
                                    offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
-                                   payload_niov, payload_iov, 
+                                   payload_niov, payload_iov,
                                    payload_offset, payload_nob);
 
-        kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
-                            offsetof(kib_immediate_msg_t, 
-                                     ibim_payload[payload_nob]));
-
-        /* lntmsg gets finalized when tx completes */
-        tx->tx_lntmsg[0] = lntmsg;
+        nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
+        kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob);
 
+        tx->tx_lntmsg[0] = lntmsg;              /* finalise lntmsg on completion */
         kibnal_launch_tx(tx, target.nid);
-        return (0);
+        return 0;
 }
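
The dispatch in kibnal_send() above hinges on one size test: a payload travels inline as IBNAL_MSG_IMMEDIATE only if the whole wire message, header included, still fits in a single pre-posted buffer; otherwise a PUT_REQ or GET_REQ is sent and the bulk moves by RDMA. The sketch below shows the shape of that test with a stand-in struct and caller-supplied buffer size; the real code compares against IBNAL_MSG_SIZE using kib_msg_t.

#include <stddef.h>

/* Sketch: stand-in message layout; field sizes are made up. */
struct ex_msg {
        unsigned int hdr[16];           /* wire header                   */
        char         payload[0];        /* inline payload follows        */
};

static int
ex_send_as_immediate(unsigned int payload_nob, unsigned int msg_size)
{
        /* header + payload must fit in one RX buffer to go inline */
        return offsetof(struct ex_msg, payload[payload_nob]) <= msg_size;
}

For GETs the same test is applied to the REPLY that would come back (md_length), since it is the reply payload that decides whether a GET sink descriptor must be set up.
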
 
 int
-kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
-             int delayed, unsigned int niov, 
-             struct iovec *iov, lnet_kiov_t *kiov, 
+kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+             unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
              unsigned int offset, unsigned int mlen, unsigned int rlen)
 {
         kib_rx_t    *rx = private;
         kib_msg_t   *rxmsg = rx->rx_msg;
-        int          msg_nob;
+        kib_conn_t  *conn = rx->rx_conn;
+        kib_tx_t    *tx;
+        kib_msg_t   *txmsg;
+        int          nob;
         int          rc = 0;
-
+        
         LASSERT (mlen <= rlen);
-        LASSERT (!in_interrupt ());
+        LASSERT (!in_interrupt());
         /* Either all pages or all vaddrs */
         LASSERT (!(kiov != NULL && iov != NULL));
 
         switch (rxmsg->ibm_type) {
         default:
                 LBUG();
-
+                
         case IBNAL_MSG_IMMEDIATE:
-                msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
-                if (msg_nob > rx->rx_nob) {
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+                if (nob > rx->rx_nob) {
                         CERROR ("Immediate message from %s too big: %d(%d)\n",
                                 libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
-                                msg_nob, rx->rx_nob);
+                                nob, rx->rx_nob);
                         rc = -EPROTO;
                         break;
                 }
@@ -1618,41 +1578,84 @@ kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
                                             offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
                                             mlen);
                 else
-                        lnet_copy_flat2iov(niov, iov, offset,
-                                           IBNAL_MSG_SIZE, rxmsg,
-                                           offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
-                                           mlen);
+                        lnet_copy_flat2iov(niov, iov, offset,
+                                           IBNAL_MSG_SIZE, rxmsg,
+                                           offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                           mlen);
+                lnet_finalize (ni, lntmsg, 0);
+                break;
+
+        case IBNAL_MSG_PUT_REQ:
+                if (mlen == 0) {
+                        lnet_finalize(ni, lntmsg, 0);
+                        kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0,
+                                               rxmsg->ibm_u.putreq.ibprm_cookie);
+                        break;
+                }
+                
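+                /* allocate a tx to send a PUT_ACK describing our sink buffer */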
+                tx = kibnal_get_idle_tx();
+                if (tx == NULL) {
+                        CERROR("Can't allocate tx for %s\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        /* Not replying will break the connection */
+                        rc = -ENOMEM;
+                        break;
+                }
+
+                txmsg = tx->tx_msg;
+                if (kiov == NULL)
+                        rc = kibnal_setup_rd_iov(tx, 
+                                                 &txmsg->ibm_u.putack.ibpam_rd,
+                                                 0,
+                                                 niov, iov, offset, mlen);
+                else
+                        rc = kibnal_setup_rd_kiov(tx,
+                                                  &txmsg->ibm_u.putack.ibpam_rd,
+                                                  0,
+                                                  niov, kiov, offset, mlen);
+                if (rc != 0) {
+                        CERROR("Can't setup PUT sink for %s: %d\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+                        kibnal_tx_done(tx);
+                        /* tell peer it's over */
+                        kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, rc,
+                                               rxmsg->ibm_u.putreq.ibprm_cookie);
+                        break;
+                }
 
-                lnet_finalize (ni, lntmsg, 0);
+                txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
+                txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
+#if IBNAL_USE_FMR
+                nob = sizeof(kib_putack_msg_t);
+#else
+                {
+                        int n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag;
+
+                        nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
+                }
+#endif
+                kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob);
+
+                tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
+                tx->tx_waiting = 1;             /* waiting for PUT_DONE */
+                kibnal_queue_tx(tx, conn);
                 break;
 
-        case IBNAL_MSG_GET_RDMA:
+        case IBNAL_MSG_GET_REQ:
                 LASSERT (lntmsg == NULL);       /* no need to finalise */
-                if (!rx->rx_rdma) {
+                if (!rx->rx_responded) {
                         /* GET didn't match anything */
-                        kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -ENODATA,
-                                                  rx, NULL, 0, NULL, NULL, 0, 0);
+                        kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE, 
+                                               -ENODATA,
+                                               rxmsg->ibm_u.get.ibgm_cookie);
                 }
                 break;
-
-        case IBNAL_MSG_PUT_RDMA:
-                kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0, rx, lntmsg, 
-                                          niov, iov, kiov, offset, mlen);
-                break;
         }
 
         kibnal_post_rx(rx, 1);
         return rc;
 }
 
-/*****************************************************************************
- * the rest of this file concerns connection management.  active connetions
- * start with connect_peer, passive connections start with passive_callback.
- * active disconnects start with conn_close, cm_callback starts passive
- * disconnects and contains the guts of how the disconnect state machine
- * progresses. 
- *****************************************************************************/
-
 int
 kibnal_thread_start (int (*fn)(void *arg), void *arg)
 {
@@ -1665,45 +1668,82 @@ kibnal_thread_start (int (*fn)(void *arg), void *arg)
         return (0);
 }
 
-static void
+void
 kibnal_thread_fini (void)
 {
         atomic_dec (&kibnal_data.kib_nthreads);
 }
 
-/* this can be called by anyone at any time to close a connection.  if
- * the connection is still established it heads to the connd to start
- * the disconnection in a safe context.  It has no effect if called
- * on a connection that is already disconnecting */
 void
-kibnal_close_conn_locked (kib_conn_t *conn, int error)
+kibnal_schedule_conn (kib_conn_t *conn)
 {
-        /* This just does the immmediate housekeeping, and schedules the
-         * connection for the connd to finish off.
-         * Caller holds kib_global_lock exclusively in irq context */
-        kib_peer_t   *peer = conn->ibc_peer;
+        unsigned long flags;
 
-        KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_CONNECTING,
-                                    IBNAL_CONN_DISCONNECTED);
+        kibnal_conn_addref(conn);               /* ++ref for connd */
+        
+        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
 
-        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
-                return; /* already disconnecting */
+        list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+        wake_up (&kibnal_data.kib_connd_waitq);
+                
+        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
+}
 
-        CDEBUG (error == 0 ? D_NET : D_ERROR,
-                "closing conn to %s: error %d\n", 
-                libcfs_nid2str(peer->ibp_nid), error);
+void
+kibnal_close_conn_locked (kib_conn_t *conn, int error)
+{
+        /* This just does the immediate housekeeping to start shutdown of an
+         * established connection.  'error' is zero for a normal shutdown.
+         * Caller holds kib_global_lock exclusively in irq context */
+        kib_peer_t       *peer = conn->ibc_peer;
+        
+        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
 
-        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
-                /* kib_connd_conns takes ibc_list's ref */
-                list_del (&conn->ibc_list);
+        if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
+                return; /* already being handled  */
+        
+        /* NB Can't take ibc_lock here (could be in IRQ context), without
+         * risking deadlock, so access to ibc_{tx_queue,active_txs} is racy */
+
+        if (error == 0 &&
+            list_empty(&conn->ibc_tx_queue) &&
+            list_empty(&conn->ibc_active_txs)) {
+                CDEBUG(D_NET, "closing conn to %s"
+                       " rx# "LPD64" tx# "LPD64"\n", 
+                       libcfs_nid2str(peer->ibp_nid),
+                       conn->ibc_txseq, conn->ibc_rxseq);
         } else {
-                /* new ref for kib_connd_conns */
-                CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
-                       conn, conn->ibc_state, 
-                       libcfs_nid2str(conn->ibc_peer->ibp_nid),
-                       atomic_read (&conn->ibc_refcount));
-                atomic_inc (&conn->ibc_refcount);
+                CERROR("Closing conn to %s: error %d%s%s"
+                       " rx# "LPD64" tx# "LPD64"\n",
+                       libcfs_nid2str(peer->ibp_nid), error,
+                       list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
+                       list_empty(&conn->ibc_active_txs) ? "" : "(waiting)",
+                       conn->ibc_txseq, conn->ibc_rxseq);
+#if 0
+                /* can't skip down the queue without holding ibc_lock (see above) */
+                list_for_each(tmp, &conn->ibc_tx_queue) {
+                        kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
+                        
+                        CERROR("   queued tx type %x cookie "LPX64
+                               " sending %d waiting %d ticks %ld/%d\n", 
+                               tx->tx_msg->ibm_type, tx->tx_cookie, 
+                               tx->tx_sending, tx->tx_waiting,
+                               (long)(tx->tx_deadline - jiffies), HZ);
+                }
+
+                list_for_each(tmp, &conn->ibc_active_txs) {
+                        kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
+                        
+                        CERROR("   active tx type %x cookie "LPX64
+                               " sending %d waiting %d ticks %ld/%d\n", 
+                               tx->tx_msg->ibm_type, tx->tx_cookie, 
+                               tx->tx_sending, tx->tx_waiting,
+                               (long)(tx->tx_deadline - jiffies), HZ);
+                }
+#endif
         }
+
+        list_del (&conn->ibc_list);
         
         if (list_empty (&peer->ibp_conns) &&    /* no more conns */
             peer->ibp_persistence == 0 &&       /* non-persistent peer */
@@ -1711,29 +1751,119 @@ kibnal_close_conn_locked (kib_conn_t *conn, int error)
                 kibnal_unlink_peer_locked (peer);
         }
 
-        conn->ibc_state = IBNAL_CONN_SEND_DREQ;
+        kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTING);
 
-        spin_lock (&kibnal_data.kib_connd_lock);
+        kibnal_schedule_conn(conn);
+        kibnal_conn_decref(conn);               /* lose ibc_list's ref */
+}
 
-        list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
-        wake_up (&kibnal_data.kib_connd_waitq);
+void
+kibnal_close_conn (kib_conn_t *conn, int error)
+{
+        unsigned long flags;
+        
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+
+        kibnal_close_conn_locked (conn, error);
+        
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+}
+
+void
+kibnal_handle_early_rxs(kib_conn_t *conn)
+{
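+        /* handle rxs that arrived before the connection was established */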
+        unsigned long    flags;
+        kib_rx_t        *rx;
+
+        LASSERT (!in_interrupt());
+        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
+        
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+        while (!list_empty(&conn->ibc_early_rxs)) {
+                rx = list_entry(conn->ibc_early_rxs.next,
+                                kib_rx_t, rx_list);
+                list_del(&rx->rx_list);
+                write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
                 
-        spin_unlock (&kibnal_data.kib_connd_lock);
+                kibnal_handle_rx(rx);
+                
+                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+        }
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
 }
 
 void
-kibnal_close_conn (kib_conn_t *conn, int error)
+kibnal_conn_disconnected(kib_conn_t *conn)
 {
-        unsigned long     flags;
+        static IB_QP_ATTRIBUTES_MODIFY qpam = {.RequestState = QPStateError};
 
-        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+        LIST_HEAD        (zombies); 
+        struct list_head *tmp;
+        struct list_head *nxt;
+        kib_tx_t         *tx;
+        FSTATUS           frc;
 
-        kibnal_close_conn_locked (conn, error);
+        LASSERT (conn->ibc_state >= IBNAL_CONN_INIT_QP);
+
+        kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED);
+
+        /* move QP to error state to make posted work items complete */
+        frc = iibt_qp_modify(conn->ibc_qp, &qpam, NULL);
+        if (frc != FSUCCESS)
+                CERROR("can't move qp state to error: %d\n", frc);
+
+        spin_lock(&conn->ibc_lock);
+
+        /* Complete all tx descs not waiting for sends to complete.
+         * NB we should be safe from RDMA now that the QP has changed state */
+
+        list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
+                tx = list_entry (tmp, kib_tx_t, tx_list);
+
+                LASSERT (tx->tx_queued);
+
+                tx->tx_status = -ECONNABORTED;
+                tx->tx_queued = 0;
+                tx->tx_waiting = 0;
+                
+                if (tx->tx_sending != 0)
+                        continue;
+
+                list_del (&tx->tx_list);
+                list_add (&tx->tx_list, &zombies);
+        }
+
+        list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
+                tx = list_entry (tmp, kib_tx_t, tx_list);
+
+                LASSERT (!tx->tx_queued);
+                LASSERT (tx->tx_waiting ||
+                         tx->tx_sending != 0);
+
+                tx->tx_status = -ECONNABORTED;
+                tx->tx_waiting = 0;
+                
+                if (tx->tx_sending != 0)
+                        continue;
+
+                list_del (&tx->tx_list);
+                list_add (&tx->tx_list, &zombies);
+        }
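+        /* NB txs with sends still outstanding are left on their lists for now */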
         
-        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+        spin_unlock(&conn->ibc_lock);
+
+        while (!list_empty(&zombies)) {
+                tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+                list_del(&tx->tx_list);
+                kibnal_tx_done (tx);
+        }
+
+        kibnal_handle_early_rxs(conn);
 }
 
-static void
+void
 kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
 {
         LIST_HEAD        (zombies);
@@ -1742,13 +1872,14 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
 
         LASSERT (rc != 0);
 
-        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
 
         LASSERT (peer->ibp_connecting != 0);
         peer->ibp_connecting--;
 
         if (peer->ibp_connecting != 0) {
-                /* another connection attempt under way (loopback?)... */
+                /* another connection attempt under way (e.g. STALE on first
+                 * attempt)... */
                 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
                 return;
         }
@@ -1763,18 +1894,12 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
                         MIN(peer->ibp_reconnect_interval,
                             *kibnal_tunables.kib_max_reconnect_interval);
                 
-                peer->ibp_reconnect_time = jiffies +
+                peer->ibp_reconnect_time = jiffies + 
                                            peer->ibp_reconnect_interval * HZ;
-        
-                /* Take peer's blocked blocked transmits; I'll complete
-                 * them with error */
-                while (!list_empty (&peer->ibp_tx_queue)) {
-                        tx = list_entry (peer->ibp_tx_queue.next,
-                                         kib_tx_t, tx_list);
-                        
-                        list_del (&tx->tx_list);
-                        list_add_tail (&tx->tx_list, &zombies);
-                }
+
+                /* Take peer's blocked transmits to complete with error */
+                list_add(&zombies, &peer->ibp_tx_queue);
+                list_del_init(&peer->ibp_tx_queue);
                 
                 if (kibnal_peer_active(peer) &&
                     (peer->ibp_persistence == 0)) {
@@ -1786,169 +1911,274 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
                 LASSERT (list_empty(&peer->ibp_tx_queue));
         }
         
-        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
-
-        if (!list_empty (&zombies))
-                CERROR ("Deleting messages for %s: connection failed\n",
-                        libcfs_nid2str(peer->ibp_nid));
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
 
-        while (!list_empty (&zombies)) {
+        if (list_empty (&zombies))
+                return;
+        
+        CERROR ("Deleting messages for %s: connection failed\n",
+                libcfs_nid2str(peer->ibp_nid));
+        do {
                 tx = list_entry (zombies.next, kib_tx_t, tx_list);
 
                 list_del (&tx->tx_list);
                 /* complete now */
                 tx->tx_status = -EHOSTUNREACH;
                 kibnal_tx_done (tx);
-        }
+        } while (!list_empty (&zombies));
 }
 
-static void
+void
 kibnal_connreq_done (kib_conn_t *conn, int active, int status)
 {
-        int               state = conn->ibc_state;
         kib_peer_t       *peer = conn->ibc_peer;
+        struct list_head  txs;
         kib_tx_t         *tx;
         unsigned long     flags;
         int               i;
 
-        /* passive connection has no connreq & vice versa */
-        LASSERTF(!active == !(conn->ibc_connreq != NULL),
-                 "%d %p\n", active, conn->ibc_connreq);
-        if (active) {
-                LIBCFS_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
-                conn->ibc_connreq = NULL;
+        LASSERT (conn->ibc_state >= IBNAL_CONN_INIT_QP);
+        LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
+        LASSERT (peer->ibp_connecting > 0);
+
+        LIBCFS_FREE(conn->ibc_cvars, sizeof(*conn->ibc_cvars));
+        conn->ibc_cvars = NULL;
+
+        if (status != 0) {
+                /* failed to establish connection */
+                kibnal_peer_connect_failed(conn->ibc_peer, active, status);
+                kibnal_conn_disconnected(conn);
+                kibnal_conn_decref(conn);       /* Lose CM's ref */
+                return;
         }
 
-        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+        /* connection established */
+        LASSERT(conn->ibc_state == IBNAL_CONN_CONNECTING);
+        kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED);
 
-        LASSERT (peer->ibp_connecting != 0);
-        
-        if (status == 0) {                         
-                /* connection established... */
-                KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_CONNECTING);
-                conn->ibc_state = IBNAL_CONN_ESTABLISHED;
-
-                if (!kibnal_peer_active(peer)) {
-                        /* ...but peer deleted meantime */
-                        status = -ECONNABORTED;
-                }
-        } else {
-                KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_INIT_QP,
-                                            IBNAL_CONN_CONNECTING);
+        CDEBUG(D_WARNING, "Connection %p -> %s ESTABLISHED\n",
+               conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+
+        if (!kibnal_peer_active(peer)) {
+                /* peer has been deleted */
+                kibnal_close_conn_locked(conn, -ECONNABORTED);
+                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+                kibnal_peer_connect_failed(conn->ibc_peer, active, 
+                                           -ECONNABORTED);
+                kibnal_conn_decref(conn);       /* lose CM's ref */
+                return;
         }
+        
+        peer->ibp_connecting--;
+        peer->ibp_reconnect_interval = 0;       /* OK to reconnect at any time */
 
-        if (status == 0) {
-                /* Everything worked! */
+        /* Add conn to peer's list and nuke any dangling conns from a different
+         * peer instance... */
+        kibnal_conn_addref(conn);               /* +1 ref for ibc_list */
+        kibnal_close_stale_conns_locked(peer, conn->ibc_incarnation);
 
-                peer->ibp_connecting--;
+        /* grab txs blocking for a conn */
+        list_add(&txs, &peer->ibp_tx_queue);
+        list_del_init(&peer->ibp_tx_queue);
 
-                /* +1 ref for ibc_list; caller(== CM)'s ref remains until
-                 * the IB_CM_IDLE callback */
-                CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
-                       conn, conn->ibc_state, 
-                       libcfs_nid2str(conn->ibc_peer->ibp_nid),
-                       atomic_read (&conn->ibc_refcount));
-                atomic_inc (&conn->ibc_refcount);
-                list_add (&conn->ibc_list, &peer->ibp_conns);
-                
-                peer->ibp_reconnect_interval = 0; /* OK to reconnect at any time */
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+        
+        /* Schedule blocked txs */
+        spin_lock (&conn->ibc_lock);
+        while (!list_empty (&txs)) {
+                tx = list_entry (txs.next, kib_tx_t, tx_list);
+                list_del (&tx->tx_list);
 
-                /* post blocked sends to the new connection */
-                spin_lock (&conn->ibc_lock);
-                
-                while (!list_empty (&peer->ibp_tx_queue)) {
-                        tx = list_entry (peer->ibp_tx_queue.next, 
-                                         kib_tx_t, tx_list);
-                        
-                        list_del (&tx->tx_list);
-
-                        /* +1 ref for each tx */
-                        CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
-                               conn, conn->ibc_state, 
-                               libcfs_nid2str(conn->ibc_peer->ibp_nid),
-                               atomic_read (&conn->ibc_refcount));
-                        atomic_inc (&conn->ibc_refcount);
-                        kibnal_queue_tx_locked (tx, conn);
+                kibnal_queue_tx_locked (tx, conn);
+        }
+        spin_unlock (&conn->ibc_lock);
+        kibnal_check_sends (conn);
+}
+
+void
+kibnal_reject (lnet_nid_t nid, IB_HANDLE cep, int reason)
+{
+        static CM_REJECT_INFO msgs[] = {{.Reason = RC_USER_REJ},
+                                        {.Reason = RC_NO_RESOURCES}};
+        const int       nmsg = sizeof(msgs)/sizeof(msgs[0]);
+        CM_REJECT_INFO *msg;
+        FSTATUS         frc;
+
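+        /* find the canned reject info matching 'reason' */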
+        for (msg = &msgs[0]; msg < &msgs[nmsg]; msg++)
+                if (msg->Reason == reason)
+                        break;
+        
+        LASSERT (msg < &msgs[nmsg]);
+        
+        frc = iibt_cm_reject(cep, msg);
+        if (frc != FSUCCESS)
+                CERROR("Error %d rejecting %s\n", frc, libcfs_nid2str(nid));
+}
+
+void
+kibnal_check_connreject(kib_conn_t *conn, int active, CM_REJECT_INFO *rej)
+{
+        kib_peer_t *peer = conn->ibc_peer;
+        unsigned    long flags;
+        FSTATUS     frc;
+
+        if (rej->Reason != RC_STALE_CONN) {
+                CERROR("%s connection to %s rejected: %d\n",
+                       active ? "Active" : "Passive",
+                       libcfs_nid2str(peer->ibp_nid), rej->Reason);
+        } else {
+                if (!active) {
+                        CERROR("Connection to %s rejected (stale QP)\n",
+                               libcfs_nid2str(peer->ibp_nid));
+                } else {
+                        CWARN("Connection to %s rejected (stale QP): "
+                              "retrying...\n", libcfs_nid2str(peer->ibp_nid));
+
+                        /* retry from scratch to allocate a new conn 
+                         * which will use a different QP */
+                        write_lock_irqsave(&kibnal_data.kib_global_lock, 
+                                           flags);
+                        kibnal_schedule_active_connect_locked(peer);
+                        write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
+                                                flags);
                 }
-                
-                spin_unlock (&conn->ibc_lock);
 
-                /* Nuke any dangling conns from a different peer instance... */
-                kibnal_close_stale_conns_locked (conn->ibc_peer,
-                                                 conn->ibc_incarnation);
+                /* An FCM_DISCONNECTED callback is still outstanding: give it a
+                 * ref since kibnal_connreq_done() drops the CM's ref on conn
+                 * on failure */
+                kibnal_conn_addref(conn);
+        }
+        
+        kibnal_connreq_done(conn, active, -ECONNRESET);
+}
 
-                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+void
+kibnal_cm_disconnect_callback(kib_conn_t *conn, CM_CONN_INFO *info)
+{
+        CDEBUG(D_NET, "status 0x%x\n", info->Status);
 
-                /* queue up all the receives */
-                for (i = 0; i < IBNAL_RX_MSGS; i++) {
-                        /* +1 ref for rx desc */
-                        CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
-                               conn, conn->ibc_state, 
-                               libcfs_nid2str(conn->ibc_peer->ibp_nid),
-                               atomic_read (&conn->ibc_refcount));
-                        atomic_inc (&conn->ibc_refcount);
+        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
 
-                        CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n",
-                               i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg,
-                               conn->ibc_rxs[i].rx_vaddr);
+        switch (info->Status) {
+        default:
+                LBUG();
+                break;
 
-                        kibnal_post_rx (&conn->ibc_rxs[i], 0);
-                }
+        case FCM_DISCONNECT_REPLY:
+                /* You can't get this if you set TIMEWAIT */
+                CERROR("Unexpected FCM_DISCONNECT_REPLY for %s\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                LBUG();
+                break;
+                
+        case FCM_DISCONNECT_REQUEST:
+                /* Schedule conn to iibt_cm_disconnect() if it wasn't already */
+                kibnal_close_conn (conn, 0);
+                break;
 
-                kibnal_check_sends (conn);
-                return;
+        case FCM_DISCONNECTED:
+                CDEBUG(D_NET, "Connection %p -> %s disconnected.\n",
+                       conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                kibnal_conn_decref(conn);       /* Lose CM's ref */
+                break;
         }
+}
 
-        /* connection failed */
-        if (state == IBNAL_CONN_CONNECTING) {
-                /* schedule for connd to close */
-                kibnal_close_conn_locked (conn, status);
-        } else {
-                /* Don't have a CM comm_id; just wait for refs to drain */
-                conn->ibc_state = IBNAL_CONN_DISCONNECTED;
-        } 
+void
+kibnal_cm_passive_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+        kib_conn_t       *conn = arg;
 
-        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+        CDEBUG(D_NET, "status 0x%x\n", info->Status);
+
+        /* Established Connection Notifier */
+        switch (info->Status) {
+        default:
+                CERROR("Unexpected status %d on Connection %p -> %s\n",
+                       info->Status, conn, 
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                LBUG();
+                break;
+
+        case FCM_CONNECT_TIMEOUT:
+                kibnal_connreq_done(conn, 0, -ETIMEDOUT);
+                break;
+                
+        case FCM_CONNECT_REJECT:
+                kibnal_check_connreject(conn, 0, &info->Info.Reject);
+                break;
 
-        kibnal_peer_connect_failed (conn->ibc_peer, active, status);
+        case FCM_CONNECT_ESTABLISHED:
+                kibnal_connreq_done(conn, 0, 0);
+                break;
 
-        /* If we didn't establish the connection we don't have to pass
-         * through the disconnect protocol before dropping the CM ref */
-        if (state < IBNAL_CONN_CONNECTING) 
-                kibnal_put_conn (conn);
+        case FCM_DISCONNECT_REQUEST:
+        case FCM_DISCONNECT_REPLY:
+        case FCM_DISCONNECTED:
+                kibnal_cm_disconnect_callback(conn, info);
+                break;
+        }
 }
 
-static int
-kibnal_accept (kib_conn_t **connp, IB_HANDLE *cep,
-                lnet_nid_t nid, __u64 incarnation, int queue_depth)
+int
+kibnal_accept (kib_conn_t **connp, kib_msg_t *msg, int nob)
 {
-        kib_conn_t    *conn = kibnal_create_conn();
+        lnet_nid_t     nid;
+        kib_conn_t    *conn;
         kib_peer_t    *peer;
         kib_peer_t    *peer2;
         unsigned long  flags;
+        int            rc;
 
-        if (conn == NULL)
-                return (-ENOMEM);
+        rc = kibnal_unpack_msg(msg, nob);
+        if (rc != 0) {
+                CERROR("Error %d unpacking connreq\n", rc);
+                return -EPROTO;
+        }
 
-        if (queue_depth != IBNAL_MSG_QUEUE_SIZE) {
-                CERROR("Can't accept %s: bad queue depth %d (%d expected)\n",
+        nid = msg->ibm_srcnid;
+
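+        /* sanity-check message type, destination NID and connection parameters */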
+        if (msg->ibm_type != IBNAL_MSG_CONNREQ) {
+                CERROR("Can't accept %s: bad request type %d (%d expected)\n",
+                       libcfs_nid2str(nid), msg->ibm_type, IBNAL_MSG_CONNREQ);
+                return -EPROTO;
+        }
+        
+        if (msg->ibm_dstnid != kibnal_data.kib_ni->ni_nid) {
+                CERROR("Can't accept %s: bad dst NID %s (%s expected)\n",
                        libcfs_nid2str(nid), 
-                       queue_depth, IBNAL_MSG_QUEUE_SIZE);
-                atomic_dec (&conn->ibc_refcount);
-                kibnal_destroy_conn(conn);
-                return (-EPROTO);
+                       libcfs_nid2str(msg->ibm_dstnid), 
+                       libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
+                return -EPROTO;
         }
         
+        if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE ||
+            msg->ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE ||
+            msg->ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
+                CERROR("Reject %s: q %d sz %d frag %d, (%d %d %d expected)\n",
+                       libcfs_nid2str(nid), 
+                       msg->ibm_u.connparams.ibcp_queue_depth,
+                       msg->ibm_u.connparams.ibcp_max_msg_size,
+                       msg->ibm_u.connparams.ibcp_max_frags,
+                       IBNAL_MSG_QUEUE_SIZE,
+                       IBNAL_MSG_SIZE,
+                       IBNAL_MAX_RDMA_FRAGS);
+                return -EPROTO;
+        }
+
+        conn = kibnal_create_conn(nid);
+        if (conn == NULL)
+                return -ENOMEM;
+
         /* assume 'nid' is a new peer */
-        peer = kibnal_create_peer (nid);
-        if (peer == NULL) {
-                CDEBUG(D_NET, "--conn[%p] state %d -> %s (%d)\n",
-                       conn, conn->ibc_state, 
-                       libcfs_nid2str(conn->ibc_peer->ibp_nid),
-                       atomic_read (&conn->ibc_refcount));
-                atomic_dec (&conn->ibc_refcount);
-                kibnal_destroy_conn(conn);
-                return (-ENOMEM);
+        rc = kibnal_create_peer(&peer, nid);
+        if (rc != 0) {
+                kibnal_conn_decref(conn);
+                return rc;
         }
         
         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
@@ -1958,459 +2188,234 @@ kibnal_accept (kib_conn_t **connp, IB_HANDLE *cep,
                 /* peer table takes my ref on peer */
                 list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid));
         } else {
-                kib_peer_decref (peer);
+                kibnal_peer_decref(peer);
                 peer = peer2;
         }
 
-        kib_peer_addref(peer); /* +1 ref for conn */
+        kibnal_peer_addref(peer); /* +1 ref for conn */
         peer->ibp_connecting++;
 
-        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
-
+        kibnal_set_conn_state(conn, IBNAL_CONN_CONNECTING);
         conn->ibc_peer = peer;
-        conn->ibc_state = IBNAL_CONN_CONNECTING;
-        /* conn->ibc_cep is set when cm_accept is called */
-        conn->ibc_incarnation = incarnation;
+        conn->ibc_incarnation = msg->ibm_srcstamp;
         conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
 
-        *connp = conn;
-        return (0);
-}
-
-static void kibnal_set_qp_state(IB_HANDLE *qp, IB_QP_STATE state)
-{
-        IB_QP_ATTRIBUTES_MODIFY modify_attr = {0,};
-        FSTATUS frc;
-
-        modify_attr.RequestState = state;
-
-        frc = iibt_qp_modify(qp, &modify_attr, NULL);
-        if (frc != FSUCCESS)
-                CERROR("couldn't set qp state to %d, error %d\n", state, frc);
-}
-
-static void kibnal_flush_pending(kib_conn_t *conn)
-{
-        LIST_HEAD        (zombies); 
-        struct list_head *tmp;
-        struct list_head *nxt;
-        kib_tx_t         *tx;
-        unsigned long     flags;
-        int               done;
-
-        /* NB we wait until the connection has closed before completing
-         * outstanding passive RDMAs so we can be sure the network can't 
-         * touch the mapped memory any more. */
-        KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_DISCONNECTED);
-
-        /* set the QP to the error state so that we get flush callbacks
-         * on our posted receives which can then drop their conn refs */
-        kibnal_set_qp_state(conn->ibc_qp, QPStateError);
-
-        spin_lock_irqsave (&conn->ibc_lock, flags);
-
-        /* grab passive RDMAs not waiting for the tx callback */
-        list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
-                tx = list_entry (tmp, kib_tx_t, tx_list);
-
-                LASSERT (tx->tx_passive_rdma ||
-                         !tx->tx_passive_rdma_wait);
-
-                LASSERT (tx->tx_passive_rdma_wait ||
-                         tx->tx_sending != 0);
-
-                /* still waiting for tx callback? */
-                if (!tx->tx_passive_rdma_wait)
-                        continue;
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
-                tx->tx_status = -ECONNABORTED;
-                tx->tx_passive_rdma_wait = 0;
-                done = (tx->tx_sending == 0);
+        *connp = conn;
+        return (0);
+}
 
-                if (!done)
-                        continue;
+void
+kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
 
-                list_del (&tx->tx_list);
-                list_add (&tx->tx_list, &zombies);
-        }
+        CM_REQUEST_INFO  *req = &info->Info.Request;
+        CM_REPLY_INFO    *rep;
+        kib_conn_t       *conn;
+        FSTATUS           frc;
+        int               rc;
+        
+        LASSERT(arg == NULL); /* no conn yet for passive */
 
-        /* grab all blocked transmits */
-        list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
-                tx = list_entry (tmp, kib_tx_t, tx_list);
-                
-                list_del (&tx->tx_list);
-                list_add (&tx->tx_list, &zombies);
+        CDEBUG(D_NET, "%x\n", info->Status);
+        
+        if (info->Status == FCM_CONNECT_CANCEL) {
+                up(&kibnal_data.kib_listener_signal);
+                return;
         }
         
-        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+        LASSERT (info->Status == FCM_CONNECT_REQUEST);
 
-        while (!list_empty(&zombies)) {
-                tx = list_entry (zombies.next, kib_tx_t, tx_list);
+        rc = kibnal_accept(&conn, (kib_msg_t *)req->PrivateData, 
+                           CM_REQUEST_INFO_USER_LEN);
+        if (rc != 0) {
+                kibnal_reject(LNET_NID_ANY, cep,
+                              (rc == -EPROTO) ? RC_USER_REJ : RC_NO_RESOURCES);
+                return;
+        }
 
-                list_del(&tx->tx_list);
-                kibnal_tx_done (tx);
+        conn->ibc_cvars->cv_path = req->PathInfo.Path;
+        
+        rc = kibnal_conn_rts(conn, 
+                             req->CEPInfo.QPN, 
+                             req->CEPInfo.OfferedInitiatorDepth,
+                             req->CEPInfo.OfferedResponderResources,
+                             req->CEPInfo.StartingPSN);
+        if (rc != 0) {
+                kibnal_reject(conn->ibc_peer->ibp_nid, cep, RC_NO_RESOURCES);
+                kibnal_connreq_done(conn, 0, -ECONNABORTED);
+                return;
         }
-}
 
-static void
-kibnal_reject (IB_HANDLE cep, uint16_t reason)
-{
-        CM_REJECT_INFO *rej;
+        memset(&conn->ibc_cvars->cv_cmci, 0, sizeof(conn->ibc_cvars->cv_cmci));
+        rep = &conn->ibc_cvars->cv_cmci.Info.Reply;
 
-        LIBCFS_ALLOC(rej, sizeof(*rej));
-        if (rej == NULL) /* LIBCFS_ALLOC() will CERROR on failure */
-                return;  
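+        /* reply with our QP attributes and a CONNACK message in the private data */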
+        rep->QPN                   = conn->ibc_cvars->cv_qpattrs.QPNumber;
+        rep->QKey                  = conn->ibc_cvars->cv_qpattrs.Qkey;
+        rep->StartingPSN           = conn->ibc_cvars->cv_qpattrs.RecvPSN;
+        rep->EndToEndFlowControl   = conn->ibc_cvars->cv_qpattrs.FlowControl;
+        rep->ArbInitiatorDepth     = conn->ibc_cvars->cv_qpattrs.InitiatorDepth;
+        rep->ArbResponderResources = conn->ibc_cvars->cv_qpattrs.ResponderResources;
+        rep->TargetAckDelay        = kibnal_data.kib_hca_attrs.LocalCaAckDelay;
+        rep->FailoverAccepted      = IBNAL_FAILOVER_ACCEPTED;
+        rep->RnRRetryCount         = req->CEPInfo.RnrRetryCount;
+        
+        CLASSERT (CM_REPLY_INFO_USER_LEN >=
+                  offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t));
 
-        rej->Reason = reason;
-        iibt_cm_reject(cep, rej);
-        LIBCFS_FREE(rej, sizeof(*rej));
-}
+        kibnal_pack_connmsg((kib_msg_t *)rep->PrivateData,
+                            CM_REPLY_INFO_USER_LEN,
+                            IBNAL_MSG_CONNACK,
+                            conn->ibc_peer->ibp_nid, conn->ibc_incarnation);
 
-static FSTATUS
-kibnal_qp_rts(IB_HANDLE qp_handle, __u32 qpn, __u8 resp_res, 
-              IB_PATH_RECORD *path, __u8 init_depth, __u32 send_psn)
-{
-        IB_QP_ATTRIBUTES_MODIFY modify_attr;
-        FSTATUS frc;
-        ENTRY;
-
-        modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
-                .RequestState           = QPStateReadyToRecv,
-                .RecvPSN                = IBNAL_STARTING_PSN,
-                .DestQPNumber           = qpn,
-                .ResponderResources     = resp_res,
-                .MinRnrTimer            = UsecToRnrNakTimer(2000), /* 20 ms */
-                .Attrs                  = (IB_QP_ATTR_RECVPSN |
-                                           IB_QP_ATTR_DESTQPNUMBER | 
-                                           IB_QP_ATTR_RESPONDERRESOURCES | 
-                                           IB_QP_ATTR_DESTAV | 
-                                           IB_QP_ATTR_PATHMTU | 
-                                           IB_QP_ATTR_MINRNRTIMER),
-        };
-        GetAVFromPath(0, path, &modify_attr.PathMTU, NULL, 
-                      &modify_attr.DestAV);
-
-        frc = iibt_qp_modify(qp_handle, &modify_attr, NULL);
-        if (frc != FSUCCESS) 
-                RETURN(frc);
-
-        modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
-                .RequestState           = QPStateReadyToSend,
-                .FlowControl            = TRUE,
-                .InitiatorDepth         = init_depth,
-                .SendPSN                = send_psn,
-                .LocalAckTimeout        = path->PktLifeTime + 2, /* 2 or 1? */
-                .RetryCount             = IBNAL_RETRY,
-                .RnrRetryCount          = IBNAL_RNR_RETRY,
-                .Attrs                  = (IB_QP_ATTR_FLOWCONTROL | 
-                                           IB_QP_ATTR_INITIATORDEPTH | 
-                                           IB_QP_ATTR_SENDPSN | 
-                                           IB_QP_ATTR_LOCALACKTIMEOUT | 
-                                           IB_QP_ATTR_RETRYCOUNT | 
-                                           IB_QP_ATTR_RNRRETRYCOUNT),
-        };
+        LASSERT (conn->ibc_cep == NULL);
+        kibnal_set_conn_state(conn, IBNAL_CONN_CONNECTING);
+
+        frc = iibt_cm_accept(cep, 
+                             &conn->ibc_cvars->cv_cmci,
+                             NULL,
+                             kibnal_cm_passive_callback, conn, 
+                             &conn->ibc_cep);
 
-        frc = iibt_qp_modify(qp_handle, &modify_attr, NULL);
-        RETURN(frc);
+        if (frc == FSUCCESS || frc == FPENDING)
+                return;
+        
+        CERROR("iibt_cm_accept(%s) failed: %d\n", 
+               libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+        kibnal_connreq_done(conn, 0, -ECONNABORTED);
 }
 
-static void
-kibnal_connect_reply (IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+void
+kibnal_check_connreply(kib_conn_t *conn, CM_REPLY_INFO *rep)
 {
-        IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
-        kib_conn_t *conn = arg;
-        kib_wire_connreq_t *wcr;
-        CM_REPLY_INFO *rep = &info->Info.Reply;
-        uint16_t reason;
-        FSTATUS frc;
+        kib_msg_t   *msg = (kib_msg_t *)rep->PrivateData;
+        lnet_nid_t   nid = conn->ibc_peer->ibp_nid;
+        FSTATUS      frc;
+        int          rc;
 
-        wcr = (kib_wire_connreq_t *)info->Info.Reply.PrivateData;
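+        /* unpack and validate the connack in the reply's private data */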
+        rc = kibnal_unpack_msg(msg, CM_REPLY_INFO_USER_LEN);
+        if (rc != 0) {
+                CERROR ("Error %d unpacking connack from %s\n",
+                        rc, libcfs_nid2str(nid));
+                kibnal_reject(nid, conn->ibc_cep, RC_USER_REJ);
+                kibnal_connreq_done(conn, 1, -EPROTO);
+                return;
+        }
+                        
+        if (msg->ibm_type != IBNAL_MSG_CONNACK) {
+                CERROR("Bad connack request type %d (%d expected) from %s\n",
+                       msg->ibm_type, IBNAL_MSG_CONNREQ,
+                       libcfs_nid2str(msg->ibm_srcnid));
+                kibnal_reject(nid, conn->ibc_cep, RC_USER_REJ);
+                kibnal_connreq_done(conn, 1, -EPROTO);
+                return;
+        }
 
-        if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
-                CERROR ("Can't connect %s: bad magic %08x\n",
-                        libcfs_nid2str(conn->ibc_peer->ibp_nid), 
-                        le32_to_cpu(wcr->wcr_magic));
-                GOTO(reject, reason = RC_USER_REJ);
+        if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
+            msg->ibm_dstnid != kibnal_data.kib_ni->ni_nid ||
+            msg->ibm_dststamp != kibnal_data.kib_incarnation) {
+                CERROR("Stale connack from %s(%s): %s(%s), "LPX64"("LPX64")\n",
+                       libcfs_nid2str(msg->ibm_srcnid), 
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                       libcfs_nid2str(msg->ibm_dstnid),
+                       libcfs_nid2str(kibnal_data.kib_ni->ni_nid),
+                       msg->ibm_dststamp, kibnal_data.kib_incarnation);
+                kibnal_reject(nid, conn->ibc_cep, RC_USER_REJ);
+                kibnal_connreq_done(conn, 1, -EPROTO);
+                return;
         }
         
-        if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
-                CERROR ("Can't connect %s: bad version %d\n",
-                        libcfs_nid2str(conn->ibc_peer->ibp_nid), 
-                        le16_to_cpu(wcr->wcr_magic));
-                GOTO(reject, reason = RC_USER_REJ);
-        }
-                        
-        if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
-                CERROR ("Can't connect %s: bad queue depth %d\n",
-                        libcfs_nid2str(conn->ibc_peer->ibp_nid), 
-                        le16_to_cpu(wcr->wcr_queue_depth));
-                GOTO(reject, reason = RC_USER_REJ);
+        if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE ||
+            msg->ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE ||
+            msg->ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
+                CERROR("Reject %s: q %d sz %d frag %d, (%d %d %d expected)\n",
+                       libcfs_nid2str(msg->ibm_srcnid), 
+                       msg->ibm_u.connparams.ibcp_queue_depth,
+                       msg->ibm_u.connparams.ibcp_max_msg_size,
+                       msg->ibm_u.connparams.ibcp_max_frags,
+                       IBNAL_MSG_QUEUE_SIZE,
+                       IBNAL_MSG_SIZE,
+                       IBNAL_MAX_RDMA_FRAGS);
+                kibnal_reject(nid, conn->ibc_cep, RC_USER_REJ);
+                kibnal_connreq_done(conn, 1, -EPROTO);
+                return;
         }
                         
-        if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) {
-                CERROR ("Unexpected NID %s from %s\n",
-                        libcfs_nid2str(le64_to_cpu(wcr->wcr_nid)), 
-                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
-                GOTO(reject, reason = RC_USER_REJ);
-        }
-
         CDEBUG(D_NET, "Connection %p -> %s REP_RECEIVED.\n",
                conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
 
-        conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation);
+        conn->ibc_incarnation = msg->ibm_srcstamp;
         conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
 
-        frc = kibnal_qp_rts(conn->ibc_qp, rep->QPN, 
-                            min_t(__u8, rep->ArbInitiatorDepth,
-                                  ca_attr->MaxQPResponderResources),
-                            &conn->ibc_connreq->cr_path, 
-                            min_t(__u8, rep->ArbResponderResources,
-                                  ca_attr->MaxQPInitiatorDepth),
-                            rep->StartingPSN);
-        if (frc != FSUCCESS) {
-                CERROR("Connection %p -> %s QP RTS/RTR failed: %d\n",
-                       conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
-                GOTO(reject, reason = RC_NO_QP);
-        }
-
-        /* the callback arguments are ignored for an active accept */
-        conn->ibc_connreq->cr_discarded.Status = FSUCCESS;
-        frc = iibt_cm_accept(cep, &conn->ibc_connreq->cr_discarded, 
-                             NULL, NULL, NULL, NULL);
-        if (frc != FCM_CONNECT_ESTABLISHED) {
-                CERROR("Connection %p -> %s CMAccept failed: %d\n",
-                       conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
-                kibnal_connreq_done (conn, 1, -ECONNABORTED);
-                /* XXX don't call reject after accept fails? */
+        rc = kibnal_conn_rts(conn, 
+                             rep->QPN,
+                             rep->ArbInitiatorDepth,
+                             rep->ArbResponderResources,
+                             rep->StartingPSN);
+        if (rc != 0) {
+                kibnal_reject(nid, conn->ibc_cep, RC_NO_RESOURCES);
+                kibnal_connreq_done(conn, 1, -EIO);
                 return;
         }
 
-        CDEBUG(D_NET, "Connection %p -> %s Established\n",
-               conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+        memset(&conn->ibc_cvars->cv_cmci, 0, sizeof(conn->ibc_cvars->cv_cmci));
+        
+        frc = iibt_cm_accept(conn->ibc_cep, 
+                             &conn->ibc_cvars->cv_cmci, 
+                             NULL, NULL, NULL, NULL);
+
+        if (frc == FCM_CONNECT_ESTABLISHED) {
+                kibnal_connreq_done(conn, 1, 0);
+                return;
+        }
 
-        kibnal_connreq_done (conn, 1, 0);
-        return;
 
-reject:
-        kibnal_reject(cep, reason);
-        kibnal_connreq_done (conn, 1, -EPROTO);
+        CERROR("Connection %p -> %s CMAccept failed: %d\n",
+               conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+        kibnal_connreq_done(conn, 1, -ECONNABORTED);
 }
 
-/* ib_cm.h has a wealth of information on the CM procedures */
-static void
-kibnal_cm_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+void
+kibnal_cm_active_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
 {
         kib_conn_t       *conn = arg;
 
         CDEBUG(D_NET, "status 0x%x\n", info->Status);
 
-        /* Established Connection Notifier */
         switch (info->Status) {
         default:
-                CERROR("unknown status %d on Connection %p -> %s\n",
+                CERROR("unknown status %d on Connection %p -> %s\n", 
                        info->Status, conn, 
                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
                 LBUG();
                 break;
 
-        case FCM_CONNECT_REPLY:
-                kibnal_connect_reply(cep, info, arg);
+        case FCM_CONNECT_TIMEOUT:
+                kibnal_connreq_done(conn, 1, -ETIMEDOUT);
+                break;
+                
+        case FCM_CONNECT_REJECT:
+                kibnal_check_connreject(conn, 1, &info->Info.Reject);
                 break;
 
-        case FCM_DISCONNECT_REQUEST:
-                /* XXX lock around these state management bits? */
-                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
-                        kibnal_close_conn (conn, 0);
-                conn->ibc_state = IBNAL_CONN_DREP;
-                iibt_cm_disconnect(conn->ibc_cep, NULL, NULL);
+        case FCM_CONNECT_REPLY:
+                kibnal_check_connreply(conn, &info->Info.Reply);
                 break;
 
-        /* these both guarantee that no more cm callbacks will occur */
-        case FCM_DISCONNECTED: /* aka FCM_DISCONNECT_TIMEOUT */
+        case FCM_DISCONNECT_REQUEST:
         case FCM_DISCONNECT_REPLY:
-                CDEBUG(D_NET, "Connection %p -> %s disconnect done.\n",
-                       conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
-
-                conn->ibc_state = IBNAL_CONN_DISCONNECTED;
-                kibnal_flush_pending(conn);
-                kibnal_put_conn(conn);        /* Lose CM's ref */
+        case FCM_DISCONNECTED:
+                kibnal_cm_disconnect_callback(conn, info);
                 break;
         }
-
-        return;
-}
-
-static int
-kibnal_set_cm_flags(IB_HANDLE cep)
-{
-        FSTATUS frc;
-        uint32 value = 1;
-
-        frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK,
-                                 (char *)&value, sizeof(value), 0);
-        if (frc != FSUCCESS) {
-                CERROR("error setting timeout callback: %d\n", frc);
-                return -1;
-        }
-
-#if 0
-        frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, (char *)&value,
-                                 sizeof(value), 0);
-        if (frc != FSUCCESS) {
-                CERROR("error setting async accept: %d\n", frc);
-                return -1;
-        }
-#endif
-
-        return 0;
 }
 
 void
-kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
-{
-        IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
-        IB_QP_ATTRIBUTES_QUERY *query;
-        CM_REQUEST_INFO    *req;
-        CM_CONN_INFO       *rep = NULL, *rcv = NULL;
-        kib_wire_connreq_t *wcr;
-        kib_conn_t         *conn = NULL;
-        uint16_t            reason = 0;
-        FSTATUS             frc;
-        int                 rc = 0;
-        
-        LASSERT(cep);
-        LASSERT(info);
-        LASSERT(arg == NULL); /* no conn yet for passive */
-
-        CDEBUG(D_NET, "status 0x%x\n", info->Status);
-
-        req = &info->Info.Request;
-        wcr = (kib_wire_connreq_t *)req->PrivateData;
-
-        CDEBUG(D_NET, "%d from %s\n", info->Status, 
-               libcfs_nid2str(le64_to_cpu(wcr->wcr_nid)));
-        
-        if (info->Status == FCM_CONNECT_CANCEL)
-                return;
-        
-        LASSERT (info->Status == FCM_CONNECT_REQUEST);
-        
-        if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
-                CERROR ("Can't accept: bad magic %08x\n",
-                        le32_to_cpu(wcr->wcr_magic));
-                GOTO(out, reason = RC_USER_REJ);
-        }
-
-        if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
-                CERROR ("Can't accept: bad version %d\n",
-                        le16_to_cpu(wcr->wcr_magic));
-                GOTO(out, reason = RC_USER_REJ);
-        }
-
-        rc = kibnal_accept(&conn, cep,
-                           le64_to_cpu(wcr->wcr_nid),
-                           le64_to_cpu(wcr->wcr_incarnation),
-                           le16_to_cpu(wcr->wcr_queue_depth));
-        if (rc != 0) {
-                CERROR ("Can't accept %s: %d\n",
-                        libcfs_nid2str(le64_to_cpu(wcr->wcr_nid)), rc);
-                GOTO(out, reason = RC_NO_RESOURCES);
-        }
-
-        frc = kibnal_qp_rts(conn->ibc_qp, req->CEPInfo.QPN,
-                            min_t(__u8, req->CEPInfo.OfferedInitiatorDepth, 
-                                  ca_attr->MaxQPResponderResources),
-                            &req->PathInfo.Path,
-                            min_t(__u8, req->CEPInfo.OfferedResponderResources, 
-                                  ca_attr->MaxQPInitiatorDepth),
-                            req->CEPInfo.StartingPSN);
-
-        if (frc != FSUCCESS) {
-                CERROR ("Can't mark QP RTS/RTR  %s: %d\n",
-                        libcfs_nid2str(le64_to_cpu(wcr->wcr_nid)), frc);
-                GOTO(out, reason = RC_NO_QP);
-        }
-
-        frc = iibt_qp_query(conn->ibc_qp, &conn->ibc_qp_attrs, NULL);
-        if (frc != FSUCCESS) {
-                CERROR ("Couldn't query qp attributes %s: %d\n",
-                        libcfs_nid2str(le64_to_cpu(wcr->wcr_nid)), frc);
-                GOTO(out, reason = RC_NO_QP);
-        }
-        query = &conn->ibc_qp_attrs;
-
-        LIBCFS_ALLOC(rep, sizeof(*rep));
-        LIBCFS_ALLOC(rcv, sizeof(*rcv));
-        if (rep == NULL || rcv == NULL) {
-                if (rep) LIBCFS_FREE(rep, sizeof(*rep));
-                if (rcv) LIBCFS_FREE(rcv, sizeof(*rcv));
-                CERROR ("can't allocate reply and receive buffers\n");
-                GOTO(out, reason = RC_INSUFFICIENT_RESP_RES);
-        }
-
-        /* don't try to deref this into the incoming wcr :) */
-        wcr = (kib_wire_connreq_t *)rep->Info.Reply.PrivateData;
-
-        rep->Info.Reply = (CM_REPLY_INFO) {
-                .QPN = query->QPNumber,
-                .QKey = query->Qkey,
-                .StartingPSN = query->RecvPSN,
-                .EndToEndFlowControl = query->FlowControl,
-                /* XXX Hmm. */
-                .ArbInitiatorDepth = query->InitiatorDepth,
-                .ArbResponderResources = query->ResponderResources,
-                .TargetAckDelay = 0,
-                .FailoverAccepted = 0,
-                .RnRRetryCount = req->CEPInfo.RnrRetryCount,
-        };
-                
-        *wcr = (kib_wire_connreq_t) {
-                .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
-                .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
-                .wcr_queue_depth = cpu_to_le32(IBNAL_MSG_QUEUE_SIZE),
-                .wcr_nid         = cpu_to_le64(kibnal_data.kib_ni->ni_nid),
-                .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
-        };
-
-        frc = iibt_cm_accept(cep, rep, rcv, kibnal_cm_callback, conn, 
-                             &conn->ibc_cep);
-
-        LIBCFS_FREE(rep, sizeof(*rep));
-        LIBCFS_FREE(rcv, sizeof(*rcv));
-
-        if (frc != FCM_CONNECT_ESTABLISHED) {
-                /* XXX it seems we don't call reject after this point? */
-                CERROR("iibt_cm_accept() failed: %d, aborting\n", frc);
-                rc = -ECONNABORTED;
-                goto out;
-        }
-
-        if (kibnal_set_cm_flags(conn->ibc_cep)) {
-                rc = -ECONNABORTED;
-                goto out;
-        }
-
-        CDEBUG(D_WARNING, "Connection %p -> %s ESTABLISHED.\n",
-               conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
-
-out:
-        if (reason) {
-                kibnal_reject(cep, reason);
-                rc = -ECONNABORTED;
-        }
-        if (conn != NULL) 
-                kibnal_connreq_done(conn, 0, rc);
-
-        return;
-}
-
-static void
 dump_path_records(PATH_RESULTS *results)
 {
         IB_PATH_RECORD *path;
         int i;
 
-        for(i = 0; i < results->NumPathRecords; i++) {
+        for (i = 0; i < results->NumPathRecords; i++) {
                 path = &results->PathRecords[i];
                 CDEBUG(D_NET, "%d: sgid "LPX64":"LPX64" dgid "
                        LPX64":"LPX64" pkey %x\n",
@@ -2423,113 +2428,88 @@ dump_path_records(PATH_RESULTS *results)
         }
 }
 
-static void
-kibnal_pathreq_callback (void *arg, QUERY *query, 
-                         QUERY_RESULT_VALUES *query_res)
+void
+kibnal_pathreq_callback (void *arg, QUERY *qry, 
+                         QUERY_RESULT_VALUES *qrslt)
 {
-        IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
-        kib_conn_t *conn = arg;
-        PATH_RESULTS *path;
-        FSTATUS frc;
-        lnet_nid_t nid;
+        IB_CA_ATTRIBUTES  *ca_attr = &kibnal_data.kib_hca_attrs;
+        kib_conn_t        *conn = arg;
+        CM_REQUEST_INFO   *req = &conn->ibc_cvars->cv_cmci.Info.Request;
+        PATH_RESULTS      *path = (PATH_RESULTS *)qrslt->QueryResult;
+        FSTATUS            frc;
         
-        if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) {
-                CERROR ("status %d data size %d\n", query_res->Status,
-                        query_res->ResultDataSize);
-                kibnal_connreq_done (conn, 1, -EINVAL);
+        if (qrslt->Status != FSUCCESS || 
+            qrslt->ResultDataSize < sizeof(*path)) {
+                CERROR ("pathreq %s failed: status %d data size %d\n", 
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                        qrslt->Status, qrslt->ResultDataSize);
+                kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
                 return;
         }
 
-        path = (PATH_RESULTS *)query_res->QueryResult;
-
         if (path->NumPathRecords < 1) {
-                CERROR ("expected path records: %d\n", path->NumPathRecords);
-                kibnal_connreq_done (conn, 1, -EINVAL);
+                CERROR ("pathreq %s failed: no path records\n",
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
                 return;
         }
 
-        dump_path_records(path);
+        /* dump_path_records(path); */
+        conn->ibc_cvars->cv_path = path->PathRecords[0];
 
-        /* just using the first.  this is probably a horrible idea. */
-        conn->ibc_connreq->cr_path = path->PathRecords[0];
+        LASSERT (conn->ibc_cep == NULL);
 
-        conn->ibc_cep = iibt_cm_create_cep(CM_RC_TYPE);
+        conn->ibc_cep = kibnal_create_cep(conn->ibc_peer->ibp_nid);
         if (conn->ibc_cep == NULL) {
-                CERROR ("Can't create CEP\n");
-                kibnal_connreq_done (conn, 1, -EINVAL);
-                return;
-        }
-
-        if (kibnal_set_cm_flags(conn->ibc_cep)) {
-                kibnal_connreq_done (conn, 1, -EINVAL);
+                kibnal_connreq_done(conn, 1, -ENOMEM);
                 return;
         }
 
-        conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) {
-                .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
-                .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
-                .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
-                .wcr_nid         = cpu_to_le64(kibnal_data.kib_ni->ni_nid),
-                .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
-        };
-
-        conn->ibc_connreq->cr_cmreq = (CM_REQUEST_INFO) {
-                .SID = conn->ibc_connreq->cr_service.RID.ServiceID,
-                .CEPInfo = (CM_CEP_INFO) { 
-                        .CaGUID = kibnal_data.kib_hca_guids[0],
-                        .EndToEndFlowControl = FALSE,
-                        .PortGUID = conn->ibc_connreq->cr_path.SGID.Type.Global.InterfaceID,
-                        .RetryCount = IBNAL_RETRY,
-                        .RnrRetryCount = IBNAL_RNR_RETRY,
-                        .AckTimeout = IBNAL_ACK_TIMEOUT,
-                        .StartingPSN = IBNAL_STARTING_PSN,
-                        .QPN = conn->ibc_qp_attrs.QPNumber,
-                        .QKey = conn->ibc_qp_attrs.Qkey,
-                        .OfferedResponderResources = ca_attr->MaxQPResponderResources,
-                        .OfferedInitiatorDepth = ca_attr->MaxQPInitiatorDepth,
-                },
-                .PathInfo = (CM_CEP_PATHINFO) {
-                        .bSubnetLocal = TRUE,
-                        .Path = conn->ibc_connreq->cr_path,
-                },
-        };
+        memset(req, 0, sizeof(*req));
+        req->SID                               = conn->ibc_cvars->cv_svcrec.RID.ServiceID;
+        req->CEPInfo.CaGUID                    = kibnal_data.kib_hca_guids[kibnal_data.kib_hca_idx];
+        req->CEPInfo.EndToEndFlowControl       = IBNAL_EE_FLOW;
+        req->CEPInfo.PortGUID                  = conn->ibc_cvars->cv_path.SGID.Type.Global.InterfaceID;
+        req->CEPInfo.RetryCount                = IBNAL_RETRY;
+        req->CEPInfo.RnrRetryCount             = IBNAL_RNR_RETRY;
+        req->CEPInfo.AckTimeout                = IBNAL_ACK_TIMEOUT;
+        req->CEPInfo.StartingPSN               = IBNAL_STARTING_PSN;
+        req->CEPInfo.QPN                       = conn->ibc_cvars->cv_qpattrs.QPNumber;
+        req->CEPInfo.QKey                      = conn->ibc_cvars->cv_qpattrs.Qkey;
+        req->CEPInfo.OfferedResponderResources = ca_attr->MaxQPResponderResources;
+        req->CEPInfo.OfferedInitiatorDepth     = ca_attr->MaxQPInitiatorDepth;
+        req->PathInfo.bSubnetLocal             = IBNAL_LOCAL_SUB;
+        req->PathInfo.Path                     = conn->ibc_cvars->cv_path;
+
+        CLASSERT (CM_REQUEST_INFO_USER_LEN >=
+                  offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t));
+
+        kibnal_pack_connmsg((kib_msg_t *)req->PrivateData, 
+                            CM_REQUEST_INFO_USER_LEN,
+                            IBNAL_MSG_CONNREQ, 
+                            conn->ibc_peer->ibp_nid, 0);
 
-#if 0
-        /* XXX set timeout just like SDP!!!*/
-        conn->ibc_connreq->cr_path.packet_life = 13;
-#endif
         /* Flag I'm getting involved with the CM... */
-        conn->ibc_state = IBNAL_CONN_CONNECTING;
-
-        nid = *kibnal_service_nid_field(&conn->ibc_connreq->cr_service);
-
-        CDEBUG(D_NET, "Connecting to, service id "LPX64", on %s\n",
-               conn->ibc_connreq->cr_service.RID.ServiceID, 
-               libcfs_nid2str(nid));
+        kibnal_set_conn_state(conn, IBNAL_CONN_CONNECTING);
 
-        memset(conn->ibc_connreq->cr_cmreq.PrivateData, 0, 
-               CM_REQUEST_INFO_USER_LEN);
-        memcpy(conn->ibc_connreq->cr_cmreq.PrivateData, 
-               &conn->ibc_connreq->cr_wcr, sizeof(conn->ibc_connreq->cr_wcr));
-
-        /* kibnal_cm_callback gets my conn ref */
-        frc = iibt_cm_connect(conn->ibc_cep, &conn->ibc_connreq->cr_cmreq,
-                              kibnal_cm_callback, conn);
-        if (frc != FPENDING && frc != FSUCCESS) {
-                CERROR ("Connect: %d\n", frc);
-                /* Back out state change as connect failed */
-                conn->ibc_state = IBNAL_CONN_INIT_QP;
-                kibnal_connreq_done (conn, 1, -EINVAL);
-        }
+        /* cm callback gets my conn ref */
+        frc = iibt_cm_connect(conn->ibc_cep, req, 
+                              kibnal_cm_active_callback, conn);
+        if (frc == FPENDING || frc == FSUCCESS)
+                return;
+        
+        CERROR ("Connect %s failed: %d\n", 
+                libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+        kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
 }
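
/* Minimal user-space sketch of the packing scheme used above: a small
 * handshake message is copied into a fixed-size "private data" area, with a
 * compile-time check that it fits -- the same idea as the CLASSERT() plus
 * kibnal_pack_connmsg() into req->PrivateData.  The struct layout, magic
 * number and the 92-byte size are illustrative assumptions, not the real
 * kib_msg_t or CM_REQUEST_INFO definitions. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define PRIVATE_DATA_LEN 92                     /* assumed private-data size */

typedef struct {
        uint32_t magic;                         /* protocol magic */
        uint16_t version;                       /* protocol version */
        uint16_t queue_depth;                   /* advertised queue depth */
        uint64_t nid;                           /* sender identity */
        uint64_t incarnation;                   /* sender boot stamp */
} connparams_t;

/* Compile fails (negative array size) if the message can't fit */
typedef char fit_check[(PRIVATE_DATA_LEN >= sizeof(connparams_t)) ? 1 : -1];

static void
pack_connreq(char *private_data, uint64_t nid, uint64_t incarnation)
{
        connparams_t msg = {
                .magic       = 0x0be91b91,      /* made-up value */
                .version     = 1,
                .queue_depth = 8,
                .nid         = nid,
                .incarnation = incarnation,
        };

        memset(private_data, 0, PRIVATE_DATA_LEN);  /* zero the unused tail */
        memcpy(private_data, &msg, sizeof(msg));
}

int
main(void)
{
        char private_data[PRIVATE_DATA_LEN];

        pack_connreq(private_data, 0x12345ULL, 1ULL);
        printf("packed %u of %d private-data bytes\n",
               (unsigned)sizeof(connparams_t), PRIVATE_DATA_LEN);
        return 0;
}
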
 
-static void
-dump_service_records(SERVICE_RECORD_RESULTS *results)
+void
+kibnal_dump_service_records(SERVICE_RECORD_RESULTS *results)
 {
         IB_SERVICE_RECORD *svc;
         int i;
 
-        for(i = 0; i < results->NumServiceRecords; i++) {
+        for (i = 0; i < results->NumServiceRecords; i++) {
                 svc = &results->ServiceRecords[i];
                 CDEBUG(D_NET, "%d: sid "LPX64" gid "LPX64":"LPX64" pkey %x\n",
                        i,
@@ -2540,76 +2520,71 @@ dump_service_records(SERVICE_RECORD_RESULTS *results)
         }
 }
 
-
-static void
-kibnal_service_get_callback (void *arg, QUERY *query, 
-                             QUERY_RESULT_VALUES *query_res)
+void
+kibnal_service_get_callback (void *arg, QUERY *qry, 
+                             QUERY_RESULT_VALUES *qrslt)
 {
-        kib_conn_t *conn = arg;
-        SERVICE_RECORD_RESULTS *svc;
-        COMMAND_CONTROL_PARAMETERS sd_params;
-        QUERY   path_query;
-        FSTATUS frc;
-        lnet_nid_t nid;
-        
-        if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) {
-                CERROR ("status %d data size %d\n", query_res->Status,
-                        query_res->ResultDataSize);
-                kibnal_connreq_done (conn, 1, -EINVAL);
+        kib_conn_t              *conn = arg;
+        SERVICE_RECORD_RESULTS  *svc;
+        FSTATUS                  frc;
+        lnet_nid_t               nid;
+
+        if (qrslt->Status != FSUCCESS || 
+            qrslt->ResultDataSize < sizeof(*svc)) {
+                CERROR ("Lookup %s failed: status %d data size %d\n", 
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                        qrslt->Status, qrslt->ResultDataSize);
+                kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
                 return;
         }
 
-        svc = (SERVICE_RECORD_RESULTS *)query_res->QueryResult;
-
+        svc = (SERVICE_RECORD_RESULTS *)qrslt->QueryResult;
         if (svc->NumServiceRecords < 1) {
-                CERROR ("%d service records\n", svc->NumServiceRecords);
-                kibnal_connreq_done (conn, 1, -EINVAL);
+                CERROR ("lookup %s failed: no service records\n",
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
                 return;
         }
 
-        dump_service_records(svc);
-
-        conn->ibc_connreq->cr_service = svc->ServiceRecords[0];
-        nid = *kibnal_service_nid_field(&conn->ibc_connreq->cr_service);
-        
-        CDEBUG(D_NET, "Got status %d, service id "LPX64", on %s\n",
-               query_res->Status , conn->ibc_connreq->cr_service.RID.ServiceID, 
-               libcfs_nid2str(nid));
+        /* kibnal_dump_service_records(svc); */
+        conn->ibc_cvars->cv_svcrec = svc->ServiceRecords[0];
 
-        memset(&path_query, 0, sizeof(path_query));
-        path_query.InputType = InputTypePortGuidPair;
-        path_query.OutputType = OutputTypePathRecord;
-        path_query.InputValue.PortGuidPair.SourcePortGuid = kibnal_data.kib_port_guid;
-        path_query.InputValue.PortGuidPair.DestPortGuid  = conn->ibc_connreq->cr_service.RID.ServiceGID.Type.Global.InterfaceID;
+        qry = &conn->ibc_cvars->cv_query;
+        memset(qry, 0, sizeof(*qry));
 
-        memset(&sd_params, 0, sizeof(sd_params));
-        sd_params.RetryCount = IBNAL_RETRY;
-        sd_params.Timeout = 10 * 1000;   /* wait 10 seconds */
+        qry->OutputType = OutputTypePathRecord;
+        qry->InputType = InputTypePortGuidPair;
 
-        /* kibnal_service_get_callback gets my conn ref */
+        qry->InputValue.PortGuidPair.SourcePortGuid = 
+                kibnal_data.kib_port_guid;
+        qry->InputValue.PortGuidPair.DestPortGuid  = 
+                conn->ibc_cvars->cv_svcrec.RID.ServiceGID.Type.Global.InterfaceID;
 
+        /* kibnal_pathreq_callback gets my conn ref */
         frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
                                                     kibnal_data.kib_port_guid,
-                                                    &path_query, 
+                                                    qry, 
                                                     kibnal_pathreq_callback,
-                                                    &sd_params, conn);
+                                                    &kibnal_data.kib_sdretry,
+                                                    conn);
         if (frc == FPENDING)
                 return;
 
-        CERROR ("Path record request failed: %d\n", frc);
-        kibnal_connreq_done (conn, 1, -EINVAL);
+        CERROR ("pathreq %s failed: %d\n", 
+                libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+        kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
 }
 
-static void
+void
 kibnal_connect_peer (kib_peer_t *peer)
 {
-        COMMAND_CONTROL_PARAMETERS sd_params;
-        QUERY   query;
-        FSTATUS frc;
-        kib_conn_t  *conn = kibnal_create_conn();
+        QUERY                     *qry;
+        FSTATUS                    frc;
+        kib_conn_t                *conn;
 
         LASSERT (peer->ibp_connecting != 0);
 
+        conn = kibnal_create_conn(peer->ibp_nid);
         if (conn == NULL) {
                 CERROR ("Can't allocate conn\n");
                 kibnal_peer_connect_failed (peer, 1, -ENOMEM);
@@ -2617,59 +2592,49 @@ kibnal_connect_peer (kib_peer_t *peer)
         }
 
         conn->ibc_peer = peer;
-        kib_peer_addref(peer);
-
-        LIBCFS_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
-        if (conn->ibc_connreq == NULL) {
-                CERROR ("Can't allocate connreq\n");
-                kibnal_connreq_done (conn, 1, -ENOMEM);
-                return;
-        }
+        kibnal_peer_addref(peer);
 
-        memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq));
+        qry = &conn->ibc_cvars->cv_query;
+        memset(qry, 0, sizeof(*qry));
 
-        kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);
+        qry->OutputType = OutputTypeServiceRecord;
+        qry->InputType = InputTypeServiceRecord;
 
-        memset(&query, 0, sizeof(query));
-        query.InputType = InputTypeServiceRecord;
-        query.OutputType = OutputTypeServiceRecord;
-        query.InputValue.ServiceRecordValue.ServiceRecord = conn->ibc_connreq->cr_service;
-        query.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
-
-        memset(&sd_params, 0, sizeof(sd_params));
-        sd_params.RetryCount = IBNAL_RETRY;
-        sd_params.Timeout = 10 * 1000;   /* wait 10 seconds */
+        qry->InputValue.ServiceRecordValue.ComponentMask = 
+                KIBNAL_SERVICE_KEY_MASK;
+        kibnal_set_service_keys(
+                &qry->InputValue.ServiceRecordValue.ServiceRecord, 
+                peer->ibp_nid);
 
         /* kibnal_service_get_callback gets my conn ref */
         frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
                                                     kibnal_data.kib_port_guid,
-                                                    &query, 
-                                                kibnal_service_get_callback, 
-                                                    &sd_params, conn);
+                                                    qry,
+                                                    kibnal_service_get_callback,
+                                                    &kibnal_data.kib_sdretry, 
+                                                    conn);
         if (frc == FPENDING)
                 return;
 
-        CERROR ("iibt_sd_query_port_fabric_information(): %d\n", frc);
-        kibnal_connreq_done (conn, 1, frc);
+        CERROR("Lookup %s failed: %d\n", libcfs_nid2str(peer->ibp_nid), frc);
+        kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
 }
 
-static int
+int
 kibnal_conn_timed_out (kib_conn_t *conn)
 {
         kib_tx_t          *tx;
         struct list_head  *ttmp;
-        unsigned long      flags;
 
-        spin_lock_irqsave (&conn->ibc_lock, flags);
+        spin_lock(&conn->ibc_lock);
 
         list_for_each (ttmp, &conn->ibc_tx_queue) {
                 tx = list_entry (ttmp, kib_tx_t, tx_list);
 
-                LASSERT (!tx->tx_passive_rdma_wait);
-                LASSERT (tx->tx_sending == 0);
+                LASSERT (tx->tx_queued);
 
                 if (time_after_eq (jiffies, tx->tx_deadline)) {
-                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        spin_unlock(&conn->ibc_lock);
                         return 1;
                 }
         }
@@ -2677,24 +2642,21 @@ kibnal_conn_timed_out (kib_conn_t *conn)
         list_for_each (ttmp, &conn->ibc_active_txs) {
                 tx = list_entry (ttmp, kib_tx_t, tx_list);
 
-                LASSERT (tx->tx_passive_rdma ||
-                         !tx->tx_passive_rdma_wait);
-
-                LASSERT (tx->tx_passive_rdma_wait ||
+                LASSERT (!tx->tx_queued);
+                LASSERT (tx->tx_waiting ||
                          tx->tx_sending != 0);
 
                 if (time_after_eq (jiffies, tx->tx_deadline)) {
-                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        spin_unlock(&conn->ibc_lock);
                         return 1;
                 }
         }
 
-        spin_unlock_irqrestore (&conn->ibc_lock, flags);
-
+        spin_unlock(&conn->ibc_lock);
         return 0;
 }
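
/* Both kibnal_conn_timed_out() above and the connd's later
 * "(int)(deadline - jiffies)" test rely on wrap-safe tick comparison.  A
 * stand-alone sketch of that idiom, assuming the usual two's-complement
 * behaviour that the kernel's time_after_eq() also relies on. */
#include <stdio.h>

static int
deadline_passed(unsigned long now, unsigned long deadline)
{
        /* >= 0 means the deadline has passed, even if 'now' wrapped */
        return (long)(now - deadline) >= 0;
}

int
main(void)
{
        unsigned long near_wrap = ~0UL - 5;       /* counter about to wrap */
        unsigned long deadline  = near_wrap + 10; /* wraps to a small value */

        printf("10 ticks early: %d\n", deadline_passed(near_wrap, deadline));      /* 0 */
        printf("10 ticks late:  %d\n", deadline_passed(near_wrap + 20, deadline)); /* 1 */
        return 0;
}
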
 
-static void
+void
 kibnal_check_conns (int idx)
 {
         struct list_head  *peers = &kibnal_data.kib_peers[idx];
@@ -2716,7 +2678,7 @@ kibnal_check_conns (int idx)
                 list_for_each (ctmp, &peer->ibp_conns) {
                         conn = list_entry (ctmp, kib_conn_t, ibc_list);
 
-                        KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_ESTABLISHED);
+                        LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
 
                         /* In case we have enough credits to return via a
                          * NOOP, but there were no non-blocking tx descs
@@ -2725,13 +2687,13 @@ kibnal_check_conns (int idx)
 
                         if (!kibnal_conn_timed_out(conn))
                                 continue;
+
+                        /* Handle timeout by closing the whole connection.  We
+                         * can only be sure RDMA activity has ceased once the
+                         * QP has been modified. */
                         
-                        CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
-                               conn, conn->ibc_state, 
-                               libcfs_nid2str(peer->ibp_nid),
-                               atomic_read (&conn->ibc_refcount));
+                        kibnal_conn_addref(conn); /* 1 ref for me... */
 
-                        atomic_inc (&conn->ibc_refcount);
                         read_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                                flags);
 
@@ -2739,7 +2701,7 @@ kibnal_check_conns (int idx)
                                libcfs_nid2str(peer->ibp_nid));
 
                         kibnal_close_conn (conn, -ETIMEDOUT);
-                        kibnal_put_conn (conn);
+                        kibnal_conn_decref(conn); /* ...until here */
 
                         /* start again now I've dropped the lock */
                         goto again;
@@ -2749,37 +2711,32 @@ kibnal_check_conns (int idx)
         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
 }
 
-static void
-kib_connd_handle_state(kib_conn_t *conn)
+void
+kibnal_disconnect_conn (kib_conn_t *conn)
 {
-        FSTATUS frc;
-
-        switch (conn->ibc_state) {
-                /* all refs have gone, free and be done with it */ 
-                case IBNAL_CONN_DISCONNECTED:
-                        kibnal_destroy_conn (conn);
-                        return; /* avoid put_conn */
-
-                case IBNAL_CONN_SEND_DREQ:
-                        frc = iibt_cm_disconnect(conn->ibc_cep, NULL, NULL);
-                        if (frc != FSUCCESS) /* XXX do real things */
-                                CERROR("disconnect failed: %d\n", frc);
-                        conn->ibc_state = IBNAL_CONN_DREQ;
-                        break;
+        FSTATUS       frc;
 
-                /* a callback got to the conn before we did */ 
-                case IBNAL_CONN_DREP:
-                        break;
-                                
-                default:
-                        CERROR ("Bad conn %p state: %d\n", conn, 
-                                conn->ibc_state);
-                        LBUG();
-                        break;
-        }
+        LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTING);
 
-        /* drop ref from close_conn */
-        kibnal_put_conn(conn);
+        kibnal_conn_disconnected(conn);
+                
+        frc = iibt_cm_disconnect(conn->ibc_cep, NULL, NULL);
+        switch (frc) {
+        case FSUCCESS:
+                break;
+                
+        case FINSUFFICIENT_RESOURCES:
+                CERROR("ENOMEM disconnecting %s\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                /* This might cause the module to become unloadable since the
+                 * FCM_DISCONNECTED callback is still outstanding */
+                break;
+                
+        default:
+                CERROR("Unexpected error disconnecting %s: %d\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+                LBUG();
+        }
 }
 
 int
@@ -2791,6 +2748,7 @@ kibnal_connd (void *arg)
         kib_peer_t        *peer;
         int                timeout;
         int                i;
+        int                did_something;
         int                peer_index = 0;
         unsigned long      deadline = jiffies;
         
@@ -2799,19 +2757,34 @@ kibnal_connd (void *arg)
 
         init_waitqueue_entry (&wait, current);
 
-        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
+
+        while (!kibnal_data.kib_shutdown) {
+                did_something = 0;
+
+                if (!list_empty (&kibnal_data.kib_connd_zombies)) {
+                        conn = list_entry (kibnal_data.kib_connd_zombies.next,
+                                           kib_conn_t, ibc_list);
+                        list_del (&conn->ibc_list);
+                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+                        did_something = 1;
+
+                        kibnal_destroy_conn(conn);
+
+                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+                }
 
-        for (;;) {
                 if (!list_empty (&kibnal_data.kib_connd_conns)) {
                         conn = list_entry (kibnal_data.kib_connd_conns.next,
                                            kib_conn_t, ibc_list);
                         list_del (&conn->ibc_list);
-                        
                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
-                        kib_connd_handle_state(conn);
+                        did_something = 1;
 
+                        kibnal_disconnect_conn(conn);
+                        kibnal_conn_decref(conn);
+                        
                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
-                        continue;
                 }
 
                 if (!list_empty (&kibnal_data.kib_connd_peers)) {
@@ -2820,26 +2793,22 @@ kibnal_connd (void *arg)
                         
                         list_del_init (&peer->ibp_connd_list);
                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+                        did_something = 1;
 
                         kibnal_connect_peer (peer);
-                        kib_peer_decref (peer);
+                        kibnal_peer_decref (peer);
 
                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
                 }
 
-                /* shut down and nobody left to reap... */
-                if (kibnal_data.kib_shutdown &&
-                    atomic_read(&kibnal_data.kib_nconns) == 0)
-                        break;
-
-                spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
-
                 /* careful with the jiffy wrap... */
                 while ((timeout = (int)(deadline - jiffies)) <= 0) {
                         const int n = 4;
                         const int p = 1;
                         int       chunk = kibnal_data.kib_peer_hash_size;
                         
+                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
                         /* Time to check for RDMA timeouts on a few more
                          * peers: I do checks every 'p' seconds on a
                          * proportion of the peer table and I need to check
@@ -2848,9 +2817,9 @@ kibnal_connd (void *arg)
                          * connection within (n+1)/n times the timeout
                          * interval. */
 
-                        if (kibnal_tunables.kib_io_timeout > n * p)
+                        if (*kibnal_tunables.kib_timeout > n * p)
                                 chunk = (chunk * n * p) / 
-                                        kibnal_tunables.kib_io_timeout;
+                                        *kibnal_tunables.kib_timeout;
                         if (chunk == 0)
                                 chunk = 1;
 
@@ -2861,9 +2830,14 @@ kibnal_connd (void *arg)
                         }
 
                         deadline += p * HZ;
+                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+                        did_something = 1;
                 }
 
-                kibnal_data.kib_connd_waketime = jiffies + timeout;
+                if (did_something)
+                        continue;
+
+                spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 
                 set_current_state (TASK_INTERRUPTIBLE);
                 add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
@@ -2885,78 +2859,152 @@ kibnal_connd (void *arg)
         return (0);
 }
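
/* kibnal_connd() above follows a common work-loop shape: drain several
 * queues under one lock, drop the lock while each item is handled, and only
 * sleep when a full pass found nothing to do.  User-space pthread analogue
 * of that shape; the mutex/condvar stand in for the kernel spinlock and
 * waitqueue, and plain integers stand in for conns and peers. */
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define NITEMS 3

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int zombies[NITEMS] = {1, 2, 3}, nzombies = NITEMS;
static int peers[NITEMS]   = {7, 8, 9}, npeers   = NITEMS;
static int shutdown_flag;

static void *
connd(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        while (!shutdown_flag) {
                int did_something = 0;

                if (nzombies > 0) {
                        int item = zombies[--nzombies];
                        pthread_mutex_unlock(&lock);   /* work without the lock */
                        printf("destroying zombie conn %d\n", item);
                        pthread_mutex_lock(&lock);
                        did_something = 1;
                }
                if (npeers > 0) {
                        int item = peers[--npeers];
                        pthread_mutex_unlock(&lock);
                        printf("connecting peer %d\n", item);
                        pthread_mutex_lock(&lock);
                        did_something = 1;
                }
                if (did_something)
                        continue;                      /* rescan before sleeping */

                pthread_cond_wait(&cond, &lock);       /* nothing to do: sleep */
        }
        pthread_mutex_unlock(&lock);
        return NULL;
}

int
main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, connd, NULL);

        /* wait until the work queues are drained, then ask it to stop */
        pthread_mutex_lock(&lock);
        while (nzombies > 0 || npeers > 0) {
                pthread_mutex_unlock(&lock);
                sched_yield();
                pthread_mutex_lock(&lock);
        }
        shutdown_flag = 1;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);

        pthread_join(tid, NULL);
        return 0;
}
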
 
+
+void 
+kibnal_hca_async_callback (void *hca_arg, IB_EVENT_RECORD *ev)
+{
+        /* XXX flesh out.  this seems largely for async errors */
+        CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode);
+}
+
+void
+kibnal_hca_callback (void *hca_arg, void *cq_arg)
+{
+        unsigned long flags;
+
+        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+        kibnal_data.kib_ready = 1;
+        wake_up(&kibnal_data.kib_sched_waitq);
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+}
+
 int
 kibnal_scheduler(void *arg)
 {
-        long            id = (long)arg;
-        char            name[16];
-        kib_rx_t       *rx;
-        kib_tx_t       *tx;
-        unsigned long   flags;
-        int             rc;
-        int             counter = 0;
-        int             did_something;
+        long               id = (long)arg;
+        wait_queue_t       wait;
+        char               name[16];
+        FSTATUS            frc;
+        FSTATUS            frc2;
+        IB_WORK_COMPLETION wc;
+        kib_rx_t          *rx;
+        kib_tx_t          *tx;
+        unsigned long      flags;
+        int                rc;
+        int                did_something;
+        __u64              rxseq = 0;
+        int                busy_loops = 0;
 
         snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
         libcfs_daemonize(name);
         libcfs_blockallsigs();
 
-        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+        init_waitqueue_entry(&wait, current);
 
-        for (;;) {
-                did_something = 0;
+        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
 
-                while (!list_empty(&kibnal_data.kib_sched_txq)) {
-                        tx = list_entry(kibnal_data.kib_sched_txq.next,
-                                        kib_tx_t, tx_list);
-                        list_del(&tx->tx_list);
+        while (!kibnal_data.kib_shutdown) {
+                if (busy_loops++ >= IBNAL_RESCHED) {
                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                                flags);
-                        kibnal_tx_done(tx);
 
-                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
-                                          flags);
+                        our_cond_resched();
+                        busy_loops = 0;
+                        
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
                 }
 
-                if (!list_empty(&kibnal_data.kib_sched_rxq)) {
-                        rx = list_entry(kibnal_data.kib_sched_rxq.next,
-                                        kib_rx_t, rx_list);
-                        list_del(&rx->rx_list);
+                if (kibnal_data.kib_ready &&
+                    !kibnal_data.kib_checking_cq) {
+                        /* take ownership of completion polling */
+                        kibnal_data.kib_checking_cq = 1;
+                        /* Assume I'll exhaust the CQ */
+                        kibnal_data.kib_ready = 0;
                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                                flags);
+                        
+                        frc = iibt_cq_poll(kibnal_data.kib_cq, &wc);
+                        if (frc == FNOT_DONE) {
+                                /* CQ empty */
+                                frc2 = iibt_cq_rearm(kibnal_data.kib_cq,
+                                                     CQEventSelNextWC);
+                                LASSERT (frc2 == FSUCCESS);
+                        }
+                        
+                        if (frc == FSUCCESS &&
+                            kibnal_wreqid2type(wc.WorkReqId) == IBNAL_WID_RX) {
+                                rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.WorkReqId);
+                                
+                                /* Grab the RX sequence number NOW before
+                                 * anyone else can get an RX completion */
+                                rxseq = rx->rx_conn->ibc_rxseq++;
+                        }
+                                
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+                        /* give up ownership of completion polling */
+                        kibnal_data.kib_checking_cq = 0;
 
-                        kibnal_rx(rx);
+                        if (frc == FNOT_DONE)
+                                continue;
 
-                        did_something = 1;
-                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
-                                          flags);
-                }
+                        LASSERT (frc == FSUCCESS);
+                        /* Assume there's more: get another scheduler to check
+                         * while I handle this completion... */
 
-                /* shut down and no receives to complete... */
-                if (kibnal_data.kib_shutdown &&
-                    atomic_read(&kibnal_data.kib_nconns) == 0)
-                        break;
+                        kibnal_data.kib_ready = 1;
+                        wake_up(&kibnal_data.kib_sched_waitq);
 
-                /* nothing to do or hogging CPU */
-                if (!did_something || counter++ == IBNAL_RESCHED) {
                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                                flags);
-                        counter = 0;
-
-                        if (!did_something) {
-                                rc = wait_event_interruptible(
-                                        kibnal_data.kib_sched_waitq,
-                                        !list_empty(&kibnal_data.kib_sched_txq) || 
-                                        !list_empty(&kibnal_data.kib_sched_rxq) || 
-                                        (kibnal_data.kib_shutdown &&
-                                         atomic_read (&kibnal_data.kib_nconns) == 0));
-                        } else {
-                                our_cond_resched();
-                        }
 
-                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
-                                          flags);
+                        switch (kibnal_wreqid2type(wc.WorkReqId)) {
+                        case IBNAL_WID_RX:
+                                kibnal_rx_complete(&wc, rxseq);
+                                break;
+                                
+                        case IBNAL_WID_TX:
+                                kibnal_tx_complete(&wc);
+                                break;
+                                
+                        case IBNAL_WID_RDMA:
+                                /* We only get RDMA completion notification if
+                                 * it fails.  So we just ignore them completely
+                                 * because...
+                                 *
+                                 * 1) If an RDMA fails, all subsequent work
+                                 * items, including the final SEND will fail
+                                 * too, so I'm still guaranteed to notice that
+                                 * this connection is hosed.
+                                 *
+                                 * 2) It's positively dangerous to look inside
+                                 * the tx descriptor obtained from an RDMA work
+                                 * item.  As soon as I drop the kib_sched_lock,
+                                 * I give a scheduler on another CPU a chance
+                                 * to get the final SEND completion, so the tx
+                                 * descriptor can get freed as I inspect it. */
+                                CERROR ("RDMA failed: %d\n", wc.Status);
+                                break;
+
+                        default:
+                                LBUG();
+                        }
+                        
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+                        continue;
                 }
+
+                /* Nothing to do; sleep... */
+
+                set_current_state(TASK_INTERRUPTIBLE);
+                add_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
+                spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+                                       flags);
+
+                schedule();
+
+                remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
+                set_current_state(TASK_RUNNING);
+                spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
         }
 
         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
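
/* The scheduler above dispatches each completion on
 * kibnal_wreqid2type(wc.WorkReqId), with kibnal_wreqid2ptr() recovering the
 * descriptor.  Those helpers are not shown in this hunk; one plausible shape
 * for the pattern -- folding a 2-bit type tag into the low bits of an
 * aligned descriptor pointer -- is sketched here as an assumption, not the
 * actual implementation. */
#include <stdint.h>
#include <stdio.h>
#include <assert.h>

#define WID_RX   0U
#define WID_TX   1U
#define WID_RDMA 2U
#define WID_MASK 3ULL                    /* needs descriptors aligned to >= 4 */

static uint64_t
make_wreqid(void *desc, unsigned type)
{
        uint64_t ptr = (uint64_t)(uintptr_t)desc;

        assert((ptr & WID_MASK) == 0);   /* alignment frees the low bits */
        assert(type <= WID_MASK);
        return ptr | type;
}

static unsigned wreqid2type(uint64_t wid) { return (unsigned)(wid & WID_MASK); }
static void    *wreqid2ptr(uint64_t wid)  { return (void *)(uintptr_t)(wid & ~WID_MASK); }

int
main(void)
{
        long     rx_desc = 0;            /* stand-in for an rx descriptor */
        uint64_t wid = make_wreqid(&rx_desc, WID_RX);

        switch (wreqid2type(wid)) {
        case WID_RX:
                printf("RX completion, descriptor %p\n", wreqid2ptr(wid));
                break;
        case WID_TX:
        case WID_RDMA:
        default:
                printf("other completion\n");
                break;
        }
        return 0;
}
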
diff --git a/lnet/klnds/iiblnd/iiblnd_modparams.c b/lnet/klnds/iiblnd/iiblnd_modparams.c
new file mode 100644 (file)
index 0000000..ab9f0d3
--- /dev/null
@@ -0,0 +1,160 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "iiblnd.h"
+
+static char *ipif_basename = IBNAL_IPIF_BASENAME;
+CFS_MODULE_PARM(ipif_basename, "s", charp, 0444,
+                "IPoIB interface base name");
+
+static char *service_name = IBNAL_SERVICE_NAME;
+CFS_MODULE_PARM(service_name, "s", charp, 0444,
+                "IB service name");
+
+static int service_number = IBNAL_SERVICE_NUMBER;
+CFS_MODULE_PARM(service_number, "i", int, 0444,
+                "IB service number");
+
+static int min_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644,
+               "minimum connection retry interval (seconds)");
+
+static int max_reconnect_interval = IBNAL_MAX_RECONNECT_INTERVAL;
+CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644,
+               "maximum connection retry interval (seconds)");
+
+static int concurrent_peers = IBNAL_CONCURRENT_PEERS;
+CFS_MODULE_PARM(concurrent_peers, "i", int, 0444,
+               "maximum number of peers that may connect");
+
+static int cksum = IBNAL_CKSUM;
+CFS_MODULE_PARM(cksum, "i", int, 0644,
+               "set non-zero to enable message (not RDMA) checksums");
+
+static int timeout = IBNAL_TIMEOUT;
+CFS_MODULE_PARM(timeout, "i", int, 0644,
+               "timeout (seconds)");
+
+static int ntx = IBNAL_NTX;
+CFS_MODULE_PARM(ntx, "i", int, 0444,
+               "# of message descriptors");
+
+static int credits = IBNAL_CREDITS;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+               "# concurrent sends");
+
+static int peer_credits = IBNAL_PEERCREDITS;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+               "# concurrent sends to 1 peer");
+
+static int sd_retries = IBNAL_SD_RETRIES;
+CFS_MODULE_PARM(sd_retries, "i", int, 0444,
+               "# times to retry SD queries");
+
+kib_tunables_t kibnal_tunables = {
+        .kib_ipif_basename          = &ipif_basename,
+        .kib_service_name           = &service_name,
+        .kib_service_number         = &service_number,
+        .kib_min_reconnect_interval = &min_reconnect_interval,
+        .kib_max_reconnect_interval = &max_reconnect_interval,
+        .kib_concurrent_peers       = &concurrent_peers,
+        .kib_cksum                  = &cksum,
+        .kib_timeout                = &timeout,
+        .kib_ntx                    = &ntx,
+        .kib_credits                = &credits,
+        .kib_peercredits            = &peer_credits,
+        .kib_sd_retries             = &sd_retries,
+};
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+
+/* NB max_size specified for proc_dostring entries only needs to be big enough
+ * not to truncate the printout; it only needs to be the actual size of the
+ * string buffer if we allow writes (and we don't) */
+
+static ctl_table kibnal_ctl_table[] = {
+       {1, "ipif_basename", &ipif_basename, 
+         1024, 0444, NULL, &proc_dostring},
+       {2, "service_name", &service_name, 
+         1024, 0444, NULL, &proc_dostring},
+       {3, "service_number", &service_number, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {4, "min_reconnect_interval", &min_reconnect_interval, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {5, "max_reconnect_interval", &max_reconnect_interval, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {6, "concurrent_peers", &concurrent_peers, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {7, "cksum", &cksum, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {8, "timeout", &timeout, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {9, "ntx", &ntx, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {10, "credits", &credits, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {11, "peer_credits", &peer_credits, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {12, "sd_retries", &sd_retries, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {0}
+};
+
+static ctl_table kibnal_top_ctl_table[] = {
+       {203, "openibnal", NULL, 0, 0555, kibnal_ctl_table},
+       {0}
+};
+
+int
+kibnal_tunables_init ()
+{
+       kibnal_tunables.kib_sysctl =
+               register_sysctl_table(kibnal_top_ctl_table, 0);
+       
+       if (kibnal_tunables.kib_sysctl == NULL)
+               CWARN("Can't setup /proc tunables\n");
+
+       return 0;
+}
+
+void
+kibnal_tunables_fini ()
+{
+       if (kibnal_tunables.kib_sysctl != NULL)
+               unregister_sysctl_table(kibnal_tunables.kib_sysctl);
+}
+
+#else
+
+int
+kibnal_tunables_init ()
+{
+       return 0;
+}
+
+void
+kibnal_tunables_fini ()
+{
+}
+
+#endif
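
/* The file above keeps every tunable in an ordinary variable (a module
 * parameter in the kernel build) and publishes one struct of pointers,
 * kibnal_tunables, to the rest of the LND -- which is why the connd code
 * earlier in this change reads *kibnal_tunables.kib_timeout.  Cut-down
 * user-space sketch of the pattern, using a stand-in struct rather than the
 * real kib_tunables_t. */
#include <stdio.h>

static int timeout      = 50;           /* would be a module parameter */
static int peer_credits = 8;            /* would be a module parameter */

typedef struct {
        int *kib_timeout;
        int *kib_peercredits;
} tunables_t;

static tunables_t tunables = {
        .kib_timeout     = &timeout,
        .kib_peercredits = &peer_credits,
};

int
main(void)
{
        /* consumers always dereference the pointer, so a later change to the
         * underlying variable (e.g. via /proc/sys) is seen immediately */
        printf("timeout=%d peer_credits=%d\n",
               *tunables.kib_timeout, *tunables.kib_peercredits);

        timeout = 100;                  /* simulate a runtime tweak */
        printf("timeout now %d\n", *tunables.kib_timeout);
        return 0;
}
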
index 54310c2..7a9da5e 100644 (file)
@@ -1672,7 +1672,7 @@ kibnal_module_init (void)
 }
 
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Kernel OpenIB NAL v1.00");
+MODULE_DESCRIPTION("Kernel OpenIB LND v1.00");
 MODULE_LICENSE("GPL");
 
 module_init(kibnal_module_init);
index a590115..b9324f9 100644 (file)
@@ -1645,7 +1645,7 @@ kranal_module_init (void)
 }
 
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Kernel RapidArray NAL v0.01");
+MODULE_DESCRIPTION("Kernel RapidArray LND v0.01");
 MODULE_LICENSE("GPL");
 
 module_init(kranal_module_init);
index 3009a84..c77096a 100644 (file)
@@ -2148,8 +2148,10 @@ ksocknal_enumerate_interfaces(ksock_net_t *net)
                 int        up;
                 __u32      ip;
                 __u32      mask;
+                
                 if (!strcmp(names[i], "lo")) /* skip the loopback IF */
                         continue;
+                
                 rc = libcfs_ipif_query(names[i], &up, &ip, &mask);
                 if (rc != 0) {
                         CWARN("Can't get interface %s info: %d\n",
@@ -2283,7 +2285,7 @@ ksocknal_module_init (void)
 }
 
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Kernel TCP Socket NAL v1.0.0");
+MODULE_DESCRIPTION("Kernel TCP Socket LND v1.0.0");
 MODULE_LICENSE("GPL");
 
 cfs_module(ksocknal, "1.0.0", ksocknal_module_init, ksocknal_module_fini);
index 14baf37..c4ad266 100644 (file)
@@ -1145,8 +1145,8 @@ kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
         switch(cmd) {
         case IOC_LIBCFS_GET_PEER: {
                 lnet_nid_t   nid = 0;
-                __u32       ip = 0;
-                int         share_count = 0;
+                __u32        ip = 0;
+                int          share_count = 0;
 
                 rc = kibnal_get_peer_info(data->ioc_count,
                                           &nid, &ip, &share_count);
@@ -1498,8 +1498,6 @@ kibnal_shutdown (lnet_ni_t *ni)
                         LASSERT (list_empty (&kibnal_data.kib_peers[i]));
                 }
                 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
-                LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
-                LASSERT (list_empty (&kibnal_data.kib_sched_txq));
                 LASSERT (list_empty (&kibnal_data.kib_connd_zombies));
                 LASSERT (list_empty (&kibnal_data.kib_connd_conns));
                 LASSERT (list_empty (&kibnal_data.kib_connd_pcreqs));
@@ -1655,8 +1653,6 @@ kibnal_startup (lnet_ni_t *ni)
         init_waitqueue_head (&kibnal_data.kib_connd_waitq);
 
         spin_lock_init (&kibnal_data.kib_sched_lock);
-        INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
-        INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
         init_waitqueue_head (&kibnal_data.kib_sched_waitq);
 
         spin_lock_init (&kibnal_data.kib_tx_lock);
@@ -1898,7 +1894,7 @@ kibnal_module_init (void)
 }
 
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Kernel Voltaire IB NAL v1.00");
+MODULE_DESCRIPTION("Kernel Voltaire IB LND v1.00");
 MODULE_LICENSE("GPL");
 
 module_init(kibnal_module_init);
index a894934..9fcac45 100644 (file)
@@ -223,7 +223,6 @@ typedef struct
         cm_cep_handle_t   kib_listen_handle;    /* IB listen handle */
 
         rwlock_t          kib_global_lock;      /* stabilize peer/conn ops */
-        spinlock_t        kib_vverbs_lock;      /* serialize vverbs calls */
         int               kib_ready;            /* CQ callback fired */
         int               kib_checking_cq;      /* a scheduler is checking the CQ */
         
@@ -241,8 +240,6 @@ typedef struct
         spinlock_t        kib_connd_lock;       /* serialise */
 
         wait_queue_head_t kib_sched_waitq;      /* schedulers sleep here */
-        struct list_head  kib_sched_txq;        /* tx requiring attention */
-        struct list_head  kib_sched_rxq;        /* rx requiring attention */
         spinlock_t        kib_sched_lock;       /* serialise */
 
         struct kib_tx    *kib_tx_descs;         /* all the tx descriptors */
@@ -298,7 +295,7 @@ typedef struct kib_tx                           /* transmit message */
         int                       tx_status;    /* completion status */
         unsigned long             tx_deadline;  /* completion deadline */
         __u64                     tx_cookie;    /* completion cookie */
-        lnet_msg_t               *tx_lntmsg[2]; /* ptl msgs to finalize on completion */
+        lnet_msg_t               *tx_lntmsg[2]; /* lnet msgs to finalize on completion */
         vv_l_key_t                tx_lkey;      /* local key for message buffer */
         kib_msg_t                *tx_msg;       /* message buffer (host vaddr) */
         int                       tx_nwrq;      /* # send work items */
@@ -446,12 +443,6 @@ extern int  kibnal_init_rdma(kib_tx_t *tx, int type, int nob,
 extern int  kibnal_tunables_init(void);
 extern void kibnal_tunables_fini(void);
 
-static inline int
-wrq_signals_completion (vv_wr_t *wrq)
-{
-        return wrq->completion_notification != 0;
-}
-
 #define kibnal_conn_addref(conn)                                \
 do {                                                            \
         CDEBUG(D_NET, "conn[%p] (%d)++\n",                      \
index 05e3bd1..1c83872 100644 (file)
@@ -82,7 +82,6 @@ kibnal_get_idle_tx (void)
         
         spin_lock(&kibnal_data.kib_tx_lock);
 
-        /* "normal" descriptor is free */
         if (list_empty (&kibnal_data.kib_idle_txs)) {
                 spin_unlock(&kibnal_data.kib_tx_lock);
                 return NULL;
@@ -412,7 +411,7 @@ kibnal_rx_complete (kib_rx_t *rx, vv_comp_status_t vvrc, int nob, __u64 rxseq)
         unsigned long flags;
         int           rc;
 
-        CDEBUG (D_NET, "rx %p conn %p\n", rx, conn);
+        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
         LASSERT (rx->rx_nob < 0);               /* was posted */
         rx->rx_nob = 0;                         /* isn't now */
 
@@ -834,7 +833,7 @@ void
 kibnal_check_sends (kib_conn_t *conn)
 {
         kib_tx_t       *tx;
-        vv_return_t     vvrc;                        
+        vv_return_t     vvrc;
         int             rc;
         int             done;
 
@@ -1370,7 +1369,7 @@ kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
 
         /* NB 'private' is different depending on what we're sending.... */
 
-        CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", 
+        CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
                payload_nob, payload_niov, libcfs_id2str(target));
 
         LASSERT (payload_nob == 0 || payload_niov > 0);
@@ -1527,7 +1526,6 @@ kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
                 if (nob <= IBNAL_MSG_SIZE)
                         break;                  /* send IMMEDIATE */
 
-                /* may block if caller is app thread */
                 tx = kibnal_get_idle_tx();
                 if (tx == NULL) {
                         CERROR("Can't allocate %s txd for %s\n",
@@ -1610,7 +1608,6 @@ kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
         int          rc = 0;
         
         LASSERT (mlen <= rlen);
-        LASSERT (mlen >= 0);
         LASSERT (!in_interrupt());
         /* Either all pages or all vaddrs */
         LASSERT (!(kiov != NULL && iov != NULL));
@@ -1749,7 +1746,7 @@ kibnal_schedule_conn (kib_conn_t *conn)
 void
 kibnal_close_conn_locked (kib_conn_t *conn, int error)
 {
-        /* This just does the immmediate housekeeping.  'error' is zero for a
+        /* This just does the immediate housekeeping.  'error' is zero for a
          * normal shutdown which can happen only after the connection has been
          * established.  If the connection is established, schedule the
          * connection to be finished off by the connd.  Otherwise the connd is
@@ -1782,7 +1779,6 @@ kibnal_close_conn_locked (kib_conn_t *conn, int error)
                        list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
                        list_empty(&conn->ibc_active_txs) ? "" : "(waiting)",
                        conn->ibc_txseq, conn->ibc_rxseq);
-
 #if 0
                 /* can't skip down the queue without holding ibc_lock (see above) */
                 list_for_each(tmp, &conn->ibc_tx_queue) {
@@ -3012,7 +3008,7 @@ kibnal_check_conns (int idx)
                         
                         kibnal_conn_addref(conn); /* 1 ref for me... */
 
-                        read_unlock_irqrestore(&kibnal_data.kib_global_lock, 
+                        read_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                                flags);
 
                         CERROR("Timed out RDMA with %s\n",
@@ -3095,7 +3091,7 @@ kibnal_connd (void *arg)
         init_waitqueue_entry (&wait, current);
         kibnal_data.kib_connd = current;
 
-        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
 
         while (!kibnal_data.kib_shutdown) {
 
index abc0782..7b4d613 100644 (file)
@@ -184,7 +184,7 @@ lnet_get_portals_compatibility(void)
         return 0;
 }
 
-# if LNET_SINGLE_THREADED
+# if !HAVE_LIBPTHREAD
 
 void lnet_init_locks(void)
 {
@@ -1044,7 +1044,7 @@ lnet_startup_lndnis (void)
                                 the_lnet.ln_eqwaitni = ni;
                         }
                 } else {
-# if LNET_SINGLE_THREADED
+# if !HAVE_LIBPTHREAD
                         LCONSOLE_ERROR("LND %s not supported in a "
                                        "single-threaded runtime\n",
                                        libcfs_lnd2str(lnd_type));
@@ -1096,7 +1096,7 @@ lnet_startup_lndnis (void)
 }
 
 #ifndef __KERNEL__
-# if LNET_SINGLE_THREADED
+# if !HAVE_LIBPTHREAD
 extern lnd_t the_ptllnd;
 # else
 extern lnd_t the_tcplnd;
@@ -1128,7 +1128,7 @@ LNetInit(void)
 #else
         /* Register all LNDs that have been loaded
          * NB the order here determines default 'networks=' order */
-# if LNET_SINGLE_THREADED
+# if !HAVE_LIBPTHREAD
         lnet_register_lnd(&the_ptllnd);
 # else
         lnet_register_lnd(&the_tcplnd);
index a8ebb08..46ff65a 100644 (file)
@@ -309,6 +309,7 @@ lnet_new_text_buf (int str_len)
        lnet_text_buf_t *ltb;
        int              nob;
 
+        /* NB allocate space for the terminating 0 */
        nob = offsetof(lnet_text_buf_t, ltb_text[str_len + 1]);
        if (nob > LNET_SINGLE_TEXTBUF_NOB) {
                /* _way_ conservative for "route net gateway..." */
@@ -326,6 +327,7 @@ lnet_new_text_buf (int str_len)
                return NULL;
 
        ltb->ltb_size = nob;
+        ltb->ltb_text[0] = 0;
        lnet_tbnob += nob;
        return ltb;
 }
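
/* lnet_new_text_buf() sizes its allocation as
 * offsetof(lnet_text_buf_t, ltb_text[str_len + 1]), so callers pass the bare
 * string length and the terminating 0 is accounted for inside the allocator;
 * that is why the call sites in the following hunks drop their "+ 1".
 * Simplified user-space sketch of the convention, with a stand-in struct
 * rather than the real lnet_text_buf_t. */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct {
        int  size;               /* bytes allocated, like ltb_size */
        char text[0];            /* flexible text area, like ltb_text */
} text_buf_t;

static text_buf_t *
new_text_buf(int str_len)
{
        /* room for the header, str_len characters and the trailing 0 */
        int         nob = offsetof(text_buf_t, text[str_len + 1]);
        text_buf_t *tb  = malloc(nob);

        if (tb == NULL)
                return NULL;

        tb->size    = nob;
        tb->text[0] = 0;         /* start empty, as the patch above does */
        return tb;
}

int
main(void)
{
        const char *src = "tcp0(eth0)";
        text_buf_t *tb  = new_text_buf(strlen(src));   /* no "+ 1" at the call site */

        strcpy(tb->text, src);
        printf("stored \"%s\" in a %d-byte buffer\n", tb->text, tb->size);
        free(tb);
        return 0;
}
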
@@ -389,7 +391,7 @@ lnet_str2tbs_sep (struct list_head *tbs, char *str)
 
                nob = sep - str;
                if (nob > 0) {
-                       ltb = lnet_new_text_buf(nob + 1);
+                       ltb = lnet_new_text_buf(nob);
                        if (ltb == NULL) {
                                lnet_free_text_bufs(&pending);
                                return -1;
@@ -435,7 +437,7 @@ lnet_expand1tb (struct list_head *list,
        LASSERT (*sep1 == '[');
        LASSERT (*sep2 == ']');
 
-       ltb = lnet_new_text_buf(len1 + itemlen + len2 + 1);
+       ltb = lnet_new_text_buf(len1 + itemlen + len2);
        if (ltb == NULL)
                return -ENOMEM;
        
index 2c5390a..cafcb86 100644 (file)
@@ -179,7 +179,7 @@ LNetEQPoll (lnet_handle_eq_t *eventqs, int neq, int timeout_ms,
         cfs_waitlink_t   wl;
         cfs_time_t       now;
 #else
-# if !LNET_SINGLE_THREADED
+# if HAVE_LIBPTHREAD
         struct timeval   then;
         struct timeval   now;
         struct timespec  ts;
@@ -246,7 +246,7 @@ LNetEQPoll (lnet_handle_eq_t *eventqs, int neq, int timeout_ms,
                         continue;
                 }
 
-# if LNET_SINGLE_THREADED
+# if !HAVE_LIBPTHREAD
                 LNET_UNLOCK();
                 return -ENOENT;
 # else
index 21cecab..45c2afe 100644 (file)
@@ -57,7 +57,7 @@ lnet_enq_event_locked (lnet_eq_t *eq, lnet_event_t *ev)
         if (cfs_waitq_active(&the_lnet.ln_waitq))
                 cfs_waitq_broadcast(&the_lnet.ln_waitq);
 #else
-# if LNET_SINGLE_THREADED
+# if !HAVE_LIBPTHREAD
         /* LNetEQPoll() calls into _the_ LND to wait for action */
 # else
         /* Wake anyone waiting in LNetEQPoll() */
index c6c8b83..9fb4f35 100644 (file)
@@ -162,6 +162,8 @@ lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid)
 
        do_gettimeofday (&now);
 
+        memset(lp, 0, sizeof(*lp));             /* zero counters etc */
+        
        CFS_INIT_LIST_HEAD(&lp->lp_txq);
         CFS_INIT_LIST_HEAD(&lp->lp_rtrq);
        
index a414461..9872d18 100644 (file)
@@ -437,11 +437,11 @@ ptllnd_startup (lnet_ni_t *ni)
          */
         ni->ni_nid = ptl2lnetnid(ni,plni->plni_portals_id.nid);
 
-        PJK_UT_MSG("ptl  pid=" FMT_NID "\n",plni->plni_portals_id.pid);
+        PJK_UT_MSG("ptl  pid=" FMT_PID "\n",plni->plni_portals_id.pid);
         PJK_UT_MSG("ptl  nid=" FMT_NID "\n",plni->plni_portals_id.nid);
         PJK_UT_MSG("lnet nid=" LPX64 " (passed back)\n",ni->ni_nid);
 
-        CDEBUG(D_INFO,"ptl  pid=" FMT_NID "\n",plni->plni_portals_id.pid);
+        CDEBUG(D_INFO,"ptl  pid=" FMT_PID "\n",plni->plni_portals_id.pid);
         CDEBUG(D_INFO,"ptl  nid=" FMT_NID "\n",plni->plni_portals_id.nid);
         CDEBUG(D_INFO,"lnet nid=" LPX64 "\n",ni->ni_nid);
 
index 7397da1..bf9e33a 100644 (file)
@@ -46,7 +46,7 @@
 # include <sys/utsname.h>
 #endif
 
-#if LNET_SINGLE_THREADED
+#if !HAVE_LIBPTHREAD
 # error "This LND requires a multi-threaded runtime"
 #endif