* landing portals b1_4_sfw on HEAD
author eeb <eeb>
Thu, 23 Dec 2004 10:46:27 +0000 (10:46 +0000)
committer eeb <eeb>
Thu, 23 Dec 2004 10:46:27 +0000 (10:46 +0000)
18 files changed:
lnet/autoconf/lustre-lnet.m4
lnet/include/linux/kp30.h
lnet/klnds/Makefile.in
lnet/klnds/autoMakefile.am
lnet/klnds/iiblnd/iiblnd.c
lnet/klnds/iiblnd/iiblnd.h
lnet/klnds/iiblnd/iiblnd_cb.c
lnet/klnds/ralnd/ralnd.c
lnet/klnds/viblnd/.cvsignore [new file with mode: 0644]
lnet/klnds/viblnd/Makefile.in [new file with mode: 0644]
lnet/klnds/viblnd/Makefile.mk [new file with mode: 0644]
lnet/klnds/viblnd/autoMakefile.am [new file with mode: 0644]
lnet/klnds/viblnd/viblnd.c [new file with mode: 0644]
lnet/klnds/viblnd/viblnd.h [new file with mode: 0644]
lnet/klnds/viblnd/viblnd_cb.c [new file with mode: 0644]
lnet/klnds/viblnd/vibnal_sa.c [new file with mode: 0644]
lnet/libcfs/debug.c
lnet/utils/portals.c

index 41f42b5..a20d639 100644
@@ -187,6 +187,39 @@ AC_SUBST(IIBNAL)
 ])
 
 #
+# LP_CONFIG_VIB
+#
+# check for Voltaire infiniband support
+#
+AC_DEFUN([LP_CONFIG_VIB],
+[AC_MSG_CHECKING([if Voltaire IB kernel headers are present])
+VIBCPPFLAGS="-I/usr/local/include/ibhost-kdevel -DCPU_BE=0 -DCPU_LE=1 -DGSI_PASS_PORT_NUM"
+EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
+EXTRA_KCFLAGS="$EXTRA_KCFLAGS $VIBCPPFLAGS"
+LB_LINUX_TRY_COMPILE([
+        #include <linux/list.h>
+       #include <vverbs.h>
+],[
+        vv_hca_h_t     kib_hca;
+       vv_return_t    retval;
+
+       retval = vv_hca_open("ANY_HCA", NULL, &kib_hca);
+
+       return retval == vv_return_ok ? 0 : 1;
+],[
+       AC_MSG_RESULT([yes])
+       VIBNAL="vibnal"
+],[
+       AC_MSG_RESULT([no])
+       VIBNAL=""
+       VIBCPPFLAGS=""
+])
+EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
+AC_SUBST(VIBCPPFLAGS)
+AC_SUBST(VIBNAL)
+])
+
+#
 # LP_CONFIG_RANAL
 #
 # check whether to use the RapidArray nal
@@ -336,6 +369,7 @@ if test $linux25 = 'no' ; then
        LP_CONFIG_OPENIB
 fi
 LP_CONFIG_IIB
+LP_CONFIG_VIB
 LP_CONFIG_RANAL
 
 LP_STRUCT_PAGE_LIST
@@ -474,6 +508,7 @@ AC_DEFUN([LP_CONDITIONALS],
 AM_CONDITIONAL(BUILD_GMNAL, test x$GMNAL = "xgmnal")
 AM_CONDITIONAL(BUILD_OPENIBNAL, test x$OPENIBNAL = "xopenibnal")
 AM_CONDITIONAL(BUILD_IIBNAL, test x$IIBNAL = "xiibnal")
+AM_CONDITIONAL(BUILD_VIBNAL, test x$VIBNAL = "xvibnal")
 AM_CONDITIONAL(BUILD_RANAL, test x$RANAL = "xranal")
 ])
 
@@ -496,12 +531,14 @@ portals/knals/Makefile
 portals/knals/autoMakefile
 portals/knals/gmnal/Makefile
 portals/knals/gmnal/autoMakefile
+portals/knals/openibnal/Makefile
+portals/knals/openibnal/autoMakefile
 portals/knals/iibnal/Makefile
 portals/knals/iibnal/autoMakefile
+portals/knals/vibnal/Makefile
+portals/knals/vibnal/autoMakefile
 portals/knals/lonal/Makefile
 portals/knals/lonal/autoMakefile
-portals/knals/openibnal/Makefile
-portals/knals/openibnal/autoMakefile
 portals/knals/qswnal/Makefile
 portals/knals/qswnal/autoMakefile
 portals/knals/ranal/Makefile
index 85284ce..e590514 100644
@@ -653,6 +653,7 @@ enum {
         IIBNAL    = 8,
         LONAL     = 9,
         RANAL     = 10,
+        VIBNAL    = 11,
         NAL_ENUM_END_MARKER
 };
 
index 7e2e601..f494a30 100644
@@ -2,6 +2,7 @@
 @BUILD_RANAL_TRUE@subdir-m += ranal
 @BUILD_OPENIBNAL_TRUE@subdir-m += openibnal
 @BUILD_IIBNAL_TRUE@subdir-m += iibnal
+@BUILD_VIBNAL_TRUE@subdir-m += vibnal
 @BUILD_QSWNAL_TRUE@subdir-m += qswnal
 subdir-m += socknal
 subdir-m += lonal
index 4638188..d28e365 100644
@@ -3,4 +3,4 @@
 # This code is issued under the GNU General Public License.
 # See the file COPYING in this distribution
 
-SUBDIRS = gmnal iibnal openibnal qswnal socknal lonal ranal
+SUBDIRS = lonal socknal qswnal gmnal openibnal iibnal vibnal ranal
index 09908c9..e59d066 100644
@@ -1381,7 +1381,7 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         kibnal_data.kib_init = IBNAL_INIT_DATA;
         /*****************************************************/
 
-        process_id.pid = 0;
+        process_id.pid = requested_pid;
         process_id.nid = kibnal_data.kib_nid;
         
         rc = lib_init(&kibnal_lib, nal, process_id,
@@ -1690,7 +1690,7 @@ kibnal_module_init (void)
         }
 
         /* Pure gateways want the NAL started up at module load time... */
-        rc = PtlNIInit(IIBNAL, 0, NULL, NULL, &kibnal_ni);
+        rc = PtlNIInit(IIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
         if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
                 ptl_unregister_nal(IIBNAL);
                 return (-ENODEV);
index 3242158..4f04542 100644
@@ -247,7 +247,6 @@ typedef struct
 
 typedef struct
 {
-        __u32                 rd_key;           /* remote key */
         __u32                 rd_nob;           /* # of bytes */
         __u64                 rd_addr;          /* remote io vaddr */
 } kib_rdma_desc_t __attribute__((packed));
@@ -267,6 +266,7 @@ typedef struct
         ptl_hdr_t         ibrm_hdr;             /* portals header */
         __u64             ibrm_cookie;          /* opaque completion cookie */
         __u32             ibrm_num_descs;       /* how many descs */
+        __u32             rd_key;               /* remote key */
         kib_rdma_desc_t   ibrm_desc[0];         /* where to suck/blow */
 } kib_rdma_msg_t __attribute__((packed));
 
@@ -317,7 +317,7 @@ typedef struct kib_rx                           /* receive message */
         __u64                     rx_vaddr;     /* pre-mapped buffer (hca vaddr) */
         kib_msg_t                *rx_msg;       /* pre-mapped buffer (host vaddr) */
         IB_WORK_REQ               rx_wrq;
-        IB_LOCAL_DATASEGMENT      rx_gl;        /* and it's memory */
+        IB_LOCAL_DATASEGMENT      rx_gl;        /* and its memory */
 } kib_rx_t;
 
 typedef struct kib_tx                           /* transmit message */
@@ -370,7 +370,6 @@ typedef struct kib_connreq
         IB_PATH_RECORD                      cr_path;
         CM_REQUEST_INFO                     cr_cmreq;
         CM_CONN_INFO                        cr_discarded;
-        CM_REJECT_INFO                      cr_rej_info;
 } kib_connreq_t;
 
 typedef struct kib_conn
index a827ba5..16ed937 100644
@@ -485,17 +485,20 @@ kibnal_rx_callback (IB_WORK_COMPLETION *wc)
                         goto failed;
                 }
 
+                if (flipped) {
+                        __swab32(msg->ibm_u.rdma.rd_key);
+                }
+
                 for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) {
                         kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i];
 
                         if (flipped) {
-                                __swab32(desc->rd_key);
                                 __swab32(desc->rd_nob);
                                 __swab64(desc->rd_addr);
                         }
 
                         CDEBUG(D_NET, "  key %x, " "addr "LPX64", nob %u\n",
-                               desc->rd_key, desc->rd_addr, desc->rd_nob);
+                               msg->ibm_u.rdma.rd_key, desc->rd_addr, desc->rd_nob);
                 }
                 break;
                         
@@ -628,9 +631,9 @@ kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset,
 
         desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs];
         if (active)
-                desc->rd_key = kibnal_data.kib_md.md_lkey;
+                ibrm->rd_key = kibnal_data.kib_md.md_lkey;
         else
-                desc->rd_key = kibnal_data.kib_md.md_rkey;
+                ibrm->rd_key = kibnal_data.kib_md.md_rkey;
         desc->rd_nob = len; /*PAGE_SIZE - kiov->kiov_offset; */
         desc->rd_addr = kibnal_page2phys(page) + page_offset +
                         kibnal_data.kib_md.md_addr;
@@ -845,7 +848,7 @@ kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
                 tx->tx_mapped = KIB_TX_MAPPED;
 #endif
         } else {
-                CERROR ("Can't map phys: %d\n", rc);
+                CERROR ("Can't map phys: %d\n", frc);
                 rc = -EFAULT;
         }
 
@@ -1090,6 +1093,10 @@ kibnal_ca_callback (void *ca_arg, void *cq_arg)
 
         for(;;) {
                 while (iibt_cq_poll(cq, &wc) == FSUCCESS) {
+
+                        /* We will need to rearm the CQ to avoid a potential race. */
+                        armed = 0;
+                        
                         if (kibnal_wreqid_is_rx(wc.WorkReqId))
                                 kibnal_rx_callback(&wc);
                         else
@@ -1306,7 +1313,7 @@ kibnal_start_passive_rdma (int type, ptl_nid_t nid,
         ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
         /* map_kiov alrady filled the rdma descs for the whole_mem case */
         if (!kibnal_whole_mem()) {
-                ibmsg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_rkey;
+                ibmsg->ibm_u.rdma.rd_key = tx->tx_md.md_rkey;
                 ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
                 ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
                 ibmsg->ibm_u.rdma.ibrm_num_descs = 1;
@@ -1408,7 +1415,7 @@ kibnal_start_active_rdma (int type, int status,
         } 
 
         if (!kibnal_whole_mem()) {
-                tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_lkey;
+                tx->tx_msg->ibm_u.rdma.rd_key = tx->tx_md.md_lkey;
                 tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
                 tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
                 tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1;
@@ -1439,7 +1446,7 @@ kibnal_start_active_rdma (int type, int status,
 
                 ds->Address = ldesc->rd_addr;
                 ds->Length  = ldesc->rd_nob;
-                ds->Lkey    = ldesc->rd_key;
+                ds->Lkey    = tx->tx_msg->ibm_u.rdma.rd_key;
 
                 memset(wrq, 0, sizeof(*wrq));
                 wrq->WorkReqId      = kibnal_ptr2wreqid(tx, 0);
@@ -1453,7 +1460,7 @@ kibnal_start_active_rdma (int type, int status,
                 wrq->Req.SendRC.Options.s.ImmediateData          = 0;
                 wrq->Req.SendRC.Options.s.Fence                  = 0;
                 wrq->Req.SendRC.RemoteDS.Address = rdesc->rd_addr;
-                wrq->Req.SendRC.RemoteDS.Rkey = rdesc->rd_key;
+                wrq->Req.SendRC.RemoteDS.Rkey = rxmsg->ibm_u.rdma.rd_key;
 
                 /* only the last rdma post triggers tx completion */
                 if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1)
@@ -2394,7 +2401,9 @@ kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
         PORTAL_ALLOC(rep, sizeof(*rep));
         PORTAL_ALLOC(rcv, sizeof(*rcv));
         if (rep == NULL || rcv == NULL) {
-                CERROR ("can't reply and receive buffers\n");
+                if (rep) PORTAL_FREE(rep, sizeof(*rep));
+                if (rcv) PORTAL_FREE(rcv, sizeof(*rcv));
+                CERROR ("can't allocate reply and receive buffers\n");
                 GOTO(out, reason = RC_INSUFFICIENT_RESP_RES);
         }
 
index 2873caa..02c3363 100644
@@ -696,7 +696,7 @@ kranal_active_conn_handshake(kra_peer_t *peer, kra_conn_t **connp)
 
         /* spread connections over all devices using both peer NIDs to ensure
          * all nids use all devices */
-        idx = (peer->rap_nid + kranal_lib.libnal_ni.ni_pid.nid)
+        idx = peer->rap_nid + kranal_lib.libnal_ni.ni_pid.nid;
         dev = &kranal_data.kra_devices[idx % kranal_data.kra_ndevs];
 
         rc = kranal_create_conn(&conn, dev);
@@ -1550,7 +1550,7 @@ kranal_cmd(struct portals_cfg *pcfg, void * private)
                 else {
                         rc = 0;
                         pcfg->pcfg_nid   = conn->rac_peer->rap_nid;
-                        pcfg->pcfg_id    = 0;
+                        pcfg->pcfg_id    = conn->rac_device->rad_id;
                         pcfg->pcfg_misc  = 0;
                         pcfg->pcfg_flags = 0;
                         kranal_conn_decref(conn);
diff --git a/lnet/klnds/viblnd/.cvsignore b/lnet/klnds/viblnd/.cvsignore
new file mode 100644
index 0000000..5ed596b
--- /dev/null
@@ -0,0 +1,10 @@
+.deps
+Makefile
+.*.cmd
+autoMakefile.in
+autoMakefile
+*.ko
+*.mod.c
+.*.flags
+.tmp_versions
+.depend
diff --git a/lnet/klnds/viblnd/Makefile.in b/lnet/klnds/viblnd/Makefile.in
new file mode 100644
index 0000000..fd7bb05
--- /dev/null
@@ -0,0 +1,6 @@
+MODULES := kvibnal
+kvibnal-objs := vibnal.o vibnal_cb.o vibnal_sa.o
+
+EXTRA_POST_CFLAGS := @VIBCPPFLAGS@
+
+@INCLUDE_RULES@
diff --git a/lnet/klnds/viblnd/Makefile.mk b/lnet/klnds/viblnd/Makefile.mk
new file mode 100644
index 0000000..d08633a
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include $(src)/../../Kernelenv
+
+obj-y += kvibnal.o
+kvibnal-objs := vibnal.o vibnal_cb.o vibnal_sa.o
+
diff --git a/lnet/klnds/viblnd/autoMakefile.am b/lnet/klnds/viblnd/autoMakefile.am
new file mode 100644
index 0000000..eb65412
--- /dev/null
@@ -0,0 +1,15 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+if MODULES
+if !CRAY_PORTALS
+if BUILD_VIBNAL
+modulenet_DATA = kvibnal$(KMODEXT)
+endif
+endif
+endif
+
+MOSTLYCLEANFILES = *.o *.ko *.mod.c
+DIST_SOURCES = $(kvibnal-objs:%.o=%.c) vibnal.h
diff --git a/lnet/klnds/viblnd/viblnd.c b/lnet/klnds/viblnd/viblnd.c
new file mode 100644
index 0000000..50e1149
--- /dev/null
@@ -0,0 +1,1693 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *   Author: Frank Zago <fzago@systemfabricworks.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "vibnal.h"
+
+nal_t                   kibnal_api;
+ptl_handle_ni_t         kibnal_ni;
+kib_tunables_t          kibnal_tunables;
+
+kib_data_t              kibnal_data = {
+        .kib_service_id = IBNAL_SERVICE_NUMBER,
+};
+
+#ifdef CONFIG_SYSCTL
+#define IBNAL_SYSCTL             202
+
+#define IBNAL_SYSCTL_TIMEOUT     1
+
+static ctl_table kibnal_ctl_table[] = {
+        {IBNAL_SYSCTL_TIMEOUT, "timeout", 
+         &kibnal_tunables.kib_io_timeout, sizeof (int),
+         0644, NULL, &proc_dointvec},
+        { 0 }
+};
+
+static ctl_table kibnal_top_ctl_table[] = {
+        {IBNAL_SYSCTL, "vibnal", NULL, 0, 0555, kibnal_ctl_table},
+        { 0 }
+};
+#endif
+
+#ifdef unused
+void
+print_service(IB_SERVICE_RECORD *service, char *tag, int rc)
+{
+        char name[32];
+
+        if (service == NULL) 
+        {
+                CWARN("tag       : %s\n"
+                      "status    : %d (NULL)\n", tag, rc);
+                return;
+        }
+        strncpy (name, service->ServiceName, sizeof(name)-1);
+        name[sizeof(name)-1] = 0;
+        
+        CWARN("tag       : %s\n"
+              "status    : %d\n"
+              "service id: "LPX64"\n"
+              "name      : %s\n"
+              "NID       : "LPX64"\n", tag, rc,
+              service->RID.ServiceID, name,
+              *kibnal_service_nid_field(service));
+}
+#endif
+
+/* 
+ * method is SUBN_ADM_SET, SUBN_ADM_GET, SUBN_ADM_DELETE. Tables not supported.
+ * nid is the nid to advertize/query/unadvertize
+ */
+static void fill_sa_request(struct sa_request *request, int method, ptl_nid_t nid)
+{
+        gsi_dtgrm_t *dtgrm = request->dtgrm_req;
+        sa_mad_v2_t *mad = (sa_mad_v2_t *) dtgrm->mad;
+        ib_service_record_v2_t *sr = (ib_service_record_v2_t *) mad->payload;
+        
+        memset(mad, 0, MAD_BLOCK_SIZE);
+
+        request->mad = mad;
+
+        dtgrm->rlid = kibnal_data.kib_port_attr.port_sma_address_info.sm_lid;
+        dtgrm->sl = kibnal_data.kib_port_attr.port_sma_address_info.service_level;
+
+        mad->hdr.base_ver = MAD_IB_BASE_VERSION;
+        mad->hdr.class = MAD_CLASS_SUBN_ADM;
+        mad->hdr.class_ver = 2;
+        mad->hdr.m.ms.method = method;
+        mad->hdr.attrib_id = SA_SERVICE_RECORD; /* something(?) will swap that field */
+
+               /* Note: the transaction ID is set by the Voltaire stack if it is 0. */
+
+        /* TODO: change the 40 to sizeof(something) */
+        mad->payload_len = cpu_to_be32(0x40 /*header size */  +
+                                       sizeof (ib_service_record_v2_t));
+
+
+        mad->component_mask = cpu_to_be64(
+                                          (1ull << 0)  |       /* service_id       */
+                                          (1ull << 2)  |       /* service_pkey     */
+                                          (1ull << 6)  |       /* service_name     */
+                                          (1ull << 7)  |       /* service_data8[0] */
+                                          (1ull << 8)  |       /* service_data8[1] */
+                                          (1ull << 9)  |       /* service_data8[2] */
+                                          (1ull << 10) |       /* service_data8[3] */
+                                          (1ull << 11) |       /* service_data8[4] */
+                                          (1ull << 12) |       /* service_data8[5] */
+                                          (1ull << 13) |       /* service_data8[6] */
+                                          (1ull << 14)      /* service_data8[7] */
+                                          );
+
+        sr->service_id = cpu_to_be64(kibnal_data.kib_service_id);
+        sr->service_pkey = cpu_to_be16(kibnal_data.kib_port_pkey);
+
+        /* Set the service name and the data (bytes 0 to 7) in data8 */
+        kibnal_set_service_keys(sr, nid);
+
+        if (method == SUBN_ADM_SET) {
+                mad->component_mask |= cpu_to_be64(
+                                                   (1ull << 1) |       /* service_gid       */
+                                                   (1ull << 4)         /* service_lease     */
+                                                   );
+
+                sr->service_gid = kibnal_data.kib_port_gid;
+                gid_swap(&sr->service_gid);
+                sr->service_lease = cpu_to_be32(0xffffffff);
+        }
+
+        CDEBUG(D_NET, "SA request %02x for service id "LPX64" %s:"LPX64"\n",
+               mad->hdr.m.ms.method,
+               sr->service_id, 
+               sr->service_name,
+               *kibnal_service_nid_field(sr));
+}
+
+/* Do an advertizement operation: 
+ *   SUBN_ADM_GET = 0x01 (i.e. query),
+ *   SUBN_ADM_SET = 0x02 (i.e. advertize),
+ *   SUBN_ADM_DELETE = 0x15 (i.e. un-advertize).
+ * If callback is NULL, the function is synchronous (and context is ignored).
+ */
+int kibnal_advertize_op(ptl_nid_t nid, int op, sa_request_cb_t callback, void *context)
+{
+        struct sa_request *request;
+        int ret;
+
+        LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+
+        CDEBUG(D_NET, "kibnal_advertize_op: nid="LPX64", op=%d\n", nid, op);
+
+        request = alloc_sa_request();
+        if (request == NULL) {
+                CERROR("Cannot allocate a SA request");
+                return -ENOMEM;
+        }
+                
+        fill_sa_request(request, op, nid);
+
+        if (callback) {
+                request->callback = callback;
+                request->context = context;
+        } else {
+                init_completion(&request->signal);
+        }
+
+        ret = vibnal_start_sa_request(request);
+        if (ret) {
+                CERROR("vibnal_send_sa failed: %d\n", ret);
+                free_sa_request(request);
+        } else {
+                if (callback) {
+                        /* Return. The callback will have to free the SA request. */
+                        ret = 0;
+                } else {
+                        wait_for_completion(&request->signal);
+
+                        ret = request->status;
+
+                        if (ret != 0) {
+                                CERROR ("Error %d in advertising operation %d for NID "LPX64"\n",
+                                        ret, op, kibnal_data.kib_nid);
+                        }
+                        
+                        free_sa_request(request);
+                }
+        }
+
+        return ret;
+}
+
+static int
+kibnal_set_mynid(ptl_nid_t nid)
+{
+        struct timeval tv;
+        lib_ni_t      *ni = &kibnal_lib.libnal_ni;
+        int            rc;
+        vv_return_t    retval;
+
+        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
+               nid, ni->ni_pid.nid);
+
+        do_gettimeofday(&tv);
+
+        down (&kibnal_data.kib_nid_mutex);
+
+        if (nid == kibnal_data.kib_nid) {
+                /* no change of NID */
+                up (&kibnal_data.kib_nid_mutex);
+                return (0);
+        }
+
+        CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
+               kibnal_data.kib_nid, nid);
+
+        /* Unsubscribes the current NID */
+        if (kibnal_data.kib_nid != PTL_NID_ANY) {
+
+                rc = kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_DELETE, NULL, NULL);
+
+                if (rc) {
+                        CERROR("Error %d unadvertising NID "LPX64"\n",
+                               rc, kibnal_data.kib_nid);
+                }
+        }
+        
+        kibnal_data.kib_nid = ni->ni_pid.nid = nid;
+        kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
+        /* Destroys the current endpoint, if any. */
+        if (kibnal_data.kib_cep) {
+                retval = cm_cancel(kibnal_data.kib_cep);
+                if (retval)
+                        CERROR ("Error %d stopping listener\n", retval);
+        
+                retval = cm_destroy_cep(kibnal_data.kib_cep);
+                if (retval)
+                        CERROR ("Error %d destroying CEP\n", retval);
+        
+                kibnal_data.kib_cep = NULL;
+        }
+        
+        /* Delete all existing peers and their connections after new
+         * NID/incarnation set to ensure no old connections in our brave
+         * new world. */
+        kibnal_del_peer (PTL_NID_ANY, 0);
+
+        if (kibnal_data.kib_nid == PTL_NID_ANY) {
+                /* No new NID to install. The driver is shutting down. */
+                up (&kibnal_data.kib_nid_mutex);
+                return (0);
+        }
+
+        /* remove any previous advert (crashed node etc) */
+        kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_DELETE, NULL, NULL);
+
+        kibnal_data.kib_cep = cm_create_cep(cm_cep_transp_rc);
+        if (kibnal_data.kib_cep == NULL) {
+                CERROR ("Can't create CEP\n");
+                rc = -ENOMEM;
+        } else {
+                cm_return_t cmret;
+                cm_listen_data_t info;
+
+                CDEBUG(D_NET, "Created CEP %p for listening\n", kibnal_data.kib_cep);
+
+                memset(&info, 0, sizeof(info));
+                info.listen_addr.end_pt.sid = kibnal_data.kib_service_id;
+
+                cmret = cm_listen(kibnal_data.kib_cep, &info,
+                                  kibnal_listen_callback, NULL);
+                if (cmret) {
+                        CERROR ("cm_listen error: %d\n", cmret);
+                        rc = -EINVAL;
+                } else {
+                        rc = 0;
+                }
+        }
+        
+        if (rc == 0) {
+                rc = kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_SET, NULL, NULL);
+                if (rc == 0) {
+#ifdef IBNAL_CHECK_ADVERT
+                        kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_GET, NULL, NULL);
+#endif
+                        up (&kibnal_data.kib_nid_mutex);
+                        return (0);
+                }
+                
+                retval = cm_cancel (kibnal_data.kib_cep);
+                if (retval)
+                        CERROR("cm_cancel failed: %d\n", retval);
+
+                retval = cm_destroy_cep (kibnal_data.kib_cep);
+                if (retval)
+                        CERROR("cm_destroy_cep failed: %d\n", retval);
+
+                /* remove any peers that sprung up while I failed to
+                 * advertise myself */
+                kibnal_del_peer (PTL_NID_ANY, 0);
+        }
+
+        kibnal_data.kib_nid = PTL_NID_ANY;
+        up (&kibnal_data.kib_nid_mutex);
+        return (rc);
+}
+
+kib_peer_t *
+kibnal_create_peer (ptl_nid_t nid)
+{
+        kib_peer_t *peer;
+
+        LASSERT (nid != PTL_NID_ANY);
+
+        PORTAL_ALLOC(peer, sizeof (*peer));
+        if (peer == NULL) {
+                CERROR("Cannot allocate peer\n");
+                return (NULL);
+        }
+
+        memset(peer, 0, sizeof(*peer));         /* zero flags etc */
+
+        peer->ibp_nid = nid;
+        atomic_set (&peer->ibp_refcount, 1);    /* 1 ref for caller */
+
+        INIT_LIST_HEAD (&peer->ibp_list);       /* not in the peer table yet */
+        INIT_LIST_HEAD (&peer->ibp_conns);
+        INIT_LIST_HEAD (&peer->ibp_tx_queue);
+
+        peer->ibp_reconnect_time = jiffies;
+        peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+
+        atomic_inc (&kibnal_data.kib_npeers);
+        return (peer);
+}
+
+void
+kibnal_destroy_peer (kib_peer_t *peer)
+{
+
+        LASSERT (atomic_read (&peer->ibp_refcount) == 0);
+        LASSERT (peer->ibp_persistence == 0);
+        LASSERT (!kibnal_peer_active(peer));
+        LASSERT (peer->ibp_connecting == 0);
+        LASSERT (list_empty (&peer->ibp_conns));
+        LASSERT (list_empty (&peer->ibp_tx_queue));
+        
+        PORTAL_FREE (peer, sizeof (*peer));
+
+        /* NB a peer's connections keep a reference on their peer until
+         * they are destroyed, so we can be assured that _all_ state to do
+         * with this peer has been cleaned up when its refcount drops to
+         * zero. */
+        atomic_dec (&kibnal_data.kib_npeers);
+}
+
+/* the caller is responsible for accounting for the additional reference
+ * that this creates */
+kib_peer_t *
+kibnal_find_peer_locked (ptl_nid_t nid)
+{
+        struct list_head *peer_list = kibnal_nid2peerlist (nid);
+        struct list_head *tmp;
+        kib_peer_t       *peer;
+
+        list_for_each (tmp, peer_list) {
+
+                peer = list_entry (tmp, kib_peer_t, ibp_list);
+
+                LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
+                         peer->ibp_connecting != 0 || /* creating conns */
+                         !list_empty (&peer->ibp_conns));  /* active conn */
+
+                if (peer->ibp_nid != nid)
+                        continue;
+
+                CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
+                       peer, nid, atomic_read (&peer->ibp_refcount));
+                return (peer);
+        }
+        return (NULL);
+}
+
+kib_peer_t *
+kibnal_get_peer (ptl_nid_t nid)
+{
+        kib_peer_t     *peer;
+
+        read_lock (&kibnal_data.kib_global_lock);
+        peer = kibnal_find_peer_locked (nid);
+        if (peer != NULL)                       /* +1 ref for caller? */
+                kib_peer_addref(peer);
+        read_unlock (&kibnal_data.kib_global_lock);
+
+        return (peer);
+}
+
+void
+kibnal_unlink_peer_locked (kib_peer_t *peer)
+{
+        LASSERT (peer->ibp_persistence == 0);
+        LASSERT (list_empty(&peer->ibp_conns));
+
+        LASSERT (kibnal_peer_active(peer));
+        list_del_init (&peer->ibp_list);
+        /* lose peerlist's ref */
+        kib_peer_decref(peer);
+}
+
+static int
+kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
+{
+        kib_peer_t        *peer;
+        struct list_head  *ptmp;
+        int                i;
+
+        read_lock (&kibnal_data.kib_global_lock);
+
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+
+                list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
+
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
+                        LASSERT (peer->ibp_persistence != 0 ||
+                                 peer->ibp_connecting != 0 ||
+                                 !list_empty (&peer->ibp_conns));
+
+                        if (index-- > 0)
+                                continue;
+
+                        *nidp = peer->ibp_nid;
+                        *persistencep = peer->ibp_persistence;
+
+                        read_unlock (&kibnal_data.kib_global_lock);
+                        return (0);
+                }
+        }
+
+        read_unlock (&kibnal_data.kib_global_lock);
+        return (-ENOENT);
+}
+
+static int
+kibnal_add_persistent_peer (ptl_nid_t nid)
+{
+        unsigned long      flags;
+        kib_peer_t        *peer;
+        kib_peer_t        *peer2;
+        
+        if (nid == PTL_NID_ANY)
+                return (-EINVAL);
+
+        peer = kibnal_create_peer (nid);
+        if (peer == NULL)
+                return (-ENOMEM);
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        peer2 = kibnal_find_peer_locked (nid);
+        if (peer2 != NULL) {
+                kib_peer_decref (peer);
+                peer = peer2;
+        } else {
+                /* peer table takes existing ref on peer */
+                list_add_tail (&peer->ibp_list,
+                               kibnal_nid2peerlist (nid));
+        }
+
+        peer->ibp_persistence++;
+        
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+        return (0);
+}
+
+static void
+kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
+{
+        struct list_head *ctmp;
+        struct list_head *cnxt;
+        kib_conn_t       *conn;
+
+        if (!single_share)
+                peer->ibp_persistence = 0;
+        else if (peer->ibp_persistence > 0)
+                peer->ibp_persistence--;
+
+        if (peer->ibp_persistence != 0)
+                return;
+
+        list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+                conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+                kibnal_close_conn_locked (conn, 0);
+        }
+
+        /* NB peer unlinks itself when last conn is closed */
+}
+
+int
+kibnal_del_peer (ptl_nid_t nid, int single_share)
+{
+        unsigned long      flags;
+        struct list_head  *ptmp;
+        struct list_head  *pnxt;
+        kib_peer_t        *peer;
+        int                lo;
+        int                hi;
+        int                i;
+        int                rc = -ENOENT;
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        if (nid != PTL_NID_ANY)
+                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
+        else {
+                lo = 0;
+                hi = kibnal_data.kib_peer_hash_size - 1;
+        }
+
+        for (i = lo; i <= hi; i++) {
+                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
+                        LASSERT (peer->ibp_persistence != 0 ||
+                                 peer->ibp_connecting != 0 ||
+                                 !list_empty (&peer->ibp_conns));
+
+                        if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
+                                continue;
+
+                        kibnal_del_peer_locked (peer, single_share);
+                        rc = 0;         /* matched something */
+
+                        if (single_share)
+                                goto out;
+                }
+        }
+ out:
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+        return (rc);
+}
+
+static kib_conn_t *
+kibnal_get_conn_by_idx (int index)
+{
+        kib_peer_t        *peer;
+        struct list_head  *ptmp;
+        kib_conn_t        *conn;
+        struct list_head  *ctmp;
+        int                i;
+
+        read_lock (&kibnal_data.kib_global_lock);
+
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+                list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
+
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
+                        LASSERT (peer->ibp_persistence > 0 ||
+                                 peer->ibp_connecting != 0 ||
+                                 !list_empty (&peer->ibp_conns));
+
+                        list_for_each (ctmp, &peer->ibp_conns) {
+                                if (index-- > 0)
+                                        continue;
+
+                                conn = list_entry (ctmp, kib_conn_t, ibc_list);
+                                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                                       atomic_read (&conn->ibc_refcount));
+                                atomic_inc (&conn->ibc_refcount);
+                                read_unlock (&kibnal_data.kib_global_lock);
+                                return (conn);
+                        }
+                }
+        }
+
+        read_unlock (&kibnal_data.kib_global_lock);
+        return (NULL);
+}
+
+kib_conn_t *
+kibnal_create_conn (void)
+{
+        kib_conn_t  *conn;
+        int          i;
+        __u64        vaddr = 0;
+        __u64        vaddr_base;
+        int          page_offset;
+        int          ipage;
+        vv_qp_attr_t qp_attr;
+        vv_return_t  retval;
+        int          rc;
+        void        *qp_context;
+        
+        PORTAL_ALLOC(conn, sizeof (*conn));
+        if (conn == NULL) {
+                CERROR ("Can't allocate connection\n");
+                return (NULL);
+        }
+
+        /* zero flags, NULL pointers etc... */
+        memset (conn, 0, sizeof (*conn));
+
+        INIT_LIST_HEAD (&conn->ibc_tx_queue);
+        INIT_LIST_HEAD (&conn->ibc_active_txs);
+        spin_lock_init (&conn->ibc_lock);
+        
+        atomic_inc (&kibnal_data.kib_nconns);
+        /* well not really, but I call destroy() on failure, which decrements */
+
+        PORTAL_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
+        if (conn->ibc_rxs == NULL) {
+                CERROR("Cannot allocate RX buffers\n");
+                goto failed;
+        }
+        memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
+
+        rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1);
+        if (rc != 0)
+                goto failed;
+
+        vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
+
+        for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
+                struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
+                kib_rx_t   *rx = &conn->ibc_rxs[i];
+
+                rx->rx_conn = conn;
+                rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
+                             page_offset);
+
+                if (kibnal_whole_mem()) {
+                        void *newaddr;
+                        vv_mem_reg_h_t mem_h;
+                        vv_r_key_t r_key;
+
+                        /* Voltaire stack already registers the whole
+                         * memory, so use that API. */
+                        retval = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
+                                                      rx->rx_msg,
+                                                      IBNAL_MSG_SIZE,
+                                                      &mem_h,
+                                                      &rx->l_key,
+                                                      &r_key);
+                        if (retval) {
+                                CERROR("vv_get_gen_mr_attrib failed: %d", retval);
+                                /* TODO: free pages? */
+                                goto failed;
+                        }
+                }
+                
+                vaddr += IBNAL_MSG_SIZE;
+                LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
+                
+                page_offset += IBNAL_MSG_SIZE;
+                LASSERT (page_offset <= PAGE_SIZE);
+
+                if (page_offset == PAGE_SIZE) {
+                        page_offset = 0;
+                        ipage++;
+                        LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
+                }
+        }
+
+        qp_attr = (vv_qp_attr_t) {
+                .create.qp_type          = vv_qp_type_r_conn,
+                .create.cq_send_h        = kibnal_data.kib_cq,
+                .create.cq_receive_h     = kibnal_data.kib_cq,
+                .create.send_max_outstand_wr = IBNAL_TX_MAX_SG * 
+                                           IBNAL_MSG_QUEUE_SIZE,
+                .create.receive_max_outstand_wr = IBNAL_MSG_QUEUE_SIZE,
+                .create.max_scatgat_per_send_wr = 1,
+                .create.max_scatgat_per_receive_wr = 1,
+                .create.signaling_type   = vv_selectable_signaling, /* TODO: correct? */
+                .create.pd_h             = kibnal_data.kib_pd,
+                .create.recv_solicited_events = vv_signal_all,
+        };
+        retval = vv_qp_create(kibnal_data.kib_hca, &qp_attr, NULL,
+                              &conn->ibc_qp, &conn->ibc_qp_attrs);
+        if (retval != 0) {
+                CERROR ("Failed to create queue pair: %d\n", retval);
+                goto failed;
+        }
+
+        /* Mark QP created */
+        conn->ibc_state = IBNAL_CONN_INIT_QP;
+
+        qp_attr = (vv_qp_attr_t) {
+                .modify.qp_modify_into_state = vv_qp_state_init,
+                .modify.vv_qp_attr_mask      = VV_QP_AT_STATE | VV_QP_AT_PHY_PORT_NUM | VV_QP_AT_P_KEY_IX | VV_QP_AT_ACCESS_CON_F,
+                .modify.qp_type              = vv_qp_type_r_conn,
+
+                .modify.params.init.p_key_indx      = 0,
+                .modify.params.init.phy_port_num    = kibnal_data.kib_port,
+                .modify.params.init.access_control  = vv_acc_r_mem_write | vv_acc_r_mem_read,
+        };
+        retval = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &qp_attr, &conn->ibc_qp_attrs);
+        if (retval != 0) {
+                CERROR ("Failed to modify queue pair: %d\n", retval);
+                goto failed;
+        }
+
+        retval = vv_qp_query(kibnal_data.kib_hca, conn->ibc_qp, &qp_context, &conn->ibc_qp_attrs);
+        if (retval) {
+                CERROR ("Failed to query queue pair: %d\n", retval);
+                goto failed;
+        }
+
+        /* 1 ref for caller */
+        atomic_set (&conn->ibc_refcount, 1);
+        return (conn);
+        
+ failed:
+        kibnal_destroy_conn (conn);
+        return (NULL);
+}
+
+void
+kibnal_destroy_conn (kib_conn_t *conn)
+{
+        vv_return_t retval;
+        
+        CDEBUG (D_NET, "connection %p\n", conn);
+
+        LASSERT (atomic_read (&conn->ibc_refcount) == 0);
+        LASSERT (list_empty(&conn->ibc_tx_queue));
+        LASSERT (list_empty(&conn->ibc_active_txs));
+        LASSERT (conn->ibc_nsends_posted == 0);
+        LASSERT (conn->ibc_connreq == NULL);
+
+        switch (conn->ibc_state) {
+        case IBNAL_CONN_DISCONNECTED:
+                /* called after connection sequence initiated */
+                /* fall through */
+
+        case IBNAL_CONN_INIT_QP:
+                /* _destroy includes an implicit Reset of the QP which 
+                 * discards posted work */
+                retval = vv_qp_destroy(kibnal_data.kib_hca, conn->ibc_qp);
+                if (retval)
+                        CERROR("Can't destroy QP: %d\n", retval);
+                /* fall through */
+                
+        case IBNAL_CONN_INIT_NOTHING:
+                break;
+
+        default:
+                LASSERT (0);
+        }
+
+        if (conn->ibc_cep != NULL) {
+                retval = cm_destroy_cep(conn->ibc_cep);
+                if (retval)
+                        CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep, 
+                               retval);
+        }
+
+        if (conn->ibc_rx_pages != NULL) 
+                kibnal_free_pages(conn->ibc_rx_pages);
+        
+        if (conn->ibc_rxs != NULL)
+                PORTAL_FREE(conn->ibc_rxs, 
+                            IBNAL_RX_MSGS * sizeof(kib_rx_t));
+
+        if (conn->ibc_peer != NULL)
+                kib_peer_decref(conn->ibc_peer);
+
+        PORTAL_FREE(conn, sizeof (*conn));
+
+        atomic_dec(&kibnal_data.kib_nconns);
+        
+        if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
+            kibnal_data.kib_shutdown) {
+                /* I just nuked the last connection on shutdown; wake up
+                 * everyone so they can exit. */
+                wake_up_all(&kibnal_data.kib_sched_waitq);
+                wake_up_all(&kibnal_data.kib_connd_waitq);
+        }
+}
+
+void
+kibnal_put_conn (kib_conn_t *conn)
+{
+        unsigned long flags;
+
+        CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n",
+                conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                atomic_read (&conn->ibc_refcount));
+
+        LASSERT (atomic_read (&conn->ibc_refcount) > 0);
+        if (!atomic_dec_and_test (&conn->ibc_refcount))
+                return;
+
+        /* must disconnect before dropping the final ref */
+        LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED);
+
+        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+
+        list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+        wake_up (&kibnal_data.kib_connd_waitq);
+
+        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+}
+
+static int
+kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
+{
+        kib_conn_t         *conn;
+        struct list_head   *ctmp;
+        struct list_head   *cnxt;
+        int                 count = 0;
+
+        list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+                conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+                count++;
+                kibnal_close_conn_locked (conn, why);
+        }
+
+        return (count);
+}
+
+int
+kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
+{
+        kib_conn_t         *conn;
+        struct list_head   *ctmp;
+        struct list_head   *cnxt;
+        int                 count = 0;
+
+        list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+                conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+                if (conn->ibc_incarnation == incarnation)
+                        continue;
+
+                CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
+                       peer->ibp_nid, conn->ibc_incarnation, incarnation);
+                
+                count++;
+                kibnal_close_conn_locked (conn, -ESTALE);
+        }
+
+        return (count);
+}
+
+static int
+kibnal_close_matching_conns (ptl_nid_t nid)
+{
+        unsigned long       flags;
+        kib_peer_t         *peer;
+        struct list_head   *ptmp;
+        struct list_head   *pnxt;
+        int                 lo;
+        int                 hi;
+        int                 i;
+        int                 count = 0;
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        if (nid != PTL_NID_ANY)
+                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
+        else {
+                lo = 0;
+                hi = kibnal_data.kib_peer_hash_size - 1;
+        }
+
+        for (i = lo; i <= hi; i++) {
+                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
+
+                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
+                        LASSERT (peer->ibp_persistence != 0 ||
+                                 peer->ibp_connecting != 0 ||
+                                 !list_empty (&peer->ibp_conns));
+
+                        if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
+                                continue;
+
+                        count += kibnal_close_peer_conns_locked (peer, 0);
+                }
+        }
+
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+        /* wildcards always succeed */
+        if (nid == PTL_NID_ANY)
+                return (0);
+        
+        return (count == 0 ? -ENOENT : 0);
+}
+
+static int
+kibnal_cmd(struct portals_cfg *pcfg, void * private)
+{
+        int rc = -EINVAL;
+        ENTRY;
+
+        LASSERT (pcfg != NULL);
+
+        switch(pcfg->pcfg_command) {
+        case NAL_CMD_GET_PEER: {
+                ptl_nid_t   nid = 0;
+                int         share_count = 0;
+
+                rc = kibnal_get_peer_info(pcfg->pcfg_count,
+                                          &nid, &share_count);
+                pcfg->pcfg_nid   = nid;
+                pcfg->pcfg_size  = 0;
+                pcfg->pcfg_id    = 0;
+                pcfg->pcfg_misc  = 0;
+                pcfg->pcfg_count = 0;
+                pcfg->pcfg_wait  = share_count;
+                break;
+        }
+        case NAL_CMD_ADD_PEER: {
+                rc = kibnal_add_persistent_peer (pcfg->pcfg_nid);
+                break;
+        }
+        case NAL_CMD_DEL_PEER: {
+                rc = kibnal_del_peer (pcfg->pcfg_nid, 
+                                       /* flags == single_share */
+                                       pcfg->pcfg_flags != 0);
+                break;
+        }
+        case NAL_CMD_GET_CONN: {
+                kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
+
+                if (conn == NULL)
+                        rc = -ENOENT;
+                else {
+                        rc = 0;
+                        pcfg->pcfg_nid   = conn->ibc_peer->ibp_nid;
+                        pcfg->pcfg_id    = 0;
+                        pcfg->pcfg_misc  = 0;
+                        pcfg->pcfg_flags = 0;
+                        kibnal_put_conn (conn);
+                }
+                break;
+        }
+        case NAL_CMD_CLOSE_CONNECTION: {
+                rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
+                break;
+        }
+        case NAL_CMD_REGISTER_MYNID: {
+                if (pcfg->pcfg_nid == PTL_NID_ANY)
+                        rc = -EINVAL;
+                else
+                        rc = kibnal_set_mynid (pcfg->pcfg_nid);
+                break;
+        }
+        }
+
+        RETURN(rc);
+}
+
+void
+kibnal_free_pages (kib_pages_t *p)
+{
+        int     npages = p->ibp_npages;
+        vv_return_t retval;
+        int     i;
+        
+        if (p->ibp_mapped) {
+                retval = vv_mem_region_destroy(kibnal_data.kib_hca, p->ibp_handle);
+                if (retval != 0)
+                        CERROR ("Deregister error: %d\n", retval);
+        }
+        
+        for (i = 0; i < npages; i++)
+                if (p->ibp_pages[i] != NULL)
+                        __free_page(p->ibp_pages[i]);
+        
+        PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
+}
+
+int
+kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
+{
+        kib_pages_t   *p;
+        vv_phy_list_t  phys_pages;
+        vv_phy_buf_t  *phys_buf;
+        int            i;
+        vv_return_t    retval;
+
+        PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
+        if (p == NULL) {
+                CERROR ("Can't allocate buffer %d\n", npages);
+                return (-ENOMEM);
+        }
+
+        memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
+        p->ibp_npages = npages;
+        
+        for (i = 0; i < npages; i++) {
+                p->ibp_pages[i] = alloc_page (GFP_KERNEL);
+                if (p->ibp_pages[i] == NULL) {
+                        CERROR ("Can't allocate page %d of %d\n", i, npages);
+                        kibnal_free_pages(p);
+                        return (-ENOMEM);
+                }
+        }
+
+        if (kibnal_whole_mem())
+                goto out;
+
+        PORTAL_ALLOC(phys_buf, npages * sizeof(vv_phy_buf_t));
+        if (phys_buf == NULL) {
+                CERROR ("Can't allocate phys_buf for %d pages\n", npages);
+                /* XXX free ibp_pages? */
+                kibnal_free_pages(p);
+                return (-ENOMEM);
+        }
+
+        phys_pages.number_of_buff = npages;
+        phys_pages.phy_list = phys_buf;
+
+        /* if we were using the _contig_ registration variant we would have
+         * an array of PhysAddr/Length pairs, but the discontiguous variant
+         * just takes the PhysAddr */
+        for (i = 0; i < npages; i++) {
+                phys_buf[i].start = kibnal_page2phys(p->ibp_pages[i]);
+                phys_buf[i].size = PAGE_SIZE;
+        }
+
+        retval = vv_phy_mem_region_register(kibnal_data.kib_hca,
+                                            &phys_pages,
+                                            0, /* requested vaddr */
+                                            npages * PAGE_SIZE,
+                                            0, /* offset */
+                                            kibnal_data.kib_pd,
+                                            vv_acc_l_mem_write | vv_acc_r_mem_write | vv_acc_r_mem_read | vv_acc_mem_bind, /* TODO: translated as-is, but seems incorrect or too much */
+                                            &p->ibp_handle, &p->ibp_vaddr,                                           
+                                            &p->ibp_lkey, &p->ibp_rkey);
+        
+        PORTAL_FREE(phys_buf, npages * sizeof(vv_phy_buf_t));
+        
+        if (retval) {
+                CERROR ("Error %d mapping %d pages\n", retval, npages);
+                kibnal_free_pages(p);
+                return (-ENOMEM);
+        }
+
+        CDEBUG(D_NET, "registered %d pages; handle: %x vaddr "LPX64" "
+                      "lkey %x rkey %x\n", npages, p->ibp_handle,
+                      p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
+        
+        p->ibp_mapped = 1;
+out:
+        *pp = p;
+        return (0);
+}
+
+static int
+kibnal_setup_tx_descs (void)
+{
+        int           ipage = 0;
+        int           page_offset = 0;
+        __u64         vaddr;
+        __u64         vaddr_base;
+        struct page  *page;
+        kib_tx_t     *tx;
+        int           i;
+        int           rc;
+
+        /* pre-mapped messages are not bigger than 1 page */
+        LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
+
+        /* No fancy arithmetic when we do the buffer calculations */
+        LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
+
+        rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, 
+                                0);
+        if (rc != 0)
+                return (rc);
+
+        /* ignored for the whole_mem case */
+        vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
+
+        for (i = 0; i < IBNAL_TX_MSGS; i++) {
+                page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
+                tx = &kibnal_data.kib_tx_descs[i];
+
+                memset (tx, 0, sizeof(*tx));    /* zero flags etc */
+                
+                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
+                                           page_offset);
+
+                if (kibnal_whole_mem()) {
+                        void *newaddr;
+                        vv_mem_reg_h_t mem_h;
+                        vv_return_t  retval;
+
+                        /* Voltaire stack already registers the whole
+                         * memory, so use that API. */
+                        retval = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
+                                                      tx->tx_msg,
+                                                      IBNAL_MSG_SIZE,
+                                                      &mem_h,
+                                                      &tx->l_key,
+                                                      &tx->r_key);
+                        if (retval) {
+                                CERROR("vv_get_gen_mr_attrib failed: %d", retval);
+                                /* TODO: free pages? */
+                                /* TODO: return. */
+                        }
+                }
+
+                tx->tx_isnblk = (i >= IBNAL_NTX);
+                tx->tx_mapped = KIB_TX_UNMAPPED;
+
+                CDEBUG(D_NET, "Tx[%d] %p->%p\n", i, tx, tx->tx_msg);
+
+                if (tx->tx_isnblk)
+                        list_add (&tx->tx_list, 
+                                  &kibnal_data.kib_idle_nblk_txs);
+                else
+                        list_add (&tx->tx_list, 
+                                  &kibnal_data.kib_idle_txs);
+
+                vaddr += IBNAL_MSG_SIZE;
+                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
+
+                page_offset += IBNAL_MSG_SIZE;
+                LASSERT (page_offset <= PAGE_SIZE);
+
+                if (page_offset == PAGE_SIZE) {
+                        page_offset = 0;
+                        ipage++;
+                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
+                }
+        }
+        
+        return (0);
+}
+
+static void
+kibnal_api_shutdown (nal_t *nal)
+{
+        int   i;
+        int   rc;
+        vv_return_t retval;
+
+        if (nal->nal_refct != 0) {
+                /* This module got the first ref */
+                PORTAL_MODULE_UNUSE;
+                return;
+        }
+
+        CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+
+        LASSERT(nal == &kibnal_api);
+
+        switch (kibnal_data.kib_init) {
+
+        case IBNAL_INIT_ALL:
+                /* stop calls to nal_cmd */
+                libcfs_nal_cmd_unregister(VIBNAL);
+                /* No new peers */
+
+                /* resetting my NID unadvertises me, removes my
+                 * listener and nukes all current peers */
+                kibnal_set_mynid (PTL_NID_ANY);
+
+                /* Wait for all peer state to clean up (crazy) */
+                i = 2;
+                while (atomic_read (&kibnal_data.kib_npeers) != 0) {
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                               "waiting for %d peers to disconnect (can take a few seconds)\n",
+                               atomic_read (&kibnal_data.kib_npeers));
+                        set_current_state (TASK_UNINTERRUPTIBLE);
+                        schedule_timeout (HZ);
+                }
+                /* fall through */
+
+        case IBNAL_INIT_CQ:
+                retval = vv_cq_destroy(kibnal_data.kib_hca, kibnal_data.kib_cq);
+                if (retval)
+                        CERROR ("Destroy CQ error: %d\n", retval);
+                /* fall through */
+
+        case IBNAL_INIT_TXD:
+                kibnal_free_pages (kibnal_data.kib_tx_pages);
+                /* fall through */
+
+#if IBNAL_FMR
+        case IBNAL_INIT_FMR:
+                rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
+                if (rc != 0)
+                        CERROR ("Destroy FMR pool error: %d\n", rc);
+                /* fall through */
+#endif
+        case IBNAL_INIT_PD:
+#if IBNAL_WHOLE_MEM==0
+                retval = vv_pd_deallocate(kibnal_data.kib_hca, kibnal_data.kib_pd);
+                if (retval != 0)
+                        CERROR ("Destroy PD error: %d\n", retval);
+#endif
+                /* fall through */
+
+        case IBNAL_INIT_GSI:
+                retval = gsi_deregister_class(kibnal_data.gsi_handle);
+                if (retval != 0)
+                        CERROR ("GSI deregister failed: %d\n", retval);
+                /* fall through */
+
+        case IBNAL_INIT_GSI_POOL:
+                gsi_dtgrm_pool_destroy(kibnal_data.gsi_pool_handle);
+                /* fall through */
+
+        case IBNAL_INIT_PORT:
+                /* XXX ??? */
+                /* fall through */
+
+        case IBNAL_INIT_ASYNC:
+                retval = vv_dell_async_event_cb (kibnal_data.kib_hca,
+                                                 kibnal_ca_async_callback);
+                if (retval)
+                        CERROR("deregister asynchronous callback error: %d\n", retval);
+                        
+                /* fall through */
+
+        case IBNAL_INIT_HCA:
+                retval = vv_hca_close(kibnal_data.kib_hca);
+                if (retval != 0)
+                        CERROR ("Close HCA error: %d\n", retval);
+                /* fall through */
+
+        case IBNAL_INIT_LIB:
+                lib_fini(&kibnal_lib);
+                /* fall through */
+
+        case IBNAL_INIT_DATA:
+                /* Module refcount only gets to zero when all peers
+                 * have been closed so all lists must be empty */
+                LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
+                LASSERT (kibnal_data.kib_peers != NULL);
+                for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+                        LASSERT (list_empty (&kibnal_data.kib_peers[i]));
+                }
+                LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
+                LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
+                LASSERT (list_empty (&kibnal_data.kib_sched_txq));
+                LASSERT (list_empty (&kibnal_data.kib_connd_conns));
+                LASSERT (list_empty (&kibnal_data.kib_connd_peers));
+
+                /* flag threads to terminate; wake and wait for them to die */
+                kibnal_data.kib_shutdown = 1;
+                wake_up_all (&kibnal_data.kib_sched_waitq);
+                wake_up_all (&kibnal_data.kib_connd_waitq);
+
+                i = 2;
+                while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                               "Waiting for %d threads to terminate\n",
+                               atomic_read (&kibnal_data.kib_nthreads));
+                        set_current_state (TASK_INTERRUPTIBLE);
+                        schedule_timeout (HZ);
+                }
+                /* fall through */
+                
+        case IBNAL_INIT_NOTHING:
+                break;
+        }
+
+        if (kibnal_data.kib_tx_descs != NULL)
+                PORTAL_FREE (kibnal_data.kib_tx_descs,
+                             IBNAL_TX_MSGS * sizeof(kib_tx_t));
+
+        if (kibnal_data.kib_peers != NULL)
+                PORTAL_FREE (kibnal_data.kib_peers,
+                             sizeof (struct list_head) * 
+                             kibnal_data.kib_peer_hash_size);
+
+        CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+        printk(KERN_INFO "Lustre: Voltaire IB NAL unloaded (final mem %d)\n",
+               atomic_read(&portal_kmemory));
+
+        kibnal_data.kib_init = IBNAL_INIT_NOTHING;
+}
+
+#define roundup_power(val, power) \
+        ( (val + (__u64)(power - 1)) & ~((__u64)(power - 1)) )
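+/* e.g. roundup_power(130 << 20, 128 << 20) == 256MB; any remainder rounds
+ * the value up to the next multiple of 'power' (which must be a power of 2) */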
+
+/* this isn't very portable or sturdy in the face of funny mem/bus configs */
+static __u64 max_phys_mem(void)
+{
+        struct sysinfo si;
+        __u64 ret;
+
+        si_meminfo(&si);
+        ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit;
+        return roundup_power(ret, 128 * 1024 * 1024);
+} 
+#undef roundup_power
+
+static int
+kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
+                     ptl_ni_limits_t *requested_limits,
+                     ptl_ni_limits_t *actual_limits)
+{
+        ptl_process_id_t    process_id;
+        int                 pkmem = atomic_read(&portal_kmemory);
+        int                 rc;
+        int                 i;
+        vv_request_event_record_t req_er;
+        vv_return_t         retval;
+
+        LASSERT (nal == &kibnal_api);
+
+        if (nal->nal_refct != 0) {
+                if (actual_limits != NULL)
+                        *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
+                /* This module got the first ref */
+                PORTAL_MODULE_USE;
+                return (PTL_OK);
+        }
+
+        LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
+
+        init_MUTEX (&kibnal_data.kib_nid_mutex);
+        kibnal_data.kib_nid = PTL_NID_ANY;
+
+        rwlock_init(&kibnal_data.kib_global_lock);
+
+        kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
+        PORTAL_ALLOC (kibnal_data.kib_peers,
+                      sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
+        if (kibnal_data.kib_peers == NULL) {
+                goto failed;
+        }
+        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
+                INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
+
+        spin_lock_init (&kibnal_data.kib_connd_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
+        INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
+        init_waitqueue_head (&kibnal_data.kib_connd_waitq);
+
+        spin_lock_init (&kibnal_data.kib_sched_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
+        INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
+        init_waitqueue_head (&kibnal_data.kib_sched_waitq);
+
+        spin_lock_init (&kibnal_data.kib_tx_lock);
+        INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
+        INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
+        init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
+
+        INIT_LIST_HEAD (&kibnal_data.gsi_pending);
+        init_MUTEX (&kibnal_data.gsi_mutex);
+
+        PORTAL_ALLOC (kibnal_data.kib_tx_descs,
+                      IBNAL_TX_MSGS * sizeof(kib_tx_t));
+        if (kibnal_data.kib_tx_descs == NULL) {
+                CERROR ("Can't allocate tx descs\n");
+                goto failed;
+        }
+
+        /* lists/ptrs/locks initialised */
+        kibnal_data.kib_init = IBNAL_INIT_DATA;
+        /*****************************************************/
+
+        process_id.pid = requested_pid;
+        process_id.nid = kibnal_data.kib_nid;
+        
+        rc = lib_init(&kibnal_lib, nal, process_id,
+                      requested_limits, actual_limits);
+        if (rc != PTL_OK) {
+                CERROR("lib_init failed: error %d\n", rc);
+                goto failed;
+        }
+
+        /* lib interface initialised */
+        kibnal_data.kib_init = IBNAL_INIT_LIB;
+        /*****************************************************/
+
+        for (i = 0; i < IBNAL_N_SCHED; i++) {
+                rc = kibnal_thread_start (kibnal_scheduler, (void *)i);
+                if (rc != 0) {
+                        CERROR("Can't spawn vibnal scheduler[%d]: %d\n",
+                               i, rc);
+                        goto failed;
+                }
+        }
+
+        rc = kibnal_thread_start (kibnal_connd, NULL);
+        if (rc != 0) {
+                CERROR ("Can't spawn vibnal connd: %d\n", rc);
+                goto failed;
+        }
+
+        /* TODO: apparently only one adapter is supported */
+        retval = vv_hca_open("ANY_HCA", NULL, &kibnal_data.kib_hca);
+        if (retval) {
+                CERROR ("Can't open CA: %d\n", retval);
+                goto failed;
+        }
+
+        /* Channel Adapter opened */
+        kibnal_data.kib_init = IBNAL_INIT_HCA;
+
+        /* register to get HCA's asynchronous events. */
+        req_er.req_event_type = VV_ASYNC_EVENT_ALL_MASK;
+        retval = vv_set_async_event_cb (kibnal_data.kib_hca,
+                                        req_er,
+                                        kibnal_ca_async_callback);
+
+        if (retval) {
+                CERROR ("Can't set async event callback: %d\n", retval);
+                goto failed; 
+        }
+
+        kibnal_data.kib_init = IBNAL_INIT_ASYNC;
+
+        /*****************************************************/
+
+        retval = vv_hca_query(kibnal_data.kib_hca,
+                             &kibnal_data.kib_hca_attrs);
+        if (retval) {
+                CERROR ("Can't query HCA attrs: %d\n", retval);
+                goto failed;
+        }
+
+        kibnal_data.kib_port = -1;
+
+        for (i = 0; i < kibnal_data.kib_hca_attrs.port_num; i++) {
+
+                int port_num = i+1;
+                u_int32_t tbl_count;
+                vv_port_attrib_t *pattr = &kibnal_data.kib_port_attr;
+
+                retval = vv_port_query(kibnal_data.kib_hca, port_num, pattr);
+                if (retval) {
+                        CERROR("vv_port_query failed for port %d: %d\n", port_num, retval);
+                        continue;
+                }
+
+                switch (pattr->port_state) {
+                case vv_state_linkDoun:
+                        CDEBUG(D_NET, "port[%d] Down\n", port_num);
+                        continue;
+                case vv_state_linkInit:
+                        CDEBUG(D_NET, "port[%d] Init\n", port_num);
+                        continue;
+                case vv_state_linkArm:
+                        CDEBUG(D_NET, "port[%d] Armed\n", port_num);
+                        continue;
+                case vv_state_linkActive:
+                        CDEBUG(D_NET, "port[%d] Active\n", port_num);
+
+                        /* Found a suitable port. Get its GUID and PKEY. */
+                        kibnal_data.kib_port = port_num;
+                        
+                        tbl_count = 1;
+                        retval = vv_get_port_gid_tbl(kibnal_data.kib_hca, port_num, &tbl_count, &kibnal_data.kib_port_gid);
+                        if (retval) {
+                                CERROR("vv_get_port_gid_tbl failed for port %d: %d\n", port_num, retval);
+                                continue;
+                        }
+
+                        tbl_count = 1;
+                        retval = vv_get_port_partition_tbl (kibnal_data.kib_hca, port_num, &tbl_count, &kibnal_data.kib_port_pkey);
+                        if (retval) {
+                                CERROR("vv_get_port_partition_tbl failed for port %d: %d\n", port_num, retval);
+                                continue;
+                        }
+
+                        break;
+                case vv_state_linkActDefer: /* TODO: correct? */
+                case vv_state_linkNoChange:
+                        CERROR("Unexpected port[%d] state %d\n",
+                               port_num, pattr->port_state);
+                        continue;
+                }
+                break;
+        }
+
+        if (kibnal_data.kib_port == -1) {
+                CERROR ("Can't find an active port\n");
+                goto failed;
+        }
+
+        CDEBUG(D_NET, "Using port %d - GID="LPX64":"LPX64"\n",
+               kibnal_data.kib_port, kibnal_data.kib_port_gid.scope.g.subnet, kibnal_data.kib_port_gid.scope.g.eui64);
+        CDEBUG(D_NET, "got guid "LPX64"\n", cpu_to_le64(kibnal_data.kib_port_gid.scope.g.eui64));
+        
+        /* Active port found */
+        kibnal_data.kib_init = IBNAL_INIT_PORT;
+        /*****************************************************/
+
+        /* Prepare things to be able to send/receive MADS */
+        retval = gsi_dtgrm_pool_create(IBNAL_CONCURRENT_PEERS, &kibnal_data.gsi_pool_handle);
+        if (retval) {
+                CERROR("Could not create GSI pool: %d\n", retval);
+                goto failed;
+        }
+        kibnal_data.kib_init = IBNAL_INIT_GSI_POOL;
+
+        retval = gsi_register_class(MAD_CLASS_SUBN_ADM, /* TODO: correct? */
+                                2,     /* version */
+                                "ANY_HCA",
+#ifdef GSI_PASS_PORT_NUM
+                                kibnal_data.kib_port,
+#endif                   
+                                0, 0,
+                                vibnal_mad_sent_cb,    vibnal_mad_received_cb,
+                                NULL, &kibnal_data.gsi_handle);
+        if (retval) {
+                CERROR("Cannot register GSI class: %d\n", retval);
+                goto failed;
+        }
+
+        kibnal_data.kib_init = IBNAL_INIT_GSI;
+        /*****************************************************/
+
+#if IBNAL_WHOLE_MEM==0
+        retval = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd);
+#else
+        retval = vv_get_gen_pd_h(kibnal_data.kib_hca, &kibnal_data.kib_pd);
+#endif
+        if (retval) {
+                CERROR ("Can't create PD: %d\n", retval);
+                goto failed;
+        }
+        
+        /* flag PD initialised */
+        kibnal_data.kib_init = IBNAL_INIT_PD;
+        /*****************************************************/
+
+#if IBNAL_FMR
+        {
+                const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
+                struct ib_fmr_pool_param params = {
+                        .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
+                        .access            = (IB_ACCESS_LOCAL_WRITE |
+                                              IB_ACCESS_REMOTE_WRITE |
+                                              IB_ACCESS_REMOTE_READ),
+                        .pool_size         = pool_size,
+                        .dirty_watermark   = (pool_size * 3)/4,
+                        .flush_function    = NULL,
+                        .flush_arg         = NULL,
+                        .cache             = 1,
+                };
+                rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params,
+                                        &kibnal_data.kib_fmr_pool);
+                if (rc != 0) {
+                        CERROR ("Can't create FMR pool size %d: %d\n", 
+                                pool_size, rc);
+                        goto failed;
+                }
+        }
+
+        /* flag FMR pool initialised */
+        kibnal_data.kib_init = IBNAL_INIT_FMR;
+#endif
+
+        /*****************************************************/
+
+        rc = kibnal_setup_tx_descs();
+        if (rc != 0) {
+                CERROR ("Can't register tx descs: %d\n", rc);
+                goto failed;
+        }
+        
+        /* flag TX descs initialised */
+        kibnal_data.kib_init = IBNAL_INIT_TXD;
+        /*****************************************************/
+        {
+                uint32_t nentries;
+
+                retval = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
+                                      kibnal_ca_callback, 
+                                      NULL, /* context */
+                                      &kibnal_data.kib_cq, &nentries);
+                if (retval) {
+                        CERROR ("Can't create RX CQ: %d\n", retval);
+                        goto failed;
+                }
+
+                /* flag CQ initialised */
+                kibnal_data.kib_init = IBNAL_INIT_CQ;
+
+                if (nentries < IBNAL_CQ_ENTRIES) {
+                        CERROR ("CQ only has %d entries, need %d\n", 
+                                nentries, IBNAL_CQ_ENTRIES);
+                        goto failed;
+                }
+
+                retval = vv_request_completion_notification(kibnal_data.kib_hca, kibnal_data.kib_cq, vv_next_solicit_unsolicit_event);
+                if (retval != 0) {
+                        CERROR ("Failed to re-arm completion queue: %d\n", retval);
+                        goto failed;
+                }
+        }
+        
+        /*****************************************************/
+
+        rc = libcfs_nal_cmd_register(VIBNAL, &kibnal_cmd, NULL);
+        if (rc != 0) {
+                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
+                goto failed;
+        }
+
+        /* flag everything initialised */
+        kibnal_data.kib_init = IBNAL_INIT_ALL;
+        /*****************************************************/
+
+        printk(KERN_INFO "Lustre: Voltaire IB NAL loaded "
+               "(initial mem %d)\n", pkmem);
+
+        return (PTL_OK);
+
+ failed:
+        CDEBUG(D_NET, "kibnal_api_startup failed\n");
+        kibnal_api_shutdown (&kibnal_api);    
+        return (PTL_FAIL);
+}
+
+void __exit
+kibnal_module_fini (void)
+{
+#ifdef CONFIG_SYSCTL
+        if (kibnal_tunables.kib_sysctl != NULL)
+                unregister_sysctl_table (kibnal_tunables.kib_sysctl);
+#endif
+        PtlNIFini(kibnal_ni);
+
+        ptl_unregister_nal(VIBNAL);
+}
+
+int __init
+kibnal_module_init (void)
+{
+        int    rc;
+
+        if (sizeof(kib_wire_connreq_t) > cm_REQ_priv_data_len) {
+                CERROR("sizeof(kib_wire_connreq_t) > cm_REQ_priv_data_len\n");
+                return -EINVAL;
+        }
+
+        /* the following must be sizeof(int) for proc_dointvec() */
+        if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) {
+                CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n");
+                return -EINVAL;
+        }
+
+        kibnal_api.nal_ni_init = kibnal_api_startup;
+        kibnal_api.nal_ni_fini = kibnal_api_shutdown;
+
+        /* Initialise dynamic tunables to defaults once only */
+        kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
+
+        rc = ptl_register_nal(VIBNAL, &kibnal_api);
+        if (rc != PTL_OK) {
+                CERROR("Can't register VIBNAL: %d\n", rc);
+                return (-ENOMEM);               /* or something... */
+        }
+
+        /* Pure gateways want the NAL started up at module load time... */
+        rc = PtlNIInit(VIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
+        if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
+                ptl_unregister_nal(VIBNAL);
+                return (-ENODEV);
+        }
+        
+#ifdef CONFIG_SYSCTL
+        /* Press on regardless even if registering sysctl doesn't work */
+        kibnal_tunables.kib_sysctl = 
+                register_sysctl_table (kibnal_top_ctl_table, 0);
+#endif
+        return (0);
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Kernel Voltaire IB NAL v0.01");
+MODULE_LICENSE("GPL");
+
+module_init(kibnal_module_init);
+module_exit(kibnal_module_fini);
+
diff --git a/lnet/klnds/viblnd/viblnd.h b/lnet/klnds/viblnd/viblnd.h
new file mode 100644 (file)
index 0000000..7866aba
--- /dev/null
@@ -0,0 +1,820 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *   Author: Frank Zago <fzago@systemfabricworks.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#define DEBUG_SUBSYSTEM S_IBNAL
+
+#define IBNAL_CHECK_ADVERT
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+#include <portals/nal.h>
+
+#include <vverbs.h>
+#include <sa-mads.h>
+#include <ib-cm.h>
+#include <gsi.h>
+
+#if 0
+#undef CDEBUG
+#define CDEBUG(mask, format, a...) printk(KERN_INFO "%s:%d - " format, __func__, __LINE__,##a)
+#endif
+
+#ifdef __CHECKER__
+#undef CDEBUG
+#undef CERROR
+#define CDEBUG(a...)
+#define CERROR(a...)
+#endif
+
+#define GCC_VERSION (__GNUC__ * 10000 \
+                + __GNUC_MINOR__ * 100 \
+                + __GNUC_PATCHLEVEL__)
+
+/* Test for GCC > 3.2.2 */
+#if GCC_VERSION <= 30202
+/* GCC 3.2.2, and presumably several versions before it, will
+ * miscompile this driver. See
+ * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9853. */
+#error Invalid GCC version. Must use GCC >= 3.2.3
+#endif
+
+#define IBNAL_SERVICE_NAME   "vibnal"
+#define IBNAL_SERVICE_NUMBER 0x11b9a2 /* TODO */
+
+#if CONFIG_SMP
+# define IBNAL_N_SCHED      num_online_cpus()   /* # schedulers */
+#else
+# define IBNAL_N_SCHED      1                   /* # schedulers */
+#endif
+
+#define IBNAL_MIN_RECONNECT_INTERVAL HZ         /* first failed connection retry... */
+#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ)    /* ...exponentially increasing to this */
+
+#define IBNAL_MSG_SIZE       (4<<10)            /* max size of queued messages (inc hdr) */
+
+#define IBNAL_MSG_QUEUE_SIZE   8                /* # messages/RDMAs in-flight */
+#define IBNAL_CREDIT_HIGHWATER 7                /* when to eagerly return credits */
+
+/* 7 indicates infinite retry attempts, Infinicon recommended 5 */
+#define IBNAL_RETRY            5                /* # times to retry */
+#define IBNAL_RNR_RETRY        5                /* # RNR (receiver-not-ready) retries */
+#define IBNAL_CM_RETRY         5                /* # times to retry connection */
+
+#define IBNAL_FLOW_CONTROL     1
+#define IBNAL_ACK_TIMEOUT       20              /* supposedly 4 secs */
+
+#define IBNAL_NTX             64                /* # tx descs */
+/* this had to be dropped down so that we only register < 255 pages per
+ * region.  this will change if we register all memory. */
+#define IBNAL_NTX_NBLK        128               /* # reserved tx descs */
+
+#define IBNAL_PEER_HASH_SIZE  101               /* # peer lists */
+
+#define IBNAL_RESCHED         100               /* # scheduler loops before reschedule */
+
+#define IBNAL_CONCURRENT_PEERS 1000             /* # nodes all talking at once to me */
+
+/* default vals for runtime tunables */
+#define IBNAL_IO_TIMEOUT      50                /* default comms timeout (seconds) */
+
+/************************/
+/* derived constants... */
+
+/* TX messages (shared by all connections) */
+#define IBNAL_TX_MSGS       (IBNAL_NTX + IBNAL_NTX_NBLK)
+#define IBNAL_TX_MSG_BYTES  (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_TX_MSG_PAGES  ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+
+#define IBNAL_TX_MAX_SG (PTL_MD_MAX_IOV + 1)
+
+/* RX messages (per connection) */
+#define IBNAL_RX_MSGS       IBNAL_MSG_QUEUE_SIZE
+#define IBNAL_RX_MSG_BYTES  (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_RX_MSG_PAGES  ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+
+
+/* we may have up to 2 completions per transmit +
+   1 completion per receive, per connection */
+#define IBNAL_CQ_ENTRIES  ((2*IBNAL_TX_MSGS) +                          \
+                           (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS))
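+/* For illustration, with the defaults above and 4K pages (PAGE_SIZE is
+ * architecture dependent, so these numbers are only an example):
+ *   IBNAL_TX_MSGS      = 64 + 128        = 192
+ *   IBNAL_TX_MSG_BYTES = 192 * 4K        = 768K
+ *   IBNAL_TX_MSG_PAGES = 768K / 4K       = 192
+ *   IBNAL_CQ_ENTRIES   = 2*192 + 8*1000  = 8384 */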
+
+#define IBNAL_RDMA_BASE  0x0eeb0000
+#define IBNAL_FMR        0
+#define IBNAL_WHOLE_MEM  1
+#define IBNAL_CKSUM      0
+
+/* Starting sequence number. */
+#define IBNAL_STARTING_PSN 0x465A
+
+/* Timeout for SA requests, in seconds */
+#define GSI_TIMEOUT 5
+#define GSI_RETRY 10
+
+typedef struct 
+{
+        int               kib_io_timeout;       /* comms timeout (seconds) */
+        struct ctl_table_header *kib_sysctl;    /* sysctl interface */
+} kib_tunables_t;
+
+/* some of these have specific types in the stack that just map back
+ * to the uFOO types, like IB_{L,R}_KEY. */
+typedef struct
+{
+        int               ibp_npages;           /* # pages */
+        int               ibp_mapped;           /* mapped? */
+        __u64             ibp_vaddr;            /* mapped region vaddr */
+        __u32             ibp_lkey;             /* mapped region lkey */
+        __u32             ibp_rkey;             /* mapped region rkey */
+        vv_mem_reg_h_t    ibp_handle;           /* mapped region handle */
+        struct page      *ibp_pages[0];
+} kib_pages_t;
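+/* ibp_pages[0] is a variable-length trailer: the struct is presumably
+ * allocated by kibnal_alloc_pages() with ibp_npages page pointers following */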
+
+typedef struct
+{
+        vv_mem_reg_h_t    md_handle;
+        __u32             md_lkey;
+        __u32             md_rkey;
+        __u64             md_addr;
+} kib_md_t __attribute__((packed));
+        
+typedef struct 
+{
+        /* initialisation state. These values are sorted by their initialization order. */
+        enum {
+                IBNAL_INIT_NOTHING,
+                IBNAL_INIT_DATA,
+                IBNAL_INIT_LIB,
+                IBNAL_INIT_HCA,
+                IBNAL_INIT_ASYNC,
+                IBNAL_INIT_PORT,
+                IBNAL_INIT_GSI_POOL,
+                IBNAL_INIT_GSI,
+                IBNAL_INIT_PD,
+#if IBNAL_FMR
+                IBNAL_INIT_FMR,
+#endif
+                IBNAL_INIT_TXD,
+                IBNAL_INIT_CQ,
+                IBNAL_INIT_ALL,
+        } kib_init;
+
+        __u64             kib_incarnation;      /* which one am I */
+        int               kib_shutdown;         /* shut down? */
+        atomic_t          kib_nthreads;         /* # live threads */
+
+        __u64             kib_service_id;       /* service number I listen on */
+        vv_gid_t          kib_port_gid;         /* port GID in HOST ORDER! */
+        vv_p_key_t        kib_port_pkey;        /* my pkey */
+        ptl_nid_t         kib_nid;              /* my NID */
+        struct semaphore  kib_nid_mutex;        /* serialise NID ops */
+        cm_cep_handle_t   kib_cep;              /* connection end point */
+
+        rwlock_t          kib_global_lock;      /* stabilize peer/conn ops */
+
+        struct list_head *kib_peers;            /* hash table of all my known peers */
+        int               kib_peer_hash_size;   /* size of kib_peers */
+        atomic_t          kib_npeers;           /* # peers extant */
+        atomic_t          kib_nconns;           /* # connections extant */
+
+        struct list_head  kib_connd_conns;      /* connections to progress */
+        struct list_head  kib_connd_peers;      /* peers waiting for a connection */
+        wait_queue_head_t kib_connd_waitq;      /* connection daemons sleep here */
+        unsigned long     kib_connd_waketime;   /* when connd will wake */
+        spinlock_t        kib_connd_lock;       /* serialise */
+
+        wait_queue_head_t kib_sched_waitq;      /* schedulers sleep here */
+        struct list_head  kib_sched_txq;        /* tx requiring attention */
+        struct list_head  kib_sched_rxq;        /* rx requiring attention */
+        spinlock_t        kib_sched_lock;       /* serialise */
+        
+        struct kib_tx    *kib_tx_descs;         /* all the tx descriptors */
+        kib_pages_t      *kib_tx_pages;         /* premapped tx msg pages */
+
+        struct list_head  kib_idle_txs;         /* idle tx descriptors */
+        struct list_head  kib_idle_nblk_txs;    /* idle reserved tx descriptors */
+        wait_queue_head_t kib_idle_tx_waitq;    /* block here for tx descriptor */
+        __u64             kib_next_tx_cookie;   /* RDMA completion cookie */
+        spinlock_t        kib_tx_lock;          /* serialise */
+        
+        vv_hca_h_t        kib_hca;              /* The HCA */
+        vv_hca_attrib_t   kib_hca_attrs;      /* HCA attributes */
+
+        int               kib_port;             /* port on the device */
+        vv_port_attrib_t  kib_port_attr;      /* port attributes */
+
+        vv_pd_h_t         kib_pd;               /* protection domain */
+        vv_cq_h_t         kib_cq;               /* completion queue */
+
+        void             *kib_listen_handle;    /* where I listen for connections */
+
+        /* These fields are left untouched, so they can be shared. */
+        union { 
+                cm_drequest_data_t dreq_data;
+                cm_dreply_data_t   drep_data;
+        } cm_data;
+
+        /* Send and receive MADs (service records, path records) */
+        gsi_class_handle_t      gsi_handle;
+        gsi_dtgrm_pool_handle_t gsi_pool_handle;
+        struct semaphore gsi_mutex; /* protect GSI list - TODO:spinlock instead? */
+        struct list_head gsi_pending; /* pending GSI datagrams */
+
+} kib_data_t;
+
+/************************************************************************
+ * Wire message structs.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ * CAVEAT EMPTOR: other structs communicated between nodes (e.g. MAD
+ * private data and SM service info) are LE on the wire.
+ */
+
+/* also kib_md_t above */
+
+typedef struct
+{
+        __u32                 rd_nob;           /* # of bytes */
+        __u64                 rd_addr;          /* remote io vaddr */
+} kib_rdma_desc_t __attribute__((packed));
+
+typedef struct
+{
+        ptl_hdr_t         ibim_hdr;             /* portals header */
+        char              ibim_payload[0];      /* piggy-backed payload */
+} kib_immediate_msg_t __attribute__((packed));
+
+/* these arrays serve two purposes during rdma.  they are built on the passive
+ * side and sent to the active side as remote arguments.  On the active side
+ * the descs are used as a data structure on the way to local gather items. 
+ * the different roles result in split local/remote meaning of desc->rd_key */
+typedef struct
+{
+        ptl_hdr_t         ibrm_hdr;             /* portals header */
+        __u64             ibrm_cookie;          /* opaque completion cookie */
+        __u32             ibrm_num_descs;       /* how many descs */
+        __u32             rd_key;               /* remote key */
+        kib_rdma_desc_t   ibrm_desc[0];         /* where to suck/blow */
+} kib_rdma_msg_t __attribute__((packed));
+
+#define kib_rdma_msg_len(num_descs) \
+        offsetof(kib_msg_t, ibm_u.rdma.ibrm_desc[num_descs])
+
+typedef struct
+{
+        __u64             ibcm_cookie;          /* opaque completion cookie */
+        __u32             ibcm_status;          /* completion status */
+} kib_completion_msg_t __attribute__((packed));
+
+typedef struct
+{
+        __u32              ibm_magic;           /* I'm an openibnal message */
+        __u16              ibm_version;         /* this is my version number */
+        __u8               ibm_type;            /* msg type */
+        __u8               ibm_credits;         /* returned credits */
+#if IBNAL_CKSUM
+        __u32              ibm_nob;
+        __u32              ibm_cksum;
+#endif
+        union {
+                kib_immediate_msg_t   immediate;
+                kib_rdma_msg_t        rdma;
+                kib_completion_msg_t  completion;
+        } ibm_u __attribute__((packed));
+} kib_msg_t __attribute__((packed));
+
+#define IBNAL_MSG_MAGIC       0x0be91b91        /* unique magic */
+#define IBNAL_MSG_VERSION              1        /* current protocol version */
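+/* Because messages travel in the sender's byte order, a receiver could (as
+ * an illustrative sketch only, not necessarily what this NAL does) detect an
+ * opposite-endian peer from the magic:
+ *
+ *     if (msg->ibm_magic == IBNAL_MSG_MAGIC)
+ *             ... same byte order as me ...
+ *     else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC))
+ *             ... peer is byte-flipped; swab the rest of the header ...
+ *     else
+ *             ... not a valid message ...
+ */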
+
+#define IBNAL_MSG_NOOP              0xd0        /* nothing (just credits) */
+#define IBNAL_MSG_IMMEDIATE         0xd1        /* portals hdr + payload */
+#define IBNAL_MSG_PUT_RDMA          0xd2        /* portals PUT hdr + source rdma desc */
+#define IBNAL_MSG_PUT_DONE          0xd3        /* signal PUT rdma completion */
+#define IBNAL_MSG_GET_RDMA          0xd4        /* portals GET hdr + sink rdma desc */
+#define IBNAL_MSG_GET_DONE          0xd5        /* signal GET rdma completion */
+
+/***********************************************************************/
+
+typedef struct kib_rx                           /* receive message */
+{
+        struct list_head          rx_list;      /* queue for attention */
+        struct kib_conn          *rx_conn;      /* owning conn */
+        int                       rx_rdma;      /* RDMA completion posted? */
+        int                       rx_posted;    /* posted? */
+        kib_msg_t                *rx_msg;     /* pre-mapped buffer */
+        vv_l_key_t                l_key;
+        vv_wr_t                   rx_wrq;
+        vv_scatgat_t              rx_gl;        /* and its memory */
+} kib_rx_t;
+
+typedef struct kib_tx                           /* transmit message */
+{
+        struct list_head          tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
+        int                       tx_isnblk;    /* I'm reserved for non-blocking sends */
+        struct kib_conn          *tx_conn;      /* owning conn */
+        int                       tx_mapped;    /* mapped for RDMA? */
+        int                       tx_sending;   /* # tx callbacks outstanding */
+        int                       tx_status;    /* completion status */
+        unsigned long             tx_deadline;  /* completion deadline */
+        int                       tx_passive_rdma; /* peer sucks/blows */
+        int                       tx_passive_rdma_wait; /* waiting for peer to complete */
+        __u64                     tx_passive_rdma_cookie; /* completion cookie */
+        lib_msg_t                *tx_libmsg[2]; /* lib msgs to finalize on completion */
+        kib_md_t                  tx_md;        /* RDMA mapping (active/passive) */
+        kib_msg_t                *tx_msg;       /* pre-mapped buffer */
+        vv_l_key_t                l_key;
+        vv_r_key_t                r_key;
+        int                       tx_nsp;       /* # send work items */
+        vv_wr_t                  tx_wrq[IBNAL_TX_MAX_SG];    /* send work items... */
+        vv_scatgat_t              tx_gl[IBNAL_TX_MAX_SG];     /* ...and their memory */
+} kib_tx_t;
+
+#define KIB_TX_UNMAPPED       0
+#define KIB_TX_MAPPED         1
+#define KIB_TX_MAPPED_FMR     2
+
+typedef struct kib_wire_connreq
+{
+        __u32        wcr_magic;                 /* I'm an openibnal connreq */
+        __u16        wcr_version;               /* this is my version number */
+        __u16        wcr_queue_depth;           /* this is my receive queue size */
+        __u64        wcr_nid;                   /* peer's NID */
+        __u64        wcr_incarnation;           /* peer's incarnation */
+} kib_wire_connreq_t;
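+/* Note: this connreq travels as CM REQ private data; kibnal_module_init()
+ * refuses to load if sizeof(kib_wire_connreq_t) > cm_REQ_priv_data_len. */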
+
+typedef struct kib_gid
+{
+        __u64   hi, lo;
+} kib_gid_t;
+
+typedef struct kib_connreq
+{
+        /* connection-in-progress */
+        struct kib_conn                    *cr_conn;
+        kib_wire_connreq_t                  cr_wcr;
+        __u64                               cr_tid;
+        //ib_service_record_v2_t              cr_service;
+        kib_gid_t                           cr_gid;
+        ib_path_record_v2_t                 cr_path;
+
+        union {
+                cm_request_data_t                   cr_cm_req;
+                cm_rtu_data_t                       cr_cm_rtu;
+        } ;
+
+} kib_connreq_t;
+
+typedef struct kib_conn
+{ 
+        struct kib_peer    *ibc_peer;           /* owning peer */
+        struct list_head    ibc_list;           /* stash on peer's conn list */
+        __u64               ibc_incarnation;    /* which instance of the peer */
+        atomic_t            ibc_refcount;       /* # users */
+        int                 ibc_state;          /* what's happening */
+        atomic_t            ibc_nob;            /* # bytes buffered */
+        int                 ibc_nsends_posted;  /* # uncompleted sends */
+        int                 ibc_credits;        /* # credits I have */
+        int                 ibc_outstanding_credits; /* # credits to return */
+        int                 ibc_rcvd_disconnect;/* received discon request */
+        int                 ibc_sent_disconnect;/* sent discon request */
+        struct list_head    ibc_tx_queue;       /* send queue */
+        struct list_head    ibc_active_txs;     /* active tx awaiting completion */
+        spinlock_t          ibc_lock;           /* serialise */
+        kib_rx_t           *ibc_rxs;            /* the rx descs */
+        kib_pages_t        *ibc_rx_pages;       /* premapped rx msg pages */
+        vv_qp_h_t           ibc_qp;             /* queue pair */
+        cm_cep_handle_t     ibc_cep;            /* connection ID? */
+        vv_qp_attr_t        ibc_qp_attrs;    /* QP attrs */
+        kib_connreq_t      *ibc_connreq;        /* connection request state */
+} kib_conn_t;
+
+#define IBNAL_CONN_INIT_NOTHING      0          /* initial state */
+#define IBNAL_CONN_INIT_QP           1          /* ibc_qp set up */
+#define IBNAL_CONN_CONNECTING        2          /* started to connect */
+#define IBNAL_CONN_ESTABLISHED       3          /* connection established */
+#define IBNAL_CONN_SEND_DREQ         4          /* to send disconnect req */
+#define IBNAL_CONN_DREQ              5          /* sent disconnect req */
+#define IBNAL_CONN_DREP              6          /* sent disconnect rep */
+#define IBNAL_CONN_DISCONNECTED      7          /* no more QP or CM traffic */
+
+#define KIB_ASSERT_CONN_STATE(conn, state) do {                         \
+        LASSERTF((conn)->ibc_state == state, "%d\n", conn->ibc_state);  \
+} while (0)
+
+#define KIB_ASSERT_CONN_STATE_RANGE(conn, low, high) do {               \
+        LASSERTF(low <= high, "%d %d\n", low, high);                    \
+        LASSERTF((conn)->ibc_state >= low && (conn)->ibc_state <= high, \
+                 "%d\n", conn->ibc_state);                              \
+} while (0)
+
+typedef struct kib_peer
+{
+        struct list_head    ibp_list;           /* stash on global peer list */
+        struct list_head    ibp_connd_list;     /* schedule on kib_connd_peers */
+        ptl_nid_t           ibp_nid;            /* who's on the other end(s) */
+        atomic_t            ibp_refcount;       /* # users */
+        int                 ibp_persistence;    /* "known" peer refs */
+        struct list_head    ibp_conns;          /* all active connections */
+        struct list_head    ibp_tx_queue;       /* msgs waiting for a conn */
+        int                 ibp_connecting;     /* connecting+accepting */
+        unsigned long       ibp_reconnect_time; /* when reconnect may be attempted */
+        unsigned long       ibp_reconnect_interval; /* exponential backoff */
+} kib_peer_t;
+
+struct sa_request;
+typedef void (*sa_request_cb_t)(struct sa_request *request);
+
+struct sa_request {
+        /* Link all the pending GSI datagrams together. */
+        struct list_head list;
+
+        int retry;              /* number of retries left (after a timeout only) */
+        int status;             /* status of the request */
+        gsi_dtgrm_t *dtgrm_req; /* request */
+        gsi_dtgrm_t *dtgrm_resp; /* response */
+        sa_mad_v2_t *mad;       /* points inside the datagram */
+
+        void *context;
+
+        struct timer_list timer;
+
+        /* When the requests is completed, we either call the callback
+         * or post a completion. They are mutually exclusive. */
+        struct completion signal;
+        sa_request_cb_t callback;
+};
+
+/* The CM callbacks are called at interrupt level. However, we
+ * cannot do everything we want at that level, so we let keventd run
+ * the callback. */
+struct cm_off_level {
+        struct tq_struct tq;
+
+        cm_cep_handle_t cep;
+        cm_conn_data_t *info;
+        kib_conn_t *conn;
+};
+
+extern lib_nal_t       kibnal_lib;
+extern kib_data_t      kibnal_data;
+extern kib_tunables_t  kibnal_tunables;
+
+static inline int wrq_signals_completion(vv_wr_t *wrq)
+{
+        return wrq->completion_notification != 0;
+}
+
+/******************************************************************************/
+
+/* these purposely avoid using local vars so they don't increase
+ * stack consumption. */
+
+#define kib_peer_addref(peer) do {                                      \
+        LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n",          \
+                 atomic_read(&peer->ibp_refcount));                     \
+        CDEBUG(D_NET, "++peer[%p] -> "LPX64" (%d)\n",                   \
+               peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \
+        atomic_inc(&peer->ibp_refcount);                                \
+} while (0)
+
+#define kib_peer_decref(peer) do {                                      \
+        LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n",          \
+                 atomic_read(&peer->ibp_refcount));                     \
+        CDEBUG(D_NET, "--peer[%p] -> "LPX64" (%d)\n",                   \
+               peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \
+        if (atomic_dec_and_test (&peer->ibp_refcount)) {                \
+                CDEBUG (D_NET, "destroying peer "LPX64" %p\n",          \
+                        peer->ibp_nid, peer);                           \
+                kibnal_destroy_peer (peer);                             \
+        }                                                               \
+} while (0)
+
+/******************************************************************************/
+
+static inline struct list_head *
+kibnal_nid2peerlist (ptl_nid_t nid) 
+{
+        unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size;
+        
+        return (&kibnal_data.kib_peers [hash]);
+}
+
+static inline int
+kibnal_peer_active(kib_peer_t *peer)
+{
+        /* Am I in the peer hash table? */
+        return (!list_empty(&peer->ibp_list));
+}
+
+static inline void
+kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
+{
+        /* CAVEAT EMPTOR: tx takes caller's ref on conn */
+
+        LASSERT (tx->tx_nsp > 0);               /* work items set up */
+        LASSERT (tx->tx_conn == NULL);          /* only set here */
+
+        tx->tx_conn = conn;
+        tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
+        list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+}
+
+static inline __u64*
+kibnal_service_nid_field(ib_service_record_v2_t *sr)
+{
+        /* The service key mask must have byte 0 to 7 set. */
+        return (__u64 *)sr->service_data8;
+}
+
+static inline void
+kibnal_set_service_keys(ib_service_record_v2_t *sr, ptl_nid_t nid)
+{
+        LASSERT (strlen(IBNAL_SERVICE_NAME) < sizeof(sr->service_name));
+
+        strcpy (sr->service_name, IBNAL_SERVICE_NAME);
+
+        *kibnal_service_nid_field(sr) = cpu_to_le64(nid);
+}
+
+#if CONFIG_X86
+/* TODO: use vv_va2adverize instead */
+static inline __u64
+kibnal_page2phys (struct page *p)
+{
+        __u64 page_number = p - mem_map;
+        
+        return (page_number << PAGE_SHIFT);
+}
+#else
+# error "no page->phys"
+#endif
+
+/* CAVEAT EMPTOR: We rely on tx/rx descriptor alignment to allow us to
+ * use the lowest bit of the work request id as a flag to determine if
+ * the completion is for a transmit or a receive (the op field is not
+ * valid when the wc completes in error). */
+
+static inline vv_wr_id_t
+kibnal_ptr2wreqid (void *ptr, int isrx)
+{
+        unsigned long lptr = (unsigned long)ptr;
+
+        LASSERT ((lptr & 1) == 0);
+        return (vv_wr_id_t)(lptr | (isrx ? 1 : 0));
+}
+
+static inline void *
+kibnal_wreqid2ptr (vv_wr_id_t wreqid)
+{
+        return (void *)(((unsigned long)wreqid) & ~1UL);
+}
+
+static inline int
+kibnal_wreqid_is_rx (vv_wr_id_t wreqid)
+{
+        return (wreqid & 1) != 0;
+}
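+/* Illustrative decode on completion (a sketch; the real completion handler
+ * lives elsewhere in this patch): the poster stored kibnal_ptr2wreqid(rx, 1)
+ * or kibnal_ptr2wreqid(tx, 0) in wr_id, so:
+ *
+ *     if (kibnal_wreqid_is_rx(wc.wr_id))
+ *             rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id);
+ *     else
+ *             tx = (kib_tx_t *)kibnal_wreqid2ptr(wc.wr_id);
+ */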
+
+static inline int
+kibnal_whole_mem(void)
+{
+#if IBNAL_WHOLE_MEM
+        return true;
+#else
+        return false;
+#endif
+}
+
+/* Voltaire stores GIDs in host order. */
+static inline void gid_swap(vv_gid_t *gid)
+{
+        u_int64_t s;
+
+        s = gid->scope.g.subnet;
+        gid->scope.g.subnet = cpu_to_be64(gid->scope.g.eui64);
+        gid->scope.g.eui64 = cpu_to_be64(s);
+}
+
+#if 0
+static void dump_qp(kib_conn_t *conn)
+{
+        vv_qp_attr_t *qp_attrs;
+        void *qp_context;
+        vv_return_t retval;
+
+        CERROR("QP dumping %p\n", conn); 
+
+        retval = vv_qp_query(kibnal_data.kib_hca, conn->ibc_qp, &qp_context, &conn->ibc_qp_attrs);
+        if (retval) {
+                CERROR ("Couldn't query qp attributes: %d\n", retval);
+                return;
+        }
+
+        qp_attrs = &conn->ibc_qp_attrs;
+
+        CERROR("QP %x dump\n", qp_attrs->query.qp_num);
+        CERROR("  vv_qp_attr_mask = %llx\n", qp_attrs->query.vv_qp_attr_mask);
+        CERROR("  qp_state = %d\n", qp_attrs->query.qp_state);
+        CERROR("  cq_send_h = %p\n", qp_attrs->query.cq_send_h);
+        CERROR("  cq_receive_h = %p \n", qp_attrs->query.cq_receive_h);
+        CERROR("  send_max_outstand_wr = %d\n", qp_attrs->query.send_max_outstand_wr);
+        CERROR("  receive_max_outstand_wr = %d\n", qp_attrs->query.receive_max_outstand_wr);
+        CERROR("  max_scatgat_per_send_wr = %d\n", qp_attrs->query.max_scatgat_per_send_wr);
+        CERROR("  max_scatgat_per_receive_wr = %d\n", qp_attrs->query.max_scatgat_per_receive_wr);
+        CERROR("  send_psn = %x\n", qp_attrs->query.send_psn);
+        CERROR("  receve_psn = %x\n", qp_attrs->query.receve_psn);
+        CERROR("  access_control = %x\n", qp_attrs->query.access_control);
+        CERROR("  phy_port_num = %d\n", qp_attrs->query.phy_port_num);
+        CERROR("  primary_p_key_indx = %x\n", qp_attrs->query.primary_p_key_indx);
+        CERROR("  q_key = %x\n", qp_attrs->query.q_key);
+        CERROR("  destanation_qp = %x\n", qp_attrs->query.destanation_qp);
+        CERROR("  rdma_r_atom_outstand_num = %d\n", qp_attrs->query.rdma_r_atom_outstand_num);
+        CERROR("  responder_rdma_r_atom_num = %d\n", qp_attrs->query.responder_rdma_r_atom_num);
+        CERROR("  min_rnr_nak_timer = %d\n", qp_attrs->query.min_rnr_nak_timer);
+        CERROR("  pd_h = %lx\n", qp_attrs->query.pd_h);
+        CERROR("  recv_solicited_events = %d\n", qp_attrs->query.recv_solicited_events);
+        CERROR("  send_signaled_comp = %d\n", qp_attrs->query.send_signaled_comp);
+        CERROR("  flow_control = %d\n", qp_attrs->query.flow_control);
+}
+#else
+#define dump_qp(a)
+#endif
+
+#if 0
+static void dump_wqe(vv_wr_t *wr)
+{
+        CERROR("Dumping send WR %p\n", wr);
+        
+        CERROR("  wr_id = %llx\n", wr->wr_id);
+        CERROR("  completion_notification = %d\n", wr->completion_notification);
+        CERROR("  scatgat_list = %p\n", wr->scatgat_list);
+        CERROR("  num_of_data_segments = %d\n", wr->num_of_data_segments);
+
+        if (wr->scatgat_list && wr->num_of_data_segments) {
+                CERROR("    scatgat_list[0].v_address = %p\n", wr->scatgat_list[0].v_address);
+                CERROR("    scatgat_list[0].length = %d\n", wr->scatgat_list[0].length);
+                CERROR("    scatgat_list[0].l_key = %x\n", wr->scatgat_list[0].l_key);
+        }
+
+        CERROR("  wr_type = %d\n", wr->wr_type);
+        
+        switch(wr->wr_type) {
+        case vv_wr_send:
+                CERROR("  send\n");
+                
+                CERROR("  fance_indicator = %d\n", wr->type.send.send_qp_type.rc_type.fance_indicator);
+                break;
+                
+        case vv_wr_receive:
+                break;
+
+        case vv_wr_rdma_write:
+        case vv_wr_rdma_read:
+                CERROR("  rdma\n");
+                CERROR("  fance_indicator = %d\n", wr->type.send.send_qp_type.rc_type.fance_indicator);
+                CERROR("  r_addr = %llx\n", wr->type.send.send_qp_type.rc_type.r_addr);
+                CERROR("  r_r_key = %x\n", wr->type.send.send_qp_type.rc_type.r_r_key);
+                break;
+
+        default:
+                break;
+        }
+}
+
+#else
+#define dump_wqe(a)
+#endif
+
+#if 0
+static void dump_wc(vv_wc_t *wc)
+{
+        CERROR("Dumping WC\n");
+
+        CERROR("  wr_id = %llx\n", wc->wr_id);
+        CERROR("  operation_type = %d\n", wc->operation_type);
+        CERROR("  num_bytes_transfered = %lld\n", wc->num_bytes_transfered);
+        CERROR("  completion_status = %d\n", wc->completion_status);
+}
+#else
+#define dump_wc(a)
+#endif
+
+#if 0
+static void hexdump(char *string, void *ptr, int len)
+{
+        unsigned char *c = ptr;
+        int i;
+
+        if (len < 0 || len > 2048)  {
+                printk("XXX what the hell? %d\n",len);
+                return;
+        }
+
+        printk("%d bytes of '%s' from 0x%p\n", len, string, ptr);
+
+        for (i = 0; i < len;) {
+                printk("%02x",*(c++));
+                i++;
+                if (!(i & 15)) {
+                        printk("\n");
+                } else if (!(i&1)) {
+                        printk(" ");
+                }
+        }
+
+        if(len & 15) {
+                printk("\n");
+        }
+}
+#else
+#define hexdump(a,b,c)
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+
+extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid);
+extern void kibnal_destroy_peer (kib_peer_t *peer);
+extern int kibnal_del_peer (ptl_nid_t nid, int single_share);
+extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid);
+extern void kibnal_unlink_peer_locked (kib_peer_t *peer);
+extern int  kibnal_close_stale_conns_locked (kib_peer_t *peer, 
+                                              __u64 incarnation);
+extern kib_conn_t *kibnal_create_conn (void);
+extern void kibnal_put_conn (kib_conn_t *conn);
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+extern void kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg);
+
+extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access);
+extern void kibnal_free_pages (kib_pages_t *p);
+
+extern void kibnal_check_sends (kib_conn_t *conn);
+extern void kibnal_close_conn_locked (kib_conn_t *conn, int error);
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+extern int  kibnal_thread_start (int (*fn)(void *arg), void *arg);
+extern int  kibnal_scheduler(void *arg);
+extern int  kibnal_connd (void *arg);
+extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob);
+extern void kibnal_close_conn (kib_conn_t *conn, int why);
+extern void kibnal_start_active_rdma (int type, int status, 
+                                      kib_rx_t *rx, lib_msg_t *libmsg, 
+                                      unsigned int niov, 
+                                      struct iovec *iov, ptl_kiov_t *kiov,
+                                      size_t offset, size_t nob);
+
+void kibnal_ca_async_callback(vv_event_record_t ev);
+void kibnal_ca_callback (unsigned long context);
+extern void vibnal_mad_received_cb(gsi_class_handle_t handle, void *context, gsi_dtgrm_t * dtgrm);
+extern void vibnal_mad_sent_cb(gsi_class_handle_t handle, void *context, gsi_dtgrm_t * dtgrm);
+extern int kibnal_advertize_op(ptl_nid_t nid, int op, sa_request_cb_t callback, void *context);
+extern int vibnal_start_sa_request(struct sa_request *request);
+extern struct sa_request *alloc_sa_request(void);
+extern void free_sa_request(struct sa_request *request);
+extern int kibnal_pathrecord_op(struct sa_request *request, vv_gid_t dgid, sa_request_cb_t callback, void *context);
diff --git a/lnet/klnds/viblnd/viblnd_cb.c b/lnet/klnds/viblnd/viblnd_cb.c
new file mode 100644 (file)
index 0000000..78bcda4
--- /dev/null
@@ -0,0 +1,3163 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *   Author: Frank Zago <fzago@systemfabricworks.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "vibnal.h"
+
+static void kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg);
+
+/*
+ *  LIB functions follow
+ *
+ */
+static void
+kibnal_schedule_tx_done (kib_tx_t *tx)
+{
+        unsigned long flags;
+
+        spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);
+
+        list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
+        wake_up (&kibnal_data.kib_sched_waitq);
+
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+}
+
+static void
+kibnal_tx_done (kib_tx_t *tx)
+{
+        ptl_err_t        ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
+        unsigned long    flags;
+        int              i;
+#if IBNAL_FMR
+        int              rc;                    /* ib_fmr_deregister() status */
+#endif
+        vv_return_t retval;
+
+        LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting callback */
+        LASSERT (!tx->tx_passive_rdma_wait);    /* mustn't be awaiting RDMA */
+
+        switch (tx->tx_mapped) {
+        default:
+                LBUG();
+
+        case KIB_TX_UNMAPPED:
+                break;
+
+        case KIB_TX_MAPPED:
+                if (in_interrupt()) {
+                        /* can't deregister memory in IRQ context... */
+                        kibnal_schedule_tx_done(tx);
+                        return;
+                }
+                retval = vv_mem_region_destroy(kibnal_data.kib_hca, tx->tx_md.md_handle);
+                LASSERT (retval == vv_return_ok);
+                tx->tx_mapped = KIB_TX_UNMAPPED;
+                break;
+
+#if IBNAL_FMR
+        case KIB_TX_MAPPED_FMR:
+                if (in_interrupt() && tx->tx_status != 0) {
+                        /* can't flush FMRs in IRQ context... */
+                        kibnal_schedule_tx_done(tx);
+                        return;
+                }              
+
+                rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr);
+                LASSERT (rc == 0);
+
+                if (tx->tx_status != 0)
+                        ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
+                tx->tx_mapped = KIB_TX_UNMAPPED;
+                break;
+#endif
+        }
+
+        for (i = 0; i < 2; i++) {
+                /* tx may have up to 2 libmsgs to finalise */
+                if (tx->tx_libmsg[i] == NULL)
+                        continue;
+
+                lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
+                tx->tx_libmsg[i] = NULL;
+        }
+        
+        if (tx->tx_conn != NULL) {
+                kibnal_put_conn (tx->tx_conn);
+                tx->tx_conn = NULL;
+        }
+
+        tx->tx_nsp = 0;
+        tx->tx_passive_rdma = 0;
+        tx->tx_status = 0;
+
+        spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+
+        if (tx->tx_isnblk) {
+                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
+        } else {
+                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
+                wake_up (&kibnal_data.kib_idle_tx_waitq);
+        }
+
+        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+}
+
+static kib_tx_t *
+kibnal_get_idle_tx (int may_block) 
+{
+        unsigned long  flags;
+        kib_tx_t      *tx = NULL;
+        ENTRY;
+        
+        for (;;) {
+                spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+
+                /* "normal" descriptor is free */
+                if (!list_empty (&kibnal_data.kib_idle_txs)) {
+                        tx = list_entry (kibnal_data.kib_idle_txs.next,
+                                         kib_tx_t, tx_list);
+                        break;
+                }
+
+                if (!may_block) {
+                        /* may dip into reserve pool */
+                        if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
+                                CERROR ("reserved tx desc pool exhausted\n");
+                                break;
+                        }
+
+                        tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
+                                         kib_tx_t, tx_list);
+                        break;
+                }
+
+                /* block for idle tx */
+                spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+
+                wait_event (kibnal_data.kib_idle_tx_waitq,
+                            !list_empty (&kibnal_data.kib_idle_txs) ||
+                            kibnal_data.kib_shutdown);
+        }
+
+        if (tx != NULL) {
+                list_del (&tx->tx_list);
+
+                /* Allocate a new passive RDMA completion cookie.  It might
+                 * not be needed, but we've got a lock right now and we're
+                 * unlikely to wrap... */
+                tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
+
+                LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+                LASSERT (tx->tx_nsp == 0);
+                LASSERT (tx->tx_sending == 0);
+                LASSERT (tx->tx_status == 0);
+                LASSERT (tx->tx_conn == NULL);
+                LASSERT (!tx->tx_passive_rdma);
+                LASSERT (!tx->tx_passive_rdma_wait);
+                LASSERT (tx->tx_libmsg[0] == NULL);
+                LASSERT (tx->tx_libmsg[1] == NULL);
+        }
+
+        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+        
+        RETURN(tx);
+}
+
+static int
+kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* I would guess that if kibnal_get_peer (nid) == NULL,
+           and we're not routing, then 'nid' is very distant :) */
+        if ( nal->libnal_ni.ni_pid.nid == nid ) {
+                *dist = 0;
+        } else {
+                *dist = 1;
+        }
+
+        return 0;
+}
+
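+/* A peer's ..._DONE message carries the cookie of the tx that advertised
+ * the buffer; find that tx on the connection's active list and complete
+ * it.  Whoever makes the tx idle (here or the send completion callback)
+ * is the one who frees it. */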
+static void
+kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
+{
+        struct list_head *ttmp;
+        unsigned long     flags;
+        int               idle;
+
+        spin_lock_irqsave (&conn->ibc_lock, flags);
+
+        list_for_each (ttmp, &conn->ibc_active_txs) {
+                kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);
+
+                LASSERT (tx->tx_passive_rdma ||
+                         !tx->tx_passive_rdma_wait);
+
+                LASSERT (tx->tx_passive_rdma_wait ||
+                         tx->tx_sending != 0);
+
+                if (!tx->tx_passive_rdma_wait ||
+                    tx->tx_passive_rdma_cookie != cookie)
+                        continue;
+
+                CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);
+
+                tx->tx_status = status;
+                tx->tx_passive_rdma_wait = 0;
+                idle = (tx->tx_sending == 0);
+
+                if (idle)
+                        list_del (&tx->tx_list);
+
+                spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+                /* I could be racing with tx callbacks.  It's whoever
+                 * _makes_ tx idle that frees it */
+                if (idle)
+                        kibnal_tx_done (tx);
+                return;
+        }
+                
+        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+        CERROR ("Unmatched (late?) RDMA completion "LPX64" from "LPX64"\n",
+                cookie, conn->ibc_peer->ibp_nid);
+}
+
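+/* (Re)post a receive buffer on the connection's QP.  When 'do_credits'
+ * is set, the buffer being recycled returns a credit to the peer, so bump
+ * ibc_outstanding_credits and poke the send path to carry it back. */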
+static void
+kibnal_post_rx (kib_rx_t *rx, int do_credits)
+{
+        kib_conn_t   *conn = rx->rx_conn;
+        int           rc = 0;
+        unsigned long flags;
+        vv_return_t retval;
+
+        ENTRY;
+        
+        rx->rx_gl = (vv_scatgat_t) {
+                .v_address = (void *)rx->rx_msg,
+                .length    = IBNAL_MSG_SIZE,
+                .l_key     = rx->l_key,
+        };
+
+        rx->rx_wrq = (vv_wr_t) {
+                .wr_id                   = kibnal_ptr2wreqid(rx, 1),
+                .completion_notification = 1,
+                .scatgat_list            = &rx->rx_gl,
+                .num_of_data_segments    = 1,
+                .wr_type                 = vv_wr_receive,
+        };
+
+        KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED,
+                                    IBNAL_CONN_DREP);
+        LASSERT (!rx->rx_posted);
+        rx->rx_posted = 1;
+        mb();
+
+        if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
+                rc = -ECONNABORTED;
+        else {
+                retval = vv_post_receive(kibnal_data.kib_hca, conn->ibc_qp, &rx->rx_wrq);
+
+                if (retval) {
+                        CDEBUG(D_NET, "post failed %d\n", retval);
+                        rc = -EINVAL;
+                } else {
+                        CDEBUG(D_NET, "posted rx %p\n", &rx->rx_wrq);
+                }
+        }
+
+        if (rc == 0) {
+                if (do_credits) {
+                        spin_lock_irqsave(&conn->ibc_lock, flags);
+                        conn->ibc_outstanding_credits++;
+                        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+                        kibnal_check_sends(conn);
+                }
+                EXIT;
+                return;
+        }
+
+        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+                CERROR ("Error posting receive -> "LPX64": %d\n",
+                        conn->ibc_peer->ibp_nid, rc);
+                kibnal_close_conn (rx->rx_conn, rc);
+        } else {
+                CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n",
+                        conn->ibc_peer->ibp_nid, rc);
+        }
+
+        /* Drop rx's ref */
+        kibnal_put_conn (conn);
+        EXIT;
+}
+
+#if IBNAL_CKSUM
+static inline __u32 kibnal_cksum (void *ptr, int nob)
+{
+        char  *c  = ptr;
+        __u32  sum = 0;
+
+        while (nob-- > 0)
+                sum = ((sum << 1) | (sum >> 31)) + *c++;
+        
+        return (sum);
+}
+#endif
+
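+/* Receive completion handler, called from the CQ callback.  It validates
+ * the message (magic, version, length and optional checksum), absorbs any
+ * credits the peer returned, handles NOOP and ..._DONE messages inline and
+ * hands everything else to a scheduler thread for kibnal_rx(). */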
+static void
+kibnal_rx_callback (vv_wc_t *wc)
+{
+        kib_rx_t     *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->wr_id);
+        kib_msg_t    *msg = rx->rx_msg;
+        kib_conn_t   *conn = rx->rx_conn;
+        int           nob = wc->num_bytes_transfered;
+        const int     base_nob = offsetof(kib_msg_t, ibm_u);
+        int           credits;
+        int           flipped;
+        unsigned long flags;
+        __u32         i;
+#if IBNAL_CKSUM
+        __u32         msg_cksum;
+        __u32         computed_cksum;
+#endif
+
+        /* we only set the QP to the error state after we've finished
+         * disconnecting; maybe we should do so sooner. */
+        KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, 
+                                    IBNAL_CONN_DISCONNECTED);
+
+        CDEBUG(D_NET, "rx %p conn %p, nob=%d\n", rx, conn, nob);
+
+        LASSERT (rx->rx_posted);
+        rx->rx_posted = 0;
+        mb();
+
+        /* receives complete with error in any case after we've started
+         * disconnecting */
+        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
+                goto failed;
+
+        if (wc->completion_status != vv_comp_status_success) {
+                CERROR("Rx from "LPX64" failed: %d\n", 
+                       conn->ibc_peer->ibp_nid, wc->completion_status);
+                goto failed;
+        }
+
+        if (nob < base_nob) {
+                CERROR ("Short rx from "LPX64": %d < expected %d\n",
+                        conn->ibc_peer->ibp_nid, nob, base_nob);
+                goto failed;
+        }
+
+        /* Receiver does any byte flipping if necessary... */
+
+        if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
+                flipped = 0;
+        } else {
+                if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
+                        CERROR ("Unrecognised magic: %08x from "LPX64"\n", 
+                                msg->ibm_magic, conn->ibc_peer->ibp_nid);
+                        goto failed;
+                }
+                flipped = 1;
+                __swab16s (&msg->ibm_version);
+                LASSERT (sizeof(msg->ibm_type) == 1);
+                LASSERT (sizeof(msg->ibm_credits) == 1);
+        }
+
+        if (msg->ibm_version != IBNAL_MSG_VERSION) {
+                CERROR ("Incompatible msg version %d (%d expected)\n",
+                        msg->ibm_version, IBNAL_MSG_VERSION);
+                goto failed;
+        }
+
+#if IBNAL_CKSUM
+        if (nob != msg->ibm_nob) {
+                CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob);
+                goto failed;
+        }
+
+        msg_cksum = le32_to_cpu(msg->ibm_cksum);
+        msg->ibm_cksum = 0;
+        computed_cksum = kibnal_cksum (msg, nob);
+        
+        if (msg_cksum != computed_cksum) {
+                CERROR ("Checksum failure %d: (%d expected)\n",
+                        computed_cksum, msg_cksum);
+//                goto failed;
+        }
+        CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob);
+#endif
+
+        /* Have I received credits that will let me send? */
+        credits = msg->ibm_credits;
+        if (credits != 0) {
+                spin_lock_irqsave(&conn->ibc_lock, flags);
+                conn->ibc_credits += credits;
+                spin_unlock_irqrestore(&conn->ibc_lock, flags);
+                
+                kibnal_check_sends(conn);
+        }
+
+        switch (msg->ibm_type) {
+        case IBNAL_MSG_NOOP:
+                kibnal_post_rx (rx, 1);
+                return;
+
+        case IBNAL_MSG_IMMEDIATE:
+                if (nob < base_nob + sizeof (kib_immediate_msg_t)) {
+                        CERROR ("Short IMMEDIATE from "LPX64": %d\n",
+                                conn->ibc_peer->ibp_nid, nob);
+                        goto failed;
+                }
+                break;
+                
+        case IBNAL_MSG_PUT_RDMA:
+        case IBNAL_MSG_GET_RDMA:
+                if (nob < base_nob + sizeof (kib_rdma_msg_t)) {
+                        CERROR ("Short RDMA msg from "LPX64": %d\n",
+                                conn->ibc_peer->ibp_nid, nob);
+                        goto failed;
+                }
+                if (flipped)
+                        __swab32s(&msg->ibm_u.rdma.ibrm_num_descs);
+
+                CDEBUG(D_NET, "%d RDMA: cookie "LPX64":\n",
+                       msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie);
+
+                if ((msg->ibm_u.rdma.ibrm_num_descs > PTL_MD_MAX_IOV) ||
+                    (kib_rdma_msg_len(msg->ibm_u.rdma.ibrm_num_descs) > 
+                     min(nob, IBNAL_MSG_SIZE))) {
+                        CERROR ("num_descs %d too large\n", 
+                                msg->ibm_u.rdma.ibrm_num_descs);
+                        goto failed;
+                }
+
+                if (flipped)
+                        __swab32s(&msg->ibm_u.rdma.rd_key);
+
+                for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) {
+                        kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i];
+
+                        if (flipped) {
+                                __swab32s(&desc->rd_nob);
+                                __swab64s(&desc->rd_addr);
+                        }
+
+                        CDEBUG(D_NET, "  key %x, " "addr "LPX64", nob %u\n",
+                               msg->ibm_u.rdma.rd_key, desc->rd_addr, desc->rd_nob);
+                }
+                break;
+                        
+        case IBNAL_MSG_PUT_DONE:
+        case IBNAL_MSG_GET_DONE:
+                if (nob < base_nob + sizeof (kib_completion_msg_t)) {
+                        CERROR ("Short COMPLETION msg from "LPX64": %d\n",
+                                conn->ibc_peer->ibp_nid, nob);
+                        goto failed;
+                }
+                if (flipped)
+                        __swab32s(&msg->ibm_u.completion.ibcm_status);
+                
+                CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
+                       msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
+                       msg->ibm_u.completion.ibcm_status);
+
+                kibnal_complete_passive_rdma (conn, 
+                                              msg->ibm_u.completion.ibcm_cookie,
+                                              msg->ibm_u.completion.ibcm_status);
+                kibnal_post_rx (rx, 1);
+                return;
+                        
+        default:
+                CERROR ("Can't parse type from "LPX64": %d\n",
+                        conn->ibc_peer->ibp_nid, msg->ibm_type);
+                goto failed;
+        }
+
+        /* schedule for kibnal_rx() in thread context */
+        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+        
+        list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
+        wake_up (&kibnal_data.kib_sched_waitq);
+        
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+
+        return;
+        
+ failed:
+        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+        kibnal_close_conn(conn, -ECONNABORTED);
+
+        /* Don't re-post rx & drop its ref on conn */
+        kibnal_put_conn(conn);
+}
+
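+/* Thread-context half of receive processing: feed the portals header to
+ * lib_parse() and, for a GET that matched nothing, send a failed
+ * completion immediately so the peer doesn't wait for the full timeout. */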
+static void
+kibnal_rx (kib_rx_t *rx)
+{
+        kib_msg_t   *msg = rx->rx_msg;
+
+        /* Clear flag so I can detect if I've sent an RDMA completion */
+        rx->rx_rdma = 0;
+
+        switch (msg->ibm_type) {
+        case IBNAL_MSG_GET_RDMA:
+                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
+                /* If the incoming get was matched, I'll have initiated the
+                 * RDMA and the completion message... */
+                if (rx->rx_rdma)
+                        break;
+
+                /* Otherwise, I'll send a failed completion now to prevent
+                 * the peer's GET blocking for the full timeout. */
+                CERROR ("Completing unmatched RDMA GET from "LPX64"\n",
+                        rx->rx_conn->ibc_peer->ibp_nid);
+                kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO,
+                                          rx, NULL, 0, NULL, NULL, 0, 0);
+                break;
+                
+        case IBNAL_MSG_PUT_RDMA:
+                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
+                if (rx->rx_rdma)
+                        break;
+                /* This is most unusual, since even if lib_parse() didn't
+                 * match anything, it should have asked us to read (and
+                 * discard) the payload.  The portals header must be
+                 * inconsistent with this message type, so it's the
+                 * sender's fault for sending garbage and she can time
+                 * herself out... */
+                CERROR ("Uncompleted RMDA PUT from "LPX64"\n",
+                        rx->rx_conn->ibc_peer->ibp_nid);
+                break;
+
+        case IBNAL_MSG_IMMEDIATE:
+                lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
+                LASSERT (!rx->rx_rdma);
+                break;
+                
+        default:
+                LBUG();
+                break;
+        }
+
+        kibnal_post_rx (rx, 1);
+}
+
+static struct page *
+kibnal_kvaddr_to_page (unsigned long vaddr)
+{
+        struct page *page;
+
+        if (vaddr >= VMALLOC_START &&
+            vaddr < VMALLOC_END)
+                page = vmalloc_to_page ((void *)vaddr);
+#ifdef CONFIG_HIGHMEM
+        else if (vaddr >= PKMAP_BASE &&
+                 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
+                page = vmalloc_to_page ((void *)vaddr);
+        /* in 2.4 ^ just walks the page tables */
+#endif
+        else
+                page = virt_to_page (vaddr);
+
+        if (!VALID_PAGE (page))
+                page = NULL;
+
+        return page;
+}
+
+static void
+kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset,
+                 unsigned long len, int active)
+{
+        kib_rdma_msg_t *ibrm = &tx->tx_msg->ibm_u.rdma;
+        kib_rdma_desc_t *desc;
+        vv_l_key_t l_key;
+        vv_r_key_t r_key;
+        void *addr;
+        vv_mem_reg_h_t mem_h;
+        vv_return_t retval;
+
+        LASSERTF(ibrm->ibrm_num_descs < PTL_MD_MAX_IOV, "%u\n", 
+                 ibrm->ibrm_num_descs);
+
+        desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs];
+
+        addr = page_address(page) + page_offset;
+
+        /* TODO: This next step is only needed to get either the lkey
+         * or the rkey.  However, they should be the same as for the
+         * tx buffer, so we might as well use it. */
+        retval = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
+                                      addr,
+                                      len,
+                                      &mem_h,
+                                      &l_key,
+                                      &r_key);
+        if (retval) {
+                CERROR("vv_get_gen_mr_attrib failed: %d", retval);
+                /* TODO: this shouldn't really fail, but what if? */
+                return;
+        }
+
+        if (active) {
+                ibrm->rd_key = l_key;
+        } else {
+                ibrm->rd_key = r_key;
+
+                vv_va2advertise_addr(kibnal_data.kib_hca, addr, &addr);
+        }
+
+        desc->rd_addr = (__u64)(unsigned long)addr;
+        desc->rd_nob = len; /*PAGE_SIZE - kiov->kiov_offset; */
+
+        ibrm->ibrm_num_descs++;
+}
+
+static int
+kibnal_map_rdma_iov(kib_tx_t *tx, unsigned long vaddr, int nob, int active)
+{
+        struct page *page;
+        int page_offset, len;
+
+        while (nob > 0) {
+                page = kibnal_kvaddr_to_page(vaddr);
+                if (page == NULL)
+                        return -EFAULT;
+
+                page_offset = vaddr & (PAGE_SIZE - 1);
+                len = min(nob, (int)PAGE_SIZE - page_offset);
+                
+                kibnal_fill_ibrm(tx, page, page_offset, len, active);
+                nob -= len;
+                vaddr += len;
+        }
+
+        return 0;
+}
+
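+/* Map a single contiguous iov fragment for RDMA: register it with the HCA,
+ * or in whole-mem mode just build per-page RDMA descriptors in the
+ * message. */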
+static int
+kibnal_map_iov (kib_tx_t *tx, vv_access_con_bit_mask_t access,
+                 int niov, struct iovec *iov, int offset, int nob, int active)
+                 
+{
+        void   *vaddr;
+        vv_return_t retval;
+
+        LASSERT (nob > 0);
+        LASSERT (niov > 0);
+        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+
+        while (offset >= iov->iov_len) {
+                offset -= iov->iov_len;
+                niov--;
+                iov++;
+                LASSERT (niov > 0);
+        }
+
+        if (nob > iov->iov_len - offset) {
+                CERROR ("Can't map multiple vaddr fragments\n");
+                return (-EMSGSIZE);
+        }
+
+        /* our large contiguous iov could be backed by multiple physical
+         * pages. */
+        if (kibnal_whole_mem()) {
+                int rc;
+                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
+                rc = kibnal_map_rdma_iov(tx, (unsigned long)iov->iov_base + 
+                                         offset, nob, active);
+                if (rc != 0) {
+                        CERROR ("Can't map iov: %d\n", rc);
+                        return rc;
+                }
+                return 0;
+        }
+
+        vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
+        tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
+
+        retval = vv_mem_region_register(kibnal_data.kib_hca, vaddr, nob,
+                                   kibnal_data.kib_pd, access,
+                                   &tx->tx_md.md_handle, &tx->tx_md.md_lkey,
+                                   &tx->tx_md.md_rkey);
+        if (retval != 0) {
+                CERROR ("Can't map vaddr %p: %d\n", vaddr, retval);
+                return -EINVAL;
+        }
+
+        tx->tx_mapped = KIB_TX_MAPPED;
+        return (0);
+}
+
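+/* Map a page vector (kiov) for RDMA.  The pages must form one contiguous
+ * region (no gaps); they are either registered as a physical region with
+ * the HCA or, in whole-mem mode, described directly by RDMA descriptors
+ * in the message. */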
+static int
+kibnal_map_kiov (kib_tx_t *tx, vv_access_con_bit_mask_t access,
+                  int nkiov, ptl_kiov_t *kiov,
+                  int offset, int nob, int active)
+{
+        vv_phy_list_t  phys_pages;
+        vv_phy_buf_t  *phys_buf = NULL;
+        int            page_offset;
+        int            nphys;
+        int            resid;
+        int            phys_size = 0;
+        int            i, rc = 0;
+        vv_return_t    retval;
+
+        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+
+        LASSERT (nob > 0);
+        LASSERT (nkiov > 0);
+        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+
+        while (offset >= kiov->kiov_len) {
+                offset -= kiov->kiov_len;
+                nkiov--;
+                kiov++;
+                LASSERT (nkiov > 0);
+        }
+
+        page_offset = kiov->kiov_offset + offset;
+        nphys = 1;
+
+        if (!kibnal_whole_mem()) {
+                phys_size = nkiov * sizeof(vv_phy_buf_t);
+                PORTAL_ALLOC(phys_buf, phys_size);
+
+                if (phys_buf == NULL) {
+                        CERROR ("Can't allocate phys_buf\n");
+                        return (-ENOMEM);
+                }
+
+                phys_buf[0].start = kibnal_page2phys(kiov->kiov_page);
+                phys_buf[0].size = PAGE_SIZE;
+
+        } else {
+                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
+                kibnal_fill_ibrm(tx, kiov->kiov_page, kiov->kiov_offset, 
+                                 kiov->kiov_len, active);
+        }
+
+        resid = nob - (kiov->kiov_len - offset);
+
+        while (resid > 0) {
+                kiov++;
+                nkiov--;
+                LASSERT (nkiov > 0);
+
+                if (kiov->kiov_offset != 0 ||
+                    ((resid > PAGE_SIZE) && 
+                     kiov->kiov_len < PAGE_SIZE)) {
+                        /* Can't have gaps */
+                        CERROR ("Can't make payload contiguous in I/O VM:"
+                                "page %d, offset %d, len %d \n", nphys, 
+                                kiov->kiov_offset, kiov->kiov_len);
+
+                        for (i = -nphys; i < nkiov; i++) 
+                        {
+                                CERROR("kiov[%d] %p +%d for %d\n",
+                                       i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len);
+                        }
+                        
+                        rc = -EINVAL;
+                        goto out;
+                }
+
+                if (nphys == PTL_MD_MAX_IOV) {
+                        CERROR ("payload too big (%d)\n", nphys);
+                        rc = -EMSGSIZE;
+                        goto out;
+                }
+
+                if (!kibnal_whole_mem()) {
+                        LASSERT (nphys * sizeof (vv_phy_buf_t) < phys_size);
+                        phys_buf[nphys].start = kibnal_page2phys(kiov->kiov_page);
+                        phys_buf[nphys].size = PAGE_SIZE;
+
+                } else {
+                        if (kib_rdma_msg_len(nphys) > IBNAL_MSG_SIZE) {
+                                CERROR ("payload too big (%d)\n", nphys);
+                                rc = -EMSGSIZE;
+                                goto out;
+                        }
+                        kibnal_fill_ibrm(tx, kiov->kiov_page, 
+                                         kiov->kiov_offset, kiov->kiov_len,
+                                         active);
+                }
+
+                nphys ++;
+                resid -= PAGE_SIZE;
+        }
+
+        if (kibnal_whole_mem())
+                goto out;
+
+#if 0
+        CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset);
+        for (i = 0; i < nphys; i++)
+                CWARN ("   [%d] "LPX64"\n", i, phys[i]);
+#endif
+
+#if IBNAL_FMR
+#error "vibnal hasn't learned about FMR yet"
+        rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
+                                       phys_pages, nphys,
+                                       &tx->tx_md.md_addr,
+                                       page_offset,
+                                       &tx->tx_md.md_handle.fmr,
+                                       &tx->tx_md.md_lkey,
+                                       &tx->tx_md.md_rkey);
+#else
+        retval = vv_phy_mem_region_register(kibnal_data.kib_hca,
+                                            &phys_pages,
+                                            IBNAL_RDMA_BASE,
+                                            nphys,
+                                            0,          /* offset */
+                                            kibnal_data.kib_pd,
+                                            vv_acc_l_mem_write | vv_acc_r_mem_write | vv_acc_r_mem_read | vv_acc_mem_bind, /* TODO: translated as-is, but seems incorrect or too much */
+                                            &tx->tx_md.md_handle,
+                                            &tx->tx_md.md_addr,
+                                            &tx->tx_md.md_lkey,
+                                            &tx->tx_md.md_rkey);
+#endif
+        if (retval == vv_return_ok) {
+                CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n",
+                       nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey);
+#if IBNAL_FMR
+                tx->tx_mapped = KIB_TX_MAPPED_FMR;
+#else
+                tx->tx_mapped = KIB_TX_MAPPED;
+#endif
+        } else {
+                CERROR ("Can't map phys_pages: %d\n", retval);
+                rc = -EFAULT;
+        }
+
+ out:
+        if (phys_buf != NULL)
+                PORTAL_FREE(phys_buf, phys_size);
+
+        return (rc);
+}
+
+static kib_conn_t *
+kibnal_find_conn_locked (kib_peer_t *peer)
+{
+        struct list_head *tmp;
+
+        /* just return the first connection */
+        list_for_each (tmp, &peer->ibp_conns) {
+                return (list_entry(tmp, kib_conn_t, ibc_list));
+        }
+
+        return (NULL);
+}
+
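+/* Push queued sends out on 'conn' as far as the peer's credits allow.
+ * The last credit is reserved for returning credits, and a NOOP is queued
+ * when outstanding credits reach IBNAL_CREDIT_HIGHWATER and nothing else
+ * is waiting to carry them back. */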
+void
+kibnal_check_sends (kib_conn_t *conn)
+{
+        unsigned long   flags;
+        kib_tx_t       *tx;
+        int             rc;
+        int             i;
+        int             done;
+        int             nwork;
+
+        ENTRY;
+
+        spin_lock_irqsave (&conn->ibc_lock, flags);
+
+        LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
+
+        if (list_empty(&conn->ibc_tx_queue) &&
+            conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
+                spin_unlock_irqrestore(&conn->ibc_lock, flags);
+                
+                tx = kibnal_get_idle_tx(0);     /* don't block */
+                if (tx != NULL)
+                        kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
+
+                spin_lock_irqsave(&conn->ibc_lock, flags);
+                
+                if (tx != NULL) {
+                        atomic_inc(&conn->ibc_refcount);
+                        kibnal_queue_tx_locked(tx, conn);
+                }
+        }
+
+        while (!list_empty (&conn->ibc_tx_queue)) {
+                tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
+
+                /* We rely on this for QP sizing */
+                LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= IBNAL_TX_MAX_SG);
+
+                LASSERT (conn->ibc_outstanding_credits >= 0);
+                LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
+                LASSERT (conn->ibc_credits >= 0);
+                LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
+
+                /* Not on ibc_rdma_queue */
+                LASSERT (!tx->tx_passive_rdma_wait);
+
+                if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
+                        GOTO(out, 0);
+
+                if (conn->ibc_credits == 0)     /* no credits */
+                        GOTO(out, 1);
+                
+                if (conn->ibc_credits == 1 &&   /* last credit reserved for */
+                    conn->ibc_outstanding_credits == 0) /* giving back credits */
+                        GOTO(out, 2);
+
+                list_del (&tx->tx_list);
+
+                if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
+                    (!list_empty(&conn->ibc_tx_queue) ||
+                     conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
+                        /* redundant NOOP */
+                        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+                        kibnal_tx_done(tx);
+                        spin_lock_irqsave(&conn->ibc_lock, flags);
+                        continue;
+                }
+
+                tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits;
+                conn->ibc_outstanding_credits = 0;
+
+                conn->ibc_nsends_posted++;
+                conn->ibc_credits--;
+
+                /* we only get a tx completion for the final rdma op */ 
+                tx->tx_sending = 0;
+                tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
+                list_add (&tx->tx_list, &conn->ibc_active_txs);
+#if IBNAL_CKSUM
+                tx->tx_msg->ibm_cksum = 0;
+                tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob);
+                CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob);
+#endif
+                /* NB the gap between removing tx from the queue and sending it
+                 * allows message re-ordering to occur */
+
+                LASSERT (tx->tx_nsp > 0);
+
+                rc = -ECONNABORTED;
+                nwork = 0;
+                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+                        vv_return_t retval;                        
+
+                        tx->tx_status = 0;
+                        rc = 0;
+
+                        retval = vv_post_send_list(kibnal_data.kib_hca, conn->ibc_qp, tx->tx_nsp, tx->tx_wrq, vv_operation_type_send_rc);
+
+                        if (retval != 0) {
+                                /* fall through to the error path below so
+                                 * the tx and its credits get cleaned up */
+                                CERROR("post send failed with %d\n", retval);
+                                rc = -ECONNABORTED;
+                        } else {
+                                tx->tx_sending = tx->tx_nsp;
+                        }
+                }
+
+                if (rc != 0) {
+                        /* NB credits are transferred in the actual
+                         * message, which can only be the last work item */
+                        conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
+                        conn->ibc_credits++;
+                        conn->ibc_nsends_posted--;
+
+                        tx->tx_status = rc;
+                        tx->tx_passive_rdma_wait = 0;
+
+                        /* if the post failed, tx_sending is still 0 */
+                        done = (tx->tx_sending == 0);
+                        if (done)
+                                list_del (&tx->tx_list);
+                        
+                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        
+                        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
+                                CERROR ("Error %d posting transmit to "LPX64"\n", 
+                                        rc, conn->ibc_peer->ibp_nid);
+                        else
+                                CDEBUG (D_NET, "Error %d posting transmit to "
+                                        LPX64"\n", rc, conn->ibc_peer->ibp_nid);
+
+                        kibnal_close_conn (conn, rc);
+
+                        if (done)
+                                kibnal_tx_done (tx);
+                        return;
+                }
+                
+        }
+
+        EXIT;
+out:
+        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+}
+
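+/* Send completion handler.  The completion that takes tx_sending to zero
+ * on a tx that isn't waiting for a passive RDMA makes the tx idle; whoever
+ * makes it idle frees it and drops its connection ref. */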
+static void
+kibnal_tx_callback (vv_wc_t *wc)
+{
+        kib_tx_t     *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->wr_id);
+        kib_conn_t   *conn;
+        unsigned long flags;
+        int           idle;
+
+        conn = tx->tx_conn;
+        LASSERT (conn != NULL);
+        LASSERT (tx->tx_sending != 0);
+
+        CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx,
+               tx->tx_sending, tx->tx_nsp, wc->completion_status);
+
+        spin_lock_irqsave(&conn->ibc_lock, flags);
+
+        /* I could be racing with rdma completion.  Whoever makes 'tx' idle
+         * gets to free it, which also drops its ref on 'conn'.  If it's
+         * not me, then I take an extra ref on conn so it can't disappear
+         * under me. */
+
+        tx->tx_sending--;
+        idle = (tx->tx_sending == 0) &&         /* This is the final callback */
+                (!tx->tx_passive_rdma_wait);     /* Not waiting for RDMA completion */
+        if (idle)
+                list_del(&tx->tx_list);
+
+        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+               atomic_read (&conn->ibc_refcount));
+        atomic_inc (&conn->ibc_refcount);
+
+        if (tx->tx_sending == 0)
+                conn->ibc_nsends_posted--;
+
+        if (wc->completion_status != vv_comp_status_success &&
+            tx->tx_status == 0)
+                tx->tx_status = -ECONNABORTED;
+
+        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+        if (idle)
+                kibnal_tx_done (tx);
+
+        if (wc->completion_status != vv_comp_status_success) {
+                CERROR ("Tx completion to "LPX64" failed: %d\n", 
+                        conn->ibc_peer->ibp_nid, wc->completion_status);
+                kibnal_close_conn (conn, -ENETDOWN);
+        } else {
+                /* can I shovel some more sends out the door? */
+                kibnal_check_sends(conn);
+        }
+
+        kibnal_put_conn (conn);
+}
+
+void 
+kibnal_ca_async_callback(vv_event_record_t ev)
+{
+        /* XXX flesh out.  this seems largely for async errors */
+        CERROR("type: %d, port: %d, data: "LPX64"\n", ev.event_type, ev.port_num, ev.type.data);
+}
+
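+/* Completion queue callback: drain the CQ, re-arm it, then poll once more
+ * so a completion that slipped in between the last poll and the re-arm
+ * isn't missed. */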
+void
+kibnal_ca_callback (unsigned long unused_context)
+{
+        vv_wc_t wc;
+        int armed = 0;
+        vv_return_t retval;
+
+        for(;;) {
+
+                while (vv_poll_for_completion(kibnal_data.kib_hca, kibnal_data.kib_cq, &wc) == vv_return_ok) {
+
+                        /* We will need to rearm the CQ to avoid a potential race. */
+                        armed = 0;
+
+                        if (kibnal_wreqid_is_rx(wc.wr_id))
+                                kibnal_rx_callback(&wc);
+                        else
+                                kibnal_tx_callback(&wc);
+                }
+
+                if (armed)
+                        return;
+                
+                retval = vv_request_completion_notification(kibnal_data.kib_hca, kibnal_data.kib_cq, vv_next_solicit_unsolicit_event);
+                if (retval != 0) {
+                        CERROR ("Failed to re-arm completion queue: %d\n", retval);
+                        return;
+                }
+
+                armed = 1;
+        }
+}
+
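+/* Add a 'send' work request for the message header (plus body_nob bytes of
+ * message body) to the tx.  A PUT_DONE that follows RDMA reads gets the
+ * fence bit so the completion can't overtake the data. */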
+void
+kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
+{
+        vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nsp];
+        vv_wr_t      *wrq = &tx->tx_wrq[tx->tx_nsp];
+        int           fence;
+        int           nob = offsetof (kib_msg_t, ibm_u) + body_nob;
+
+        LASSERT (tx->tx_nsp >= 0 && 
+                 tx->tx_nsp < sizeof(tx->tx_wrq)/sizeof(tx->tx_wrq[0]));
+        LASSERT (nob <= IBNAL_MSG_SIZE);
+        
+        tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC;
+        tx->tx_msg->ibm_version = IBNAL_MSG_VERSION;
+        tx->tx_msg->ibm_type = type;
+#if IBNAL_CKSUM
+        tx->tx_msg->ibm_nob = nob;
+#endif
+        /* Fence the message if it's bundled with an RDMA read */
+        fence = (tx->tx_nsp > 0) &&
+                (type == IBNAL_MSG_PUT_DONE);
+
+        *gl = (vv_scatgat_t) {
+                .v_address = (void *)tx->tx_msg,
+                .length    = nob,
+                .l_key     = tx->l_key,
+        };
+
+        wrq->wr_id =  kibnal_ptr2wreqid(tx, 0);
+        wrq->completion_notification = 1;
+        wrq->scatgat_list = gl;
+        wrq->num_of_data_segments = 1;
+        wrq->wr_type = vv_wr_send;
+
+        wrq->type.send.solicited_event = 1;
+
+        wrq->type.send.send_qp_type.rc_type.fance_indicator = fence;
+
+        tx->tx_nsp++;
+}
+
+static void
+kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+{
+        unsigned long         flags;
+
+        spin_lock_irqsave(&conn->ibc_lock, flags);
+
+        kibnal_queue_tx_locked (tx, conn);
+        
+        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+        
+        kibnal_check_sends(conn);
+}
+
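+/* Commit to sending 'tx' to 'nid': queue it on an existing connection if
+ * there is one, otherwise queue it on the peer and ask the connection
+ * daemon to connect.  Any failure completes the tx with an error. */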
+static void
+kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
+{
+        unsigned long    flags;
+        kib_peer_t      *peer;
+        kib_conn_t      *conn;
+        rwlock_t        *g_lock = &kibnal_data.kib_global_lock;
+
+        /* If I get here, I've committed to send, so I complete the tx with
+         * failure on any problems */
+        
+        LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
+        LASSERT (tx->tx_nsp > 0);               /* work items have been set up */
+
+        read_lock (g_lock);
+        
+        peer = kibnal_find_peer_locked (nid);
+        if (peer == NULL) {
+                read_unlock (g_lock);
+                tx->tx_status = -EHOSTUNREACH;
+                kibnal_tx_done (tx);
+                return;
+        }
+
+        conn = kibnal_find_conn_locked (peer);
+        if (conn != NULL) {
+                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                       atomic_read (&conn->ibc_refcount));
+                atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
+                read_unlock (g_lock);
+                
+                kibnal_queue_tx (tx, conn);
+                return;
+        }
+        
+        /* Making one or more connections; I'll need a write lock... */
+        read_unlock (g_lock);
+        write_lock_irqsave (g_lock, flags);
+
+        peer = kibnal_find_peer_locked (nid);
+        if (peer == NULL) {
+                write_unlock_irqrestore (g_lock, flags);
+                tx->tx_status = -EHOSTUNREACH;
+                kibnal_tx_done (tx);
+                return;
+        }
+
+        conn = kibnal_find_conn_locked (peer);
+        if (conn != NULL) {
+                /* Connection exists; queue message on it */
+                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                       atomic_read (&conn->ibc_refcount));
+                atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
+                write_unlock_irqrestore (g_lock, flags);
+                
+                kibnal_queue_tx (tx, conn);
+                return;
+        }
+
+        if (peer->ibp_connecting == 0) {
+                if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
+                        write_unlock_irqrestore (g_lock, flags);
+                        tx->tx_status = -EHOSTUNREACH;
+                        kibnal_tx_done (tx);
+                        return;
+                }
+        
+                peer->ibp_connecting = 1;
+
+                kib_peer_addref(peer); /* extra ref for connd */
+        
+                spin_lock (&kibnal_data.kib_connd_lock);
+        
+                list_add_tail (&peer->ibp_connd_list,
+                               &kibnal_data.kib_connd_peers);
+                wake_up (&kibnal_data.kib_connd_waitq);
+        
+                spin_unlock (&kibnal_data.kib_connd_lock);
+        }
+        
+        /* A connection is being established; queue the message... */
+        list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
+
+        write_unlock_irqrestore (g_lock, flags);
+}
+
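+/* Passive RDMA: map and advertise the local buffer in a PUT_RDMA or
+ * GET_RDMA message; the peer does the actual data movement and sends back
+ * a ..._DONE carrying our cookie, which completes the tx. */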
+static ptl_err_t
+kibnal_start_passive_rdma (int type, ptl_nid_t nid,
+                            lib_msg_t *libmsg, ptl_hdr_t *hdr)
+{
+        int         nob = libmsg->md->length;
+        kib_tx_t   *tx;
+        kib_msg_t  *ibmsg;
+        int         rc;
+        vv_access_con_bit_mask_t access;
+        
+        LASSERT (type == IBNAL_MSG_PUT_RDMA || type == IBNAL_MSG_GET_RDMA);
+        LASSERT (nob > 0);
+        LASSERT (!in_interrupt());              /* Mapping could block */
+
+        access = vv_acc_l_mem_write | vv_acc_r_mem_write | vv_acc_r_mem_read | vv_acc_mem_bind;
+
+        tx = kibnal_get_idle_tx (1);           /* May block; caller is an app thread */
+        LASSERT (tx != NULL);
+
+        if ((libmsg->md->options & PTL_MD_KIOV) == 0) 
+                rc = kibnal_map_iov (tx, access,
+                                     libmsg->md->md_niov,
+                                     libmsg->md->md_iov.iov,
+                                     0, nob, 0);
+        else
+                rc = kibnal_map_kiov (tx, access,
+                                      libmsg->md->md_niov, 
+                                      libmsg->md->md_iov.kiov,
+                                      0, nob, 0);
+
+        if (rc != 0) {
+                CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc);
+                goto failed;
+        }
+        
+        if (type == IBNAL_MSG_GET_RDMA) {
+                /* reply gets finalized when tx completes */
+                tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, 
+                                                        nid, libmsg);
+                if (tx->tx_libmsg[1] == NULL) {
+                        CERROR ("Can't create reply for GET -> "LPX64"\n",
+                                nid);
+                        rc = -ENOMEM;
+                        goto failed;
+                }
+        }
+        
+        tx->tx_passive_rdma = 1;
+
+        ibmsg = tx->tx_msg;
+
+        ibmsg->ibm_u.rdma.ibrm_hdr = *hdr;
+        ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
+        /* map_kiov already filled in the rdma descs for the whole_mem case */
+        if (!kibnal_whole_mem()) {
+                ibmsg->ibm_u.rdma.rd_key = tx->tx_md.md_rkey;
+                ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
+                ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
+                ibmsg->ibm_u.rdma.ibrm_num_descs = 1;
+        }
+
+        kibnal_init_tx_msg (tx, type, 
+                            kib_rdma_msg_len(ibmsg->ibm_u.rdma.ibrm_num_descs));
+
+        CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
+               LPX64", nob %d\n",
+               tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey,
+               tx->tx_md.md_addr, nob);
+        
+        /* libmsg gets finalized when tx completes. */
+        tx->tx_libmsg[0] = libmsg;
+
+        kibnal_launch_tx(tx, nid);
+        return (PTL_OK);
+
+ failed:
+        tx->tx_status = rc;
+        kibnal_tx_done (tx);
+        return (PTL_FAIL);
+}
+
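+/* Active RDMA: map the local buffer and post one RDMA read/write work
+ * request per descriptor advertised by the peer, followed by the
+ * GET_DONE/PUT_DONE completion message on the same tx. */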
+void
+kibnal_start_active_rdma (int type, int status,
+                           kib_rx_t *rx, lib_msg_t *libmsg, 
+                           unsigned int niov,
+                           struct iovec *iov, ptl_kiov_t *kiov,
+                           size_t offset, size_t nob)
+{
+        kib_msg_t    *rxmsg = rx->rx_msg;
+        kib_msg_t    *txmsg;
+        kib_tx_t     *tx;
+        vv_access_con_bit_mask_t access;
+        vv_wr_operation_t rdma_op;
+        int           rc;
+        __u32         i;
+
+        CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n",
+               type, status, niov, offset, nob);
+
+        /* Called by scheduler */
+        LASSERT (!in_interrupt ());
+
+        /* Either all pages or all vaddrs */
+        LASSERT (!(kiov != NULL && iov != NULL));
+
+        /* No data if we're completing with failure */
+        LASSERT (status == 0 || nob == 0);
+
+        LASSERT (type == IBNAL_MSG_GET_DONE ||
+                 type == IBNAL_MSG_PUT_DONE);
+
+        /* Flag I'm completing the RDMA.  Even if I fail to send the
+         * completion message, I will have tried my best so further
+         * attempts shouldn't be tried. */
+        LASSERT (!rx->rx_rdma);
+        rx->rx_rdma = 1;
+
+        if (type == IBNAL_MSG_GET_DONE) {
+                access = 0;
+                rdma_op  = vv_wr_rdma_write;
+                LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);
+        } else {
+                access = vv_acc_l_mem_write;
+                rdma_op  = vv_wr_rdma_read;
+                LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
+        }
+
+        tx = kibnal_get_idle_tx (0);           /* Mustn't block */
+        if (tx == NULL) {
+                CERROR ("tx descs exhausted on RDMA from "LPX64
+                        " completing locally with failure\n",
+                        rx->rx_conn->ibc_peer->ibp_nid);
+                lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE);
+                return;
+        }
+        LASSERT (tx->tx_nsp == 0);
+
+        if (nob == 0) 
+                GOTO(init_tx, 0);
+
+        /* We actually need to transfer some data (the transfer
+         * size could get truncated to zero when the incoming
+         * message is matched) */
+        if (kiov != NULL)
+                rc = kibnal_map_kiov (tx, access, niov, kiov, offset, nob, 1);
+        else
+                rc = kibnal_map_iov (tx, access, niov, iov, offset, nob, 1);
+        
+        if (rc != 0) {
+                CERROR ("Can't map RDMA -> "LPX64": %d\n", 
+                        rx->rx_conn->ibc_peer->ibp_nid, rc);
+                /* We'll skip the RDMA and complete with failure. */
+                status = rc;
+                nob = 0;
+                GOTO(init_tx, rc);
+        } 
+
+        if (!kibnal_whole_mem()) {
+                tx->tx_msg->ibm_u.rdma.rd_key = tx->tx_md.md_lkey;
+                tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
+                tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
+                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1;
+        }
+
+        /* XXX ugh.  different page-sized hosts. */ 
+        if (tx->tx_msg->ibm_u.rdma.ibrm_num_descs !=
+            rxmsg->ibm_u.rdma.ibrm_num_descs) {
+                CERROR("tx descs (%u) != rx descs (%u)\n", 
+                       tx->tx_msg->ibm_u.rdma.ibrm_num_descs,
+                       rxmsg->ibm_u.rdma.ibrm_num_descs);
+                /* We'll skip the RDMA and complete with failure. */
+                status = rc;
+                nob = 0;
+                GOTO(init_tx, rc);
+        }
+
+        /* map_kiov filled in the rdma descs which describe our side of the
+         * rdma transfer. */
+        /* ibrm_num_descs was verified in rx_callback */
+        for(i = 0; i < rxmsg->ibm_u.rdma.ibrm_num_descs; i++) {
+                kib_rdma_desc_t *ldesc, *rdesc; /* local, remote */
+                vv_scatgat_t *ds = &tx->tx_gl[i];
+                vv_wr_t *wrq = &tx->tx_wrq[i];
+
+                ldesc = &tx->tx_msg->ibm_u.rdma.ibrm_desc[i];
+                rdesc = &rxmsg->ibm_u.rdma.ibrm_desc[i];
+
+                ds->v_address = (void *)(unsigned long)ldesc->rd_addr;
+                ds->length    = ldesc->rd_nob;
+                ds->l_key     = tx->tx_msg->ibm_u.rdma.rd_key;
+
+                wrq->wr_id = kibnal_ptr2wreqid(tx, 0);
+
+#if 0
+                /* only the last rdma post triggers tx completion */
+                if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1)
+                        wrq->completion_notification = 1;
+                else
+                        wrq->completion_notification = 0;
+
+#else
+                /* TODO: hack.  Right now request completion on everything,
+                 * else the driver will deadlock.  This is less efficient
+                 * than requesting a notification for only a few of the
+                 * WQEs. */
+                wrq->completion_notification = 1;
+#endif
+
+                wrq->scatgat_list = ds;
+                wrq->num_of_data_segments = 1;
+                wrq->wr_type = rdma_op;
+
+                wrq->type.send.solicited_event = 0;
+
+                wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
+                wrq->type.send.send_qp_type.rc_type.r_addr = rdesc->rd_addr;
+                wrq->type.send.send_qp_type.rc_type.r_r_key = rxmsg->ibm_u.rdma.rd_key;
+
+                CDEBUG(D_NET, "prepared RDMA with r_addr=%llx r_key=%x\n",
+                       wrq->type.send.send_qp_type.rc_type.r_addr,
+                       wrq->type.send.send_qp_type.rc_type.r_r_key);
+
+                tx->tx_nsp++;
+        }
+
+init_tx:
+        txmsg = tx->tx_msg;
+
+        txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
+        txmsg->ibm_u.completion.ibcm_status = status;
+        
+        kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
+
+        if (status == 0 && nob != 0) {
+                LASSERT (tx->tx_nsp > 1);
+                /* RDMA: libmsg gets finalized when the tx completes.  This
+                 * is after the completion message has been sent, which in
+                 * turn is after the RDMA has finished. */
+                tx->tx_libmsg[0] = libmsg;
+        } else {
+                LASSERT (tx->tx_nsp == 1);
+                /* No RDMA: local completion happens now! */
+                CDEBUG(D_WARNING,"No data: immediate completion\n");
+                lib_finalize (&kibnal_lib, NULL, libmsg,
+                              status == 0 ? PTL_OK : PTL_FAIL);
+        }
+
+        /* +1 ref for this tx... */
+        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+               rx->rx_conn, rx->rx_conn->ibc_state, 
+               rx->rx_conn->ibc_peer->ibp_nid,
+               atomic_read (&rx->rx_conn->ibc_refcount));
+        atomic_inc (&rx->rx_conn->ibc_refcount);
+        /* ...and queue it up */
+        kibnal_queue_tx(tx, rx->rx_conn);
+}
+
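+/* Common send path.  Payloads that fit are copied into an IMMEDIATE
+ * message; large PUTs, and GETs whose REPLY would be too big, switch to
+ * passive RDMA instead. */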
+static ptl_err_t
+kibnal_sendmsg(lib_nal_t    *nal, 
+                void         *private,
+                lib_msg_t    *libmsg,
+                ptl_hdr_t    *hdr, 
+                int           type, 
+                ptl_nid_t     nid, 
+                ptl_pid_t     pid,
+                unsigned int  payload_niov, 
+                struct iovec *payload_iov, 
+                ptl_kiov_t   *payload_kiov,
+                size_t        payload_offset,
+                size_t        payload_nob)
+{
+        kib_msg_t  *ibmsg;
+        kib_tx_t   *tx;
+        int         nob;
+
+        /* NB 'private' is different depending on what we're sending.... */
+
+        CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64
+               " pid %d\n", payload_nob, payload_niov, nid , pid);
+
+        LASSERT (payload_nob == 0 || payload_niov > 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+        /* Thread context if we're sending payload */
+        LASSERT (!in_interrupt() || payload_niov == 0);
+        /* payload is either all vaddrs or all pages */
+        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+
+        switch (type) {
+        default:
+                LBUG();
+                return (PTL_FAIL);
+                
+        case PTL_MSG_REPLY: {
+                /* reply's 'private' is the incoming receive */
+                kib_rx_t *rx = private;
+
+                /* RDMA reply expected? */
+                if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
+                        kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
+                                                 rx, libmsg, payload_niov, 
+                                                 payload_iov, payload_kiov,
+                                                 payload_offset, payload_nob);
+                        return (PTL_OK);
+                }
+                
+                /* Incoming message consistent with immediate reply? */
+                if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
+                        CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n",
+                                nid, rx->rx_msg->ibm_type);
+                        return (PTL_FAIL);
+                }
+
+                /* Will it fit in a message? */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+                if (nob > IBNAL_MSG_SIZE) {
+                        CERROR("REPLY for "LPX64" too big (RDMA not requested): %d (max for message is %d)\n", 
+                               nid, payload_nob, IBNAL_MSG_SIZE);
+                        return (PTL_FAIL);
+                }
+                break;
+        }
+
+        case PTL_MSG_GET:
+                /* might the REPLY message be big enough to need RDMA? */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
+                if (nob > IBNAL_MSG_SIZE)
+                        return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, 
+                                                          nid, libmsg, hdr));
+                break;
+
+        case PTL_MSG_ACK:
+                LASSERT (payload_nob == 0);
+                break;
+
+        case PTL_MSG_PUT:
+                /* Is the payload big enough to need RDMA? */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+                if (nob > IBNAL_MSG_SIZE)
+                        return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
+                                                          nid, libmsg, hdr));
+                
+                break;
+        }
+
+        tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
+                                  type == PTL_MSG_REPLY ||
+                                  in_interrupt()));
+        if (tx == NULL) {
+                CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", 
+                        type, nid, in_interrupt() ? " (intr)" : "");
+                return (PTL_NO_SPACE);
+        }
+
+        ibmsg = tx->tx_msg;
+        ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
+
+        if (payload_nob > 0) {
+                if (payload_kiov != NULL)
+                        lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
+                                          payload_niov, payload_kiov,
+                                          payload_offset, payload_nob);
+                else
+                        lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
+                                         payload_niov, payload_iov,
+                                         payload_offset, payload_nob);
+        }
+
+        kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
+                            offsetof(kib_immediate_msg_t, 
+                                     ibim_payload[payload_nob]));
+
+        /* libmsg gets finalized when tx completes */
+        tx->tx_libmsg[0] = libmsg;
+
+        kibnal_launch_tx(tx, nid);
+        return (PTL_OK);
+}
+
+static ptl_err_t
+kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
+               ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+               unsigned int payload_niov, struct iovec *payload_iov,
+               size_t payload_offset, size_t payload_len)
+{
+        CDEBUG(D_NET, "  pid = %d, nid="LPU64"\n",
+               pid, nid);
+        return (kibnal_sendmsg(nal, private, cookie,
+                               hdr, type, nid, pid,
+                               payload_niov, payload_iov, NULL,
+                               payload_offset, payload_len));
+}
+
+static ptl_err_t
+kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
+                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                     unsigned int payload_niov, ptl_kiov_t *payload_kiov, 
+                     size_t payload_offset, size_t payload_len)
+{
+        return (kibnal_sendmsg(nal, private, cookie,
+                               hdr, type, nid, pid,
+                               payload_niov, NULL, payload_kiov,
+                               payload_offset, payload_len));
+}
+
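+/* Common receive path.  IMMEDIATE payloads are copied straight out of the
+ * receive buffer; PUT_RDMA sinks kick off an active RDMA; GET_RDMA gets
+ * called here only to discard any junk after the header. */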
+static ptl_err_t
+kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
+                 unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
+                 size_t offset, size_t mlen, size_t rlen)
+{
+        kib_rx_t    *rx = private;
+        kib_msg_t   *rxmsg = rx->rx_msg;
+        int          msg_nob;
+        
+        LASSERT (mlen <= rlen);
+        LASSERT (!in_interrupt ());
+        /* Either all pages or all vaddrs */
+        LASSERT (!(kiov != NULL && iov != NULL));
+
+        switch (rxmsg->ibm_type) {
+        default:
+                LBUG();
+                return (PTL_FAIL);
+                
+        case IBNAL_MSG_IMMEDIATE:
+                msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+                if (msg_nob > IBNAL_MSG_SIZE) {
+                        CERROR ("Immediate message from "LPX64" too big: %d\n",
+                                rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
+                        return (PTL_FAIL);
+                }
+
+                if (kiov != NULL)
+                        lib_copy_buf2kiov(niov, kiov, offset,
+                                          rxmsg->ibm_u.immediate.ibim_payload,
+                                          mlen);
+                else
+                        lib_copy_buf2iov(niov, iov, offset,
+                                         rxmsg->ibm_u.immediate.ibim_payload,
+                                         mlen);
+
+                lib_finalize (nal, NULL, libmsg, PTL_OK);
+                return (PTL_OK);
+
+        case IBNAL_MSG_GET_RDMA:
+                /* We get called here just to discard any junk after the
+                 * GET hdr. */
+                LASSERT (libmsg == NULL);
+                lib_finalize (nal, NULL, libmsg, PTL_OK);
+                return (PTL_OK);
+
+        case IBNAL_MSG_PUT_RDMA:
+                kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0,
+                                          rx, libmsg, 
+                                          niov, iov, kiov, offset, mlen);
+                return (PTL_OK);
+        }
+}
+
+static ptl_err_t
+kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
+              unsigned int niov, struct iovec *iov, 
+              size_t offset, size_t mlen, size_t rlen)
+{
+        return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
+                                offset, mlen, rlen));
+}
+
+static ptl_err_t
+kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
+                     unsigned int niov, ptl_kiov_t *kiov, 
+                     size_t offset, size_t mlen, size_t rlen)
+{
+        return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
+                                offset, mlen, rlen));
+}
+
+/*****************************************************************************
+ * The rest of this file concerns connection management.  Active connections
+ * start with kibnal_connect_peer(), passive connections start with
+ * kibnal_listen_callback().  Active disconnects start with
+ * kibnal_close_conn(); kibnal_cm_callback() starts passive disconnects and
+ * contains the guts of how the disconnect state machine progresses.
+ *****************************************************************************/
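+
+/* Rough sketch of the state progression (inferred from the states referenced
+ * in this file):
+ *
+ *   IBNAL_CONN_INIT_QP -> CONNECTING -> ESTABLISHED
+ *                            -> SEND_DREQ -> DREQ / DREP -> DISCONNECTED
+ *
+ * kibnal_connreq_done() takes CONNECTING to ESTABLISHED,
+ * kibnal_close_conn_locked() queues an open connection as SEND_DREQ for the
+ * connd, and the connd / CM callbacks walk the disconnect states until
+ * DISCONNECTED, when kibnal_flush_pending() and kibnal_destroy_conn() run. */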
+
+int
+kibnal_thread_start (int (*fn)(void *arg), void *arg)
+{
+        long    pid = kernel_thread (fn, arg, 0);
+
+        if (pid < 0)
+                return ((int)pid);
+
+        atomic_inc (&kibnal_data.kib_nthreads);
+        return (0);
+}
+
+static void
+kibnal_thread_fini (void)
+{
+        atomic_dec (&kibnal_data.kib_nthreads);
+}
+
+/* this can be called by anyone at any time to close a connection.  if
+ * the connection is still established it heads to the connd to start
+ * the disconnection in a safe context.  It has no effect if called
+ * on a connection that is already disconnecting */
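+/* (In this file it is reached via kibnal_close_conn() from kibnal_check_conns()
+ * on an RDMA timeout and from the CM disconnect-request path, and directly
+ * from kibnal_connreq_done() when a connect attempt fails.) */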
+void
+kibnal_close_conn_locked (kib_conn_t *conn, int error)
+{
+        /* This just does the immediate housekeeping, and schedules the
+         * connection for the connd to finish off.
+         * Caller holds kib_global_lock exclusively in irq context */
+        kib_peer_t   *peer = conn->ibc_peer;
+
+        KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_CONNECTING,
+                                    IBNAL_CONN_DISCONNECTED);
+
+        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
+                return; /* already disconnecting */
+
+        CDEBUG (error == 0 ? D_NET : D_ERROR,
+                "closing conn to "LPX64": error %d\n", peer->ibp_nid, error);
+
+        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+                /* kib_connd_conns takes ibc_list's ref */
+                list_del (&conn->ibc_list);
+        } else {
+                /* new ref for kib_connd_conns */
+                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                       atomic_read (&conn->ibc_refcount));
+                atomic_inc (&conn->ibc_refcount);
+        }
+        
+        if (list_empty (&peer->ibp_conns) &&
+            peer->ibp_persistence == 0) {
+                /* Non-persistent peer with no more conns... */
+                kibnal_unlink_peer_locked (peer);
+        }
+
+        conn->ibc_state = IBNAL_CONN_SEND_DREQ;
+
+        spin_lock (&kibnal_data.kib_connd_lock);
+
+        list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+        wake_up (&kibnal_data.kib_connd_waitq);
+                
+        spin_unlock (&kibnal_data.kib_connd_lock);
+}
+
+void
+kibnal_close_conn (kib_conn_t *conn, int error)
+{
+        unsigned long     flags;
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        kibnal_close_conn_locked (conn, error);
+        
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+}
+
+static void
+kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
+{
+        LIST_HEAD        (zombies);
+        kib_tx_t         *tx;
+        unsigned long     flags;
+
+        LASSERT (rc != 0);
+        LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        LASSERT (peer->ibp_connecting != 0);
+        peer->ibp_connecting--;
+        if (peer->ibp_connecting != 0) {
+                /* another connection attempt under way (loopback?)... */
+                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+                return;
+        }
+
+        if (list_empty(&peer->ibp_conns)) {
+                /* Say when active connection can be re-attempted */
+                peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
+                /* Increase reconnection interval */
+                peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
+                                                    IBNAL_MAX_RECONNECT_INTERVAL);
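+                /* i.e. exponential backoff: each failed attempt doubles the
+                 * reconnect interval until it is clamped at
+                 * IBNAL_MAX_RECONNECT_INTERVAL. */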
+        
+                /* Take the peer's blocked transmits; I'll complete
+                 * them with error */
+                while (!list_empty (&peer->ibp_tx_queue)) {
+                        tx = list_entry (peer->ibp_tx_queue.next,
+                                         kib_tx_t, tx_list);
+                        
+                        list_del (&tx->tx_list);
+                        list_add_tail (&tx->tx_list, &zombies);
+                }
+                
+                if (kibnal_peer_active(peer) &&
+                    (peer->ibp_persistence == 0)) {
+                        /* failed connection attempt on non-persistent peer */
+                        kibnal_unlink_peer_locked (peer);
+                }
+        } else {
+                /* Can't have blocked transmits if there are connections */
+                LASSERT (list_empty(&peer->ibp_tx_queue));
+        }
+        
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+        if (!list_empty (&zombies))
+                CERROR ("Deleting messages for "LPX64": connection failed\n",
+                        peer->ibp_nid);
+
+        while (!list_empty (&zombies)) {
+                tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+                list_del (&tx->tx_list);
+                /* complete now */
+                tx->tx_status = -EHOSTUNREACH;
+                kibnal_tx_done (tx);
+        }
+}
+
+static void
+kibnal_connreq_done (kib_conn_t *conn, int active, int status)
+{
+        int               state = conn->ibc_state;
+        kib_peer_t       *peer = conn->ibc_peer;
+        kib_tx_t         *tx;
+        unsigned long     flags;
+        int               i;
+
+        CDEBUG(D_NET, "Enter kibnal_connreq_done for conn=%p, active=%d, status=%d\n",
+               conn, active, status);
+
+        /* passive connection has no connreq & vice versa */
+        LASSERTF(!active == !(conn->ibc_connreq != NULL),
+                 "%d %p\n", active, conn->ibc_connreq);
+
+        if (active) {
+                PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
+                conn->ibc_connreq = NULL;
+        }
+
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        LASSERT (peer->ibp_connecting != 0);
+        
+        if (status == 0) {                         
+                /* connection established... */
+                KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_CONNECTING);
+                conn->ibc_state = IBNAL_CONN_ESTABLISHED;
+
+                if (!kibnal_peer_active(peer)) {
+                        /* ...but peer deleted meantime */
+                        status = -ECONNABORTED;
+                }
+        } else {
+                KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_INIT_QP,
+                                            IBNAL_CONN_CONNECTING);
+        }
+
+        if (status == 0) {
+                /* Everything worked! */
+
+                peer->ibp_connecting--;
+
+                /* +1 ref for ibc_list; caller(== CM)'s ref remains until
+                 * the IB_CM_IDLE callback */
+                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                       atomic_read (&conn->ibc_refcount));
+                atomic_inc (&conn->ibc_refcount);
+                list_add (&conn->ibc_list, &peer->ibp_conns);
+                
+                /* reset reconnect interval for next attempt */
+                peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+
+                /* post blocked sends to the new connection */
+                spin_lock (&conn->ibc_lock);
+                
+                while (!list_empty (&peer->ibp_tx_queue)) {
+                        tx = list_entry (peer->ibp_tx_queue.next, 
+                                         kib_tx_t, tx_list);
+                        
+                        list_del (&tx->tx_list);
+
+                        /* +1 ref for each tx */
+                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                               atomic_read (&conn->ibc_refcount));
+                        atomic_inc (&conn->ibc_refcount);
+                        kibnal_queue_tx_locked (tx, conn);
+                }
+                
+                spin_unlock (&conn->ibc_lock);
+
+                /* Nuke any dangling conns from a different peer instance... */
+                kibnal_close_stale_conns_locked (conn->ibc_peer,
+                                                 conn->ibc_incarnation);
+
+                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+                /* queue up all the receives */
+                for (i = 0; i < IBNAL_RX_MSGS; i++) {
+                        /* +1 ref for rx desc */
+                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+                               atomic_read (&conn->ibc_refcount));
+                        atomic_inc (&conn->ibc_refcount);
+
+                        CDEBUG(D_NET, "RX[%d] %p->%p\n",
+                               i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg);
+
+                        kibnal_post_rx (&conn->ibc_rxs[i], 0);
+                }
+
+                kibnal_check_sends (conn);
+                return;
+        }
+
+        /* connection failed */
+        if (state == IBNAL_CONN_CONNECTING) {
+                /* schedule for connd to close */
+                kibnal_close_conn_locked (conn, status);
+        } else {
+                /* Don't have a CM comm_id; just wait for refs to drain */
+                conn->ibc_state = IBNAL_CONN_DISCONNECTED;
+        } 
+
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+        kibnal_peer_connect_failed (conn->ibc_peer, active, status);
+
+        /* If we didn't establish the connection we don't have to pass
+         * through the disconnect protocol before dropping the CM ref */
+        if (state < IBNAL_CONN_CONNECTING) 
+                kibnal_put_conn (conn);
+}
+
+static int
+kibnal_accept (kib_conn_t **connp, cm_cep_handle_t *cep,
+                ptl_nid_t nid, __u64 incarnation, int queue_depth)
+{
+        kib_conn_t    *conn = kibnal_create_conn();
+        kib_peer_t    *peer;
+        kib_peer_t    *peer2;
+        unsigned long  flags;
+
+        if (conn == NULL)
+                return (-ENOMEM);
+
+        if (queue_depth != IBNAL_MSG_QUEUE_SIZE) {
+                CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n",
+                       nid, queue_depth, IBNAL_MSG_QUEUE_SIZE);
+                atomic_dec (&conn->ibc_refcount);
+                kibnal_destroy_conn(conn);
+                return (-EPROTO);
+        }
+        
+        /* assume 'nid' is a new peer */
+        peer = kibnal_create_peer (nid);
+        if (peer == NULL) {
+                CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, nid,
+                       atomic_read (&conn->ibc_refcount));
+                atomic_dec (&conn->ibc_refcount);
+                kibnal_destroy_conn(conn);
+                return (-ENOMEM);
+        }
+        
+        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+        peer2 = kibnal_find_peer_locked(nid);
+        if (peer2 == NULL) {
+                /* peer table takes my ref on peer */
+                list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid));
+        } else {
+                kib_peer_decref (peer);
+                peer = peer2;
+        }
+
+        kib_peer_addref(peer); /* +1 ref for conn */
+        peer->ibp_connecting++;
+
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+        conn->ibc_peer = peer;
+        conn->ibc_state = IBNAL_CONN_CONNECTING;
+        /* conn->ibc_cep is set when cm_accept is called */
+        conn->ibc_incarnation = incarnation;
+        conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+
+        *connp = conn;
+        return (0);
+}
+
+static void kibnal_move_qp_to_error(kib_conn_t *conn)
+{
+        vv_qp_attr_t qp_attr;
+        vv_return_t retval;
+
+        qp_attr.modify.qp_modify_into_state = vv_qp_state_error;
+        qp_attr.modify.vv_qp_attr_mask      = VV_QP_AT_STATE;
+        qp_attr.modify.qp_type              = vv_qp_type_r_conn;
+
+        retval = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &qp_attr, &conn->ibc_qp_attrs);
+        if (retval)
+                CERROR("couldn't move qp into error state, error %d\n", retval);
+}
+
+static void kibnal_flush_pending(kib_conn_t *conn)
+{
+        LIST_HEAD        (zombies); 
+        struct list_head *tmp;
+        struct list_head *nxt;
+        kib_tx_t         *tx;
+        unsigned long     flags;
+        int               done;
+
+        /* NB we wait until the connection has closed before completing
+         * outstanding passive RDMAs so we can be sure the network can't 
+         * touch the mapped memory any more. */
+        KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_DISCONNECTED);
+
+        /* set the QP to the error state so that we get flush callbacks
+         * on our posted receives which can then drop their conn refs */
+        kibnal_move_qp_to_error(conn);
+
+        spin_lock_irqsave (&conn->ibc_lock, flags);
+
+        /* grab passive RDMAs not waiting for the tx callback */
+        list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
+                tx = list_entry (tmp, kib_tx_t, tx_list);
+
+                LASSERT (tx->tx_passive_rdma ||
+                         !tx->tx_passive_rdma_wait);
+
+                LASSERT (tx->tx_passive_rdma_wait ||
+                         tx->tx_sending != 0);
+
+                /* still waiting for tx callback? */
+                if (!tx->tx_passive_rdma_wait)
+                        continue;
+
+                tx->tx_status = -ECONNABORTED;
+                tx->tx_passive_rdma_wait = 0;
+                done = (tx->tx_sending == 0);
+
+                if (!done)
+                        continue;
+
+                list_del (&tx->tx_list);
+                list_add (&tx->tx_list, &zombies);
+        }
+
+        /* grab all blocked transmits */
+        list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
+                tx = list_entry (tmp, kib_tx_t, tx_list);
+                
+                list_del (&tx->tx_list);
+                list_add (&tx->tx_list, &zombies);
+        }
+        
+        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+        while (!list_empty(&zombies)) {
+                tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+                list_del(&tx->tx_list);
+                kibnal_tx_done (tx);
+        }
+}
+
+static void
+kibnal_reject (cm_cep_handle_t cep, cm_rej_code_t reason)
+{
+        cm_reject_data_t *rej;
+
+        PORTAL_ALLOC(rej, sizeof(*rej));
+        if (rej == NULL) /* PORTAL_ALLOC() will CERROR on failure */
+                return;  
+
+        rej->reason = reason;
+        cm_reject(cep, rej);
+        PORTAL_FREE(rej, sizeof(*rej));
+}
+
+static void get_av_from_path(ib_path_record_v2_t *path, vv_add_vec_t *av)
+{
+        av->service_level = path->sl;
+        av->grh_flag = 0;       /* TODO: correct? */
+        av->dlid = path->dlid;
+        av->pmtu = path->mtu;
+
+        /* From sdp-hca-params.h. */
+        switch(path->rate) {
+        case 2:
+                av->max_static_rate = 1;
+                break;
+        case 3:
+        case 4:
+        default:
+                av->max_static_rate = 0;
+                break;
+        }
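+        /* (Assumption: 2/3/4 are the usual IB static rate encodings of
+         * 2.5, 10 and 30 Gb/s, so only the 1x rate gets a non-zero
+         * max_static_rate here.) */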
+
+        av->l_ack_timeout = IBNAL_ACK_TIMEOUT;
+        av->retry_count = IBNAL_RETRY;
+        av->rnr_retry_count = IBNAL_RNR_RETRY; 
+        av->source_path_bit = 0;
+
+        av->global_dest.flow_lable = path->flow_label;
+        av->global_dest.hope_limit = path->hop_limut;
+        av->global_dest.traffic_class = path->traffic_class;
+        av->global_dest.s_gid_index = 0;
+        av->global_dest.d_gid = path->dgid;
+}
+
+static vv_return_t
+kibnal_qp_rts(vv_qp_h_t qp_handle, __u32 qpn, __u8 resp_res, 
+              ib_path_record_v2_t *path, __u8 init_depth, __u32 send_psn)
+{
+        vv_qp_attr_t qp_attr;
+        vv_return_t retval;
+
+        ENTRY;
+
+#if 1
+        /* TODO - Hack. I don't know whether I get bad values from the
+         * stack or if I'm using the wrong names. */
+        resp_res = 8;
+        init_depth = 8;
+#endif
+
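+        /* Standard RC QP bring-up (sketch): the QP, assumed to have been
+         * taken to INIT at creation time, is moved to RTR with the peer's
+         * address vector, QPN and receive PSN, then to RTS with the local
+         * retry/timeout parameters. */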
+        /* RTR */
+        qp_attr.modify.qp_modify_into_state = vv_qp_state_rtr;
+        qp_attr.modify.vv_qp_attr_mask =
+                VV_QP_AT_STATE | 
+                VV_QP_AT_ADD_VEC |
+                VV_QP_AT_DEST_QP |
+                VV_QP_AT_R_PSN |
+                VV_QP_AT_RESP_RDMA_ATOM_OUT_NUM |
+                VV_QP_AT_MIN_RNR_NAK_T | VV_QP_AT_OP_F;
+
+        qp_attr.modify.qp_type = vv_qp_type_r_conn;
+
+        get_av_from_path(path, &qp_attr.modify.params.rtr.remote_add_vec);
+        qp_attr.modify.params.rtr.destanation_qp = qpn;
+        qp_attr.modify.params.rtr.receive_psn = IBNAL_STARTING_PSN;
+        qp_attr.modify.params.rtr.responder_rdma_r_atom_num = resp_res;
+        qp_attr.modify.params.rtr.opt_min_rnr_nak_timer = 16; /* 20 ms */
+
+        /* For now, force MTU to 1KB (Voltaire's advice). */
+        qp_attr.modify.params.rtr.remote_add_vec.pmtu = vv_mtu_1024;
+
+        retval = vv_qp_modify(kibnal_data.kib_hca, qp_handle, &qp_attr, NULL);
+        if (retval) {
+                CERROR("Cannot modify QP to RTR: %d\n", retval);
+                RETURN(retval);
+        }
+
+        /* RTS */
+        qp_attr.modify.qp_modify_into_state = vv_qp_state_rts;
+        qp_attr.modify.vv_qp_attr_mask = 
+                VV_QP_AT_STATE |
+                VV_QP_AT_L_ACK_T |
+                VV_QP_AT_RETRY_NUM |
+                VV_QP_AT_RNR_NUM |
+                VV_QP_AT_S_PSN |
+                VV_QP_AT_DEST_RDMA_ATOM_OUT_NUM;
+        qp_attr.modify.qp_type = vv_qp_type_r_conn;             
+
+        qp_attr.modify.params.rts.local_ack_timeout = path->pkt_life_time + 2; /* 2 or 1? */ 
+        qp_attr.modify.params.rts.retry_num = IBNAL_RETRY;
+        qp_attr.modify.params.rts.rnr_num = IBNAL_RNR_RETRY;
+        qp_attr.modify.params.rts.send_psn = send_psn;
+        qp_attr.modify.params.rts.dest_out_rdma_r_atom_num = init_depth;
+        qp_attr.modify.params.rts.flow_control = 1; /* Stack does not use it. */
+
+        retval = vv_qp_modify(kibnal_data.kib_hca, qp_handle, &qp_attr, NULL);
+        if (retval) {
+                CERROR("Cannot modify QP to RTS: %d\n", retval);
+        }
+
+        RETURN(retval);
+}
+
+static void
+kibnal_connect_reply (cm_cep_handle_t cep, cm_conn_data_t *info, kib_conn_t *conn)
+{
+        vv_hca_attrib_t *ca_attr = &kibnal_data.kib_hca_attrs;
+        kib_wire_connreq_t *wcr;
+        cm_reply_data_t *rep = &info->data.reply;
+        cm_rej_code_t reason;
+        vv_return_t retval;
+
+        wcr = (kib_wire_connreq_t *)info->data.reply.priv_data;
+
+        if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
+                CERROR ("Can't connect "LPX64": bad magic %08x\n",
+                        conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic));
+                GOTO(reject, reason = cm_rej_code_usr_rej);
+        }
+        
+        if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
+                CERROR ("Can't connect "LPX64": bad version %d\n",
+                        conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_version));
+                GOTO(reject, reason = cm_rej_code_usr_rej);
+        }
+                        
+        if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
+                CERROR ("Can't connect "LPX64": bad queue depth %d\n",
+                        conn->ibc_peer->ibp_nid, 
+                        le16_to_cpu(wcr->wcr_queue_depth));
+                GOTO(reject, reason = cm_rej_code_usr_rej);
+        }
+                        
+        if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) {
+                CERROR ("Unexpected NID "LPX64" from "LPX64"\n",
+                        le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid);
+                GOTO(reject, reason = cm_rej_code_usr_rej);
+        }
+
+        CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n",
+               conn, conn->ibc_peer->ibp_nid);
+
+        conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation);
+        conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+
+        retval = kibnal_qp_rts(conn->ibc_qp, rep->qpn, 
+                            min_t(__u8, rep->arb_initiator_depth,
+                                  ca_attr->max_read_atom_qp_outstanding),
+                            &conn->ibc_connreq->cr_path, 
+                            min_t(__u8, rep->arb_resp_res,
+                                  ca_attr->max_qp_depth_for_init_read_atom),
+                            rep->start_psn);
+
+        if (retval) {
+                CERROR("Connection %p -> "LPX64" QP RTS/RTR failed: %d\n",
+                       conn, conn->ibc_peer->ibp_nid, retval);
+                GOTO(reject, reason = cm_rej_code_no_qp);
+        }
+
+        dump_qp(conn);
+
+        /* the callback arguments are ignored for an active accept */
+        /* TODO: memset cmrtu? */
+        retval = cm_accept(cep, NULL, &conn->ibc_connreq->cr_cm_rtu, kibnal_cm_callback, conn);
+        if (retval) {
+                CERROR("Connection %p -> "LPX64" CMAccept RTU failed: %d\n",
+                       conn, conn->ibc_peer->ibp_nid, retval);
+                kibnal_connreq_done (conn, 1, -ECONNABORTED);
+                /* XXX don't call reject after accept fails? */
+                return;
+        }
+
+        CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n",
+               conn, conn->ibc_peer->ibp_nid);
+
+        kibnal_connreq_done (conn, 1, 0);
+
+        return;
+
+reject:
+        kibnal_reject(cep, reason);
+        kibnal_connreq_done (conn, 1, -EPROTO);
+}
+
+/* Off level CM callback */
+static void
+_kibnal_cm_callback(void * arg)
+{
+        struct cm_off_level *cm_tq = arg;
+        cm_cep_handle_t cep = cm_tq->cep;
+        cm_conn_data_t *info = cm_tq->info;
+        kib_conn_t *conn = cm_tq->conn;
+        vv_return_t retval;
+
+        CDEBUG(D_NET, "CM event 0x%x for CEP %p\n", info->status, cep);
+
+        PORTAL_FREE(cm_tq, sizeof(*cm_tq));
+
+        /* Established Connection Notifier */
+        switch (info->status) {
+        case cm_event_connected:
+                CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n",
+                       conn, conn->ibc_peer->ibp_nid);
+                kibnal_connreq_done (conn, 0, 0);
+                break;
+
+        case cm_event_conn_timeout:
+        case cm_event_conn_reject:
+                /* TODO: be sure this is called only if REQ times out. */
+                CERROR("connection timed out\n");
+                LASSERT(conn->ibc_state == IBNAL_CONN_CONNECTING);
+                conn->ibc_state = IBNAL_CONN_INIT_QP;
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                break;
+
+        case cm_event_conn_reply:
+                kibnal_connect_reply(cep, info, conn);
+                break;
+
+        case cm_event_disconn_request:
+                /* XXX lock around these state management bits? */
+                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
+                        kibnal_close_conn (conn, 0);
+                conn->ibc_state = IBNAL_CONN_DREP;
+                
+                retval = cm_disconnect(conn->ibc_cep, NULL, &kibnal_data.cm_data.drep_data);
+                if (retval)
+                        CERROR("disconnect rep failed: %d\n", retval);
+
+                /* Fall through ... */
+
+        /* these both guarantee that no more cm callbacks will occur */
+        case cm_event_disconnected: /* aka cm_event_disconn_timeout */
+        case cm_event_disconn_reply:
+                CDEBUG(D_NET, "Connection %p -> "LPX64" disconnect done.\n",
+                       conn, conn->ibc_peer->ibp_nid);
+
+                conn->ibc_state = IBNAL_CONN_DISCONNECTED;
+                kibnal_flush_pending(conn);
+                kibnal_put_conn(conn);        /* Lose CM's ref */
+                break;
+
+        default:
+                CERROR("unknown status %d on Connection %p -> "LPX64"\n",
+                       info->status, conn, conn->ibc_peer->ibp_nid);
+                LBUG();
+                break;
+        }
+
+        return;
+}
+
+static void
+kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg)
+{
+        struct cm_off_level *cm_tq;
+
+        LASSERT(cep);
+        LASSERT(info);
+
+        CDEBUG(D_NET, "CM event 0x%x for CEP %p\n", info->status, cep);
+
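+        /* This callback presumably cannot block (hence PORTAL_ALLOC_ATOMIC);
+         * defer the real work to _kibnal_cm_callback() via schedule_task(),
+         * where blocking is allowed. */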
+        PORTAL_ALLOC_ATOMIC(cm_tq, sizeof(*cm_tq));
+        if (cm_tq == NULL) {
+                CERROR("Failed to allocate a CM off level structure\n");
+                return;
+        }
+
+        cm_tq->tq.sync = 0;
+        cm_tq->tq.routine = _kibnal_cm_callback;
+        cm_tq->tq.data = cm_tq;
+
+        cm_tq->cep = cep;
+        cm_tq->info = info;
+        cm_tq->conn = (kib_conn_t *)arg;
+
+        schedule_task(&cm_tq->tq);
+}
+
+static int
+kibnal_set_cm_flags(cm_cep_handle_t cep)
+{
+#ifdef TODO
+        /* The Voltaire CM does not appear to have this functionality. */
+        FSTATUS frc;
+        uint32 value = 1;
+
+        frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK,
+                                 (char *)&value, sizeof(value), 0);
+        if (frc != FSUCCESS) {
+                CERROR("error setting timeout callback: %d\n", frc);
+                return -1;
+        }
+
+#if 0
+        frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, (char *)&value,
+                                 sizeof(value), 0);
+        if (frc != FSUCCESS) {
+                CERROR("error setting async accept: %d\n", frc);
+                return -1;
+        }
+#endif
+#endif
+
+        return 0;
+}
+
+/* Off level listen callback */
+static void
+_kibnal_listen_callback(void *arg)
+{
+        struct cm_off_level *cm_tq = arg;
+        cm_cep_handle_t cep = cm_tq->cep;
+        cm_conn_data_t *info = cm_tq->info;
+        vv_hca_attrib_t *ca_attr = &kibnal_data.kib_hca_attrs;
+        cm_request_data_t  *req;
+        cm_reply_data_t    *rep = NULL;
+        kib_wire_connreq_t *wcr;
+        kib_conn_t         *conn = NULL;
+        cm_rej_code_t       reason = 0;
+        int                 rc = 0;
+        vv_return_t         retval;
+        vv_qp_attr_t       *query;
+        void               *qp_context;
+
+        LASSERT(cep);
+        LASSERT(info);
+
+        CDEBUG(D_NET, "LISTEN status 0x%x for CEP %p\n", info->status, cep);
+
+        PORTAL_FREE(cm_tq, sizeof(*cm_tq));
+
+        req = &info->data.request;
+        wcr = (kib_wire_connreq_t *)req->priv_data;
+
+        CDEBUG(D_NET, "%d from "LPX64"\n", info->status, 
+               le64_to_cpu(wcr->wcr_nid));
+        
+#ifdef TODO
+        /* is there an equivalent? */
+        if (info->status == FCM_CONNECT_CANCEL)
+                return;
+#endif
+        
+        LASSERT (info->status == cm_event_conn_request);
+        
+        if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
+                CERROR ("Can't accept: bad magic %08x\n",
+                        le32_to_cpu(wcr->wcr_magic));
+                GOTO(out, reason = cm_rej_code_usr_rej);
+        }
+
+        if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
+                CERROR ("Can't accept: bad version %d\n",
+                        le16_to_cpu(wcr->wcr_version));
+                GOTO(out, reason = cm_rej_code_usr_rej);
+        }
+
+        rc = kibnal_accept(&conn, cep,
+                           le64_to_cpu(wcr->wcr_nid),
+                           le64_to_cpu(wcr->wcr_incarnation),
+                           le16_to_cpu(wcr->wcr_queue_depth));
+        if (rc != 0) {
+                CERROR ("Can't accept "LPX64": %d\n",
+                        le64_to_cpu(wcr->wcr_nid), rc);
+                GOTO(out, reason = cm_rej_code_no_res);
+        }
+
+        /* TODO: I hope I got the ca_attr names correctly. */
+        retval = kibnal_qp_rts(conn->ibc_qp, req->cep_data.qpn,
+                            min_t(__u8, req->cep_data.offered_initiator_depth, 
+                                  ca_attr->max_read_atom_qp_outstanding),
+                            &req->path_data.path,
+                            min_t(__u8, req->cep_data.offered_resp_res, 
+                                  ca_attr->max_qp_depth_for_init_read_atom),
+                            req->cep_data.start_psn);
+
+        if (retval) {
+                CERROR ("Can't mark QP RTS/RTR  "LPX64": %d\n",
+                        le64_to_cpu(wcr->wcr_nid), retval);
+                GOTO(out, reason = cm_rej_code_no_qp);
+        }
+
+        dump_qp(conn);
+
+        retval = vv_qp_query(kibnal_data.kib_hca, conn->ibc_qp, &qp_context, &conn->ibc_qp_attrs);
+        if (retval) {
+                CERROR ("Couldn't query qp attributes "LPX64": %d\n",
+                        le64_to_cpu(wcr->wcr_nid), retval);
+                GOTO(out, reason = cm_rej_code_no_qp);
+        }
+        query = &conn->ibc_qp_attrs;
+
+        PORTAL_ALLOC(rep, sizeof(*rep));
+        if (rep == NULL) {
+                CERROR ("can't allocate reply buffer\n");
+                GOTO(out, reason = cm_rej_code_insuff_resp_res);
+        }
+
+        /* don't try to deref this into the incoming wcr :) */
+        wcr = (kib_wire_connreq_t *)rep->priv_data;
+
+        *rep = (cm_reply_data_t) {
+                .qpn = query->query.qp_num,
+                .start_psn = query->query.receve_psn,
+                .arb_resp_res = query->query.rdma_r_atom_outstand_num,
+                .arb_initiator_depth = query->query.rdma_r_atom_outstand_num,
+                .targ_ack_delay = 0,
+                .failover_accepted = 0,
+                .end_to_end_flow_ctrl = 1, /* (query->query.flow_control is never set) */
+                .rnr_retry_count = req->cep_data.rtr_retry_cnt,
+        };
+
+        *wcr = (kib_wire_connreq_t) {
+                .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
+                .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
+                .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
+                .wcr_nid         = cpu_to_le64(kibnal_data.kib_nid),
+                .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
+        };
+
+        retval = cm_accept(cep, rep, NULL, kibnal_cm_callback, conn);
+
+        PORTAL_FREE(rep, sizeof(*rep));
+
+        if (retval) {
+                /* XXX it seems we don't call reject after this point? */
+                CERROR("cm_accept() failed: %d, aborting\n", retval);
+                rc = -ECONNABORTED;
+                goto out;
+        }
+
+        if (kibnal_set_cm_flags(conn->ibc_cep)) {
+                rc = -ECONNABORTED;
+                goto out;
+        }
+
+        conn->ibc_cep = cep;
+
+        CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n",
+               conn, conn->ibc_peer->ibp_nid);
+
+out:
+        if (reason) {
+                kibnal_reject(cep, reason);
+                rc = -ECONNABORTED;
+        }
+
+        return;
+}
+
+void
+kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg)
+{
+        struct cm_off_level *cm_tq;
+
+        LASSERT(cep);
+        LASSERT(info);
+        LASSERT(arg == NULL); /* no conn yet for passive */
+
+        PORTAL_ALLOC_ATOMIC(cm_tq, sizeof(*cm_tq));
+        if (cm_tq == NULL) {
+                CERROR("Failed to allocate a CM off level structure\n");
+                return;
+        }
+
+        cm_tq->tq.sync = 0;
+        cm_tq->tq.routine = _kibnal_listen_callback;
+        cm_tq->tq.data = cm_tq;
+
+        cm_tq->cep = cep;
+        cm_tq->info = info;
+        cm_tq->conn = NULL;
+
+        schedule_task(&cm_tq->tq);
+}
+
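+/* Active connection setup is a chain of asynchronous callbacks:
+ * kibnal_connect_peer() issues a service-record query via
+ * kibnal_advertize_op(); kibnal_service_get_callback() follows with a
+ * path-record query via kibnal_pathrecord_op(); kibnal_pathreq_callback()
+ * then creates the CEP and calls cm_connect(), whose progress is reported
+ * through kibnal_cm_callback(). */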
+static void
+kibnal_pathreq_callback (struct sa_request *request)
+{
+        vv_hca_attrib_t *ca_attr = &kibnal_data.kib_hca_attrs;
+        kib_conn_t *conn = request->context;
+        gsi_dtgrm_t *dtgrm;
+        sa_mad_v2_t *mad;
+        ib_path_record_v2_t *path;
+        u64 component_mask;
+        cm_return_t cmret;
+
+        if (request->status) {
+                CERROR ("status %d\n", request->status);
+                free_sa_request(request);
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        dtgrm = request->dtgrm_resp;
+        mad = (sa_mad_v2_t *) dtgrm->mad;
+        path = (ib_path_record_v2_t *) mad->payload;
+
+        /* Put the path record in host order for that stack. */
+        gid_swap(&path->sgid);
+        gid_swap(&path->dgid);
+        path->slid = be16_to_cpu(path->slid);
+        path->dlid = be16_to_cpu(path->dlid);
+        path->flow_label = be32_to_cpu(path->flow_label);
+        path->pkey = be16_to_cpu(path->pkey);
+        path->sl = be16_to_cpu(path->sl);
+
+        CDEBUG(D_NET, "sgid "LPX64":"LPX64" dgid "
+               LPX64":"LPX64" pkey %x\n",
+               path->sgid.scope.g.subnet,
+               path->sgid.scope.g.eui64,
+               path->dgid.scope.g.subnet,
+               path->dgid.scope.g.eui64,
+               path->pkey);
+
+#if TODO
+        component_mask = be64_to_cpu(mad->component_mask);
+        if ((component_mask & (1ull << 1)) == 0) {
+                CERROR ("no service GID in SR: "LPX64"\n", component_mask);
+                free_sa_request(request);
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+#endif
+
+        conn->ibc_connreq->cr_path = *path;
+
+        free_sa_request(request);    
+
+        conn->ibc_cep = cm_create_cep(cm_cep_transp_rc);
+        if (conn->ibc_cep == NULL) {
+                CERROR ("Can't create CEP\n");
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        if (kibnal_set_cm_flags(conn->ibc_cep)) {
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) {
+                .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
+                .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
+                .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
+                .wcr_nid         = cpu_to_le64(kibnal_data.kib_nid),
+                .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
+        };
+
+        conn->ibc_connreq->cr_cm_req = (cm_request_data_t) {
+                .sid = kibnal_data.kib_service_id,
+                .cep_data = (cm_cep_data_t) { 
+                        .ca_guid = kibnal_data.kib_hca_attrs.guid,
+                        .end_to_end_flow_ctrl = 1,
+                        .port_guid = kibnal_data.kib_port_gid.scope.g.eui64,
+                        .local_port_num = kibnal_data.kib_port,
+                        .start_psn = IBNAL_STARTING_PSN,
+                        .qpn = conn->ibc_qp_attrs.query.qp_num,
+                        .retry_cnt = IBNAL_RETRY,
+                        .rtr_retry_cnt = IBNAL_RNR_RETRY,
+                        .ack_timeout = IBNAL_ACK_TIMEOUT,
+                        .offered_resp_res = ca_attr->max_read_atom_qp_outstanding,
+                        .offered_initiator_depth = ca_attr->max_qp_depth_for_init_read_atom,
+                },
+                .path_data = (cm_cep_path_data_t) {
+                        .subn_local = TRUE,
+                        .path = conn->ibc_connreq->cr_path,
+                },
+        };
+
+#if 0
+        /* XXX set timeout just like SDP!!!*/
+        conn->ibc_connreq->cr_path.packet_life = 13;
+#endif
+        /* Flag I'm getting involved with the CM... */
+        conn->ibc_state = IBNAL_CONN_CONNECTING;
+
+#if 0
+        CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n",
+               conn->ibc_connreq->cr_service.RID.ServiceID, 
+               *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
+#endif
+
+        memset(conn->ibc_connreq->cr_cm_req.priv_data, 0, 
+               cm_REQ_priv_data_len);
+        memcpy(conn->ibc_connreq->cr_cm_req.priv_data, 
+               &conn->ibc_connreq->cr_wcr, sizeof(conn->ibc_connreq->cr_wcr));
+
+        /* kibnal_cm_callback gets my conn ref */
+        cmret = cm_connect(conn->ibc_cep, &conn->ibc_connreq->cr_cm_req,
+                              kibnal_cm_callback, conn);
+
+        if (cmret) {
+                CERROR ("Connect failed: %d\n", cmret);
+                /* Back out state change as connect failed */
+                conn->ibc_state = IBNAL_CONN_INIT_QP;
+                kibnal_connreq_done (conn, 1, -EINVAL);
+        }
+
+        CDEBUG(D_NET, "connection REQ sent\n");
+}
+
+static void
+kibnal_service_get_callback (struct sa_request *request)
+{
+        kib_conn_t *conn = request->context;
+        gsi_dtgrm_t *dtgrm;
+        sa_mad_v2_t *mad;
+        ib_service_record_v2_t *sr;
+        u64 component_mask;
+        int ret;
+
+        if (request->status) {
+                CERROR ("status %d\n", request->status);
+                free_sa_request(request);
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        dtgrm = request->dtgrm_resp;
+        mad = (sa_mad_v2_t *) dtgrm->mad;
+        sr = (ib_service_record_v2_t *) mad->payload;
+
+        CDEBUG(D_NET, "sid "LPX64" gid "LPX64":"LPX64" pkey %x\n",
+               sr->service_id,
+               sr->service_gid.scope.g.subnet,
+               sr->service_gid.scope.g.eui64,
+               sr->service_pkey);
+
+        component_mask = be64_to_cpu(mad->component_mask);
+        if ((component_mask & (1ull << 1)) == 0) {
+                CERROR ("no service GID in SR: "LPX64"\n", component_mask);
+                free_sa_request(request);
+                kibnal_connreq_done (conn, 1, -EINVAL);
+                return;
+        }
+
+        //conn->ibc_connreq->cr_service = sr;
+
+        /* Return the response datagram to its pool. We don't need it anymore. */
+        gsi_dtgrm_pool_put(request->dtgrm_resp);
+        request->dtgrm_resp = NULL;
+
+        /* kibnal_pathreq_callback gets my conn ref */
+        ret = kibnal_pathrecord_op(request, sr->service_gid, kibnal_pathreq_callback, conn);
+        if (ret) {
+                CERROR ("Path record request failed: %d\n", ret);
+                kibnal_connreq_done (conn, 1, -EINVAL);
+        }
+
+        return;
+}
+
+static void
+kibnal_connect_peer (kib_peer_t *peer)
+{
+        kib_conn_t  *conn = kibnal_create_conn();
+        struct sa_request *request;
+        int ret;
+
+        LASSERT (peer->ibp_connecting != 0);
+
+        if (conn == NULL) {
+                CERROR ("Can't allocate conn\n");
+                kibnal_peer_connect_failed (peer, 1, -ENOMEM);
+                return;
+        }
+
+        conn->ibc_peer = peer;
+        kib_peer_addref(peer);
+
+        PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
+        if (conn->ibc_connreq == NULL) {
+                CERROR ("Can't allocate connreq\n");
+                kibnal_connreq_done (conn, 1, -ENOMEM);
+                return;
+        }
+
+        memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq));
+
+        /* kibnal_service_get_callback gets my conn ref */
+        ret = kibnal_advertize_op(peer->ibp_nid, SUBN_ADM_GET, kibnal_service_get_callback, conn);
+
+        if (ret) {
+                CERROR("kibnal_advertize_op failed for op %d NID "LPX64"\n", SUBN_ADM_GET, peer->ibp_nid);
+                /* TODO: I'm unsure yet whether ret contains a
+                 * consistent error type, so I return -EIO in the
+                 * meantime. */
+                kibnal_connreq_done (conn, 1, -EIO);
+        }
+
+        return;
+}
+
+static int
+kibnal_conn_timed_out (kib_conn_t *conn)
+{
+        kib_tx_t          *tx;
+        struct list_head  *ttmp;
+        unsigned long      flags;
+
+        spin_lock_irqsave (&conn->ibc_lock, flags);
+
+        list_for_each (ttmp, &conn->ibc_tx_queue) {
+                tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+                LASSERT (!tx->tx_passive_rdma_wait);
+                LASSERT (tx->tx_sending == 0);
+
+                if (time_after_eq (jiffies, tx->tx_deadline)) {
+                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        return 1;
+                }
+        }
+
+        list_for_each (ttmp, &conn->ibc_active_txs) {
+                tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+                LASSERT (tx->tx_passive_rdma ||
+                         !tx->tx_passive_rdma_wait);
+
+                LASSERT (tx->tx_passive_rdma_wait ||
+                         tx->tx_sending != 0);
+
+                if (time_after_eq (jiffies, tx->tx_deadline)) {
+                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        return 1;
+                }
+        }
+
+        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+        return 0;
+}
+
+static void
+kibnal_check_conns (int idx)
+{
+        struct list_head  *peers = &kibnal_data.kib_peers[idx];
+        struct list_head  *ptmp;
+        kib_peer_t        *peer;
+        kib_conn_t        *conn;
+        struct list_head  *ctmp;
+
+ again:
+        /* NB. We expect to have a look at all the peers and not find any
+         * rdmas to time out, so we just use a shared lock while we
+         * take a look... */
+        read_lock (&kibnal_data.kib_global_lock);
+
+        list_for_each (ptmp, peers) {
+                peer = list_entry (ptmp, kib_peer_t, ibp_list);
+
+                list_for_each (ctmp, &peer->ibp_conns) {
+                        conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+                        KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_ESTABLISHED);
+
+                        /* In case we have enough credits to return via a
+                         * NOOP, but there were no non-blocking tx descs
+                         * free to do it last time... */
+                        kibnal_check_sends(conn);
+
+                        if (!kibnal_conn_timed_out(conn))
+                                continue;
+                        
+                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+                               conn, conn->ibc_state, peer->ibp_nid,
+                               atomic_read (&conn->ibc_refcount));
+
+                        atomic_inc (&conn->ibc_refcount);
+                        read_unlock (&kibnal_data.kib_global_lock);
+
+                        CERROR("Timed out RDMA with "LPX64"\n",
+                               peer->ibp_nid);
+
+                        kibnal_close_conn (conn, -ETIMEDOUT);
+                        kibnal_put_conn (conn);
+
+                        /* start again now I've dropped the lock */
+                        goto again;
+                }
+        }
+
+        read_unlock (&kibnal_data.kib_global_lock);
+}
+
+static void
+kib_connd_handle_state(kib_conn_t *conn)
+{
+        vv_return_t retval;
+
+        switch (conn->ibc_state) {
+                /* all refs have gone, free and be done with it */ 
+                case IBNAL_CONN_DISCONNECTED:
+                        kibnal_destroy_conn (conn);
+                        return; /* avoid put_conn */
+
+                case IBNAL_CONN_SEND_DREQ:
+                        
+                        retval = cm_disconnect(conn->ibc_cep, &kibnal_data.cm_data.dreq_data, NULL);
+                        if (retval) /* XXX do real things */
+                                CERROR("disconnect failed: %d\n", retval);
+                        
+                        conn->ibc_state = IBNAL_CONN_DREQ;
+                        break;
+
+                /* a callback got to the conn before we did */ 
+                case IBNAL_CONN_DREP:
+                        break;
+                                
+                default:
+                        CERROR ("Bad conn %p state: %d\n", conn, 
+                                conn->ibc_state);
+                        LBUG();
+                        break;
+        }
+
+        /* drop ref from close_conn */
+        kibnal_put_conn(conn);
+}
+
+int
+kibnal_connd (void *arg)
+{
+        wait_queue_t       wait;
+        unsigned long      flags;
+        kib_conn_t        *conn;
+        kib_peer_t        *peer;
+        int                timeout;
+        int                i;
+        int                peer_index = 0;
+        unsigned long      deadline = jiffies;
+        
+        kportal_daemonize ("kibnal_connd");
+        kportal_blockallsigs ();
+
+        init_waitqueue_entry (&wait, current);
+
+        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+
+        for (;;) {
+                if (!list_empty (&kibnal_data.kib_connd_conns)) {
+                        conn = list_entry (kibnal_data.kib_connd_conns.next,
+                                           kib_conn_t, ibc_list);
+                        list_del (&conn->ibc_list);
+                        
+                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+                        kib_connd_handle_state(conn);
+
+                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+                        continue;
+                }
+
+                if (!list_empty (&kibnal_data.kib_connd_peers)) {
+                        peer = list_entry (kibnal_data.kib_connd_peers.next,
+                                           kib_peer_t, ibp_connd_list);
+                        
+                        list_del_init (&peer->ibp_connd_list);
+                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+                        kibnal_connect_peer (peer);
+                        kib_peer_decref (peer);
+
+                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+                }
+
+                /* shut down and nobody left to reap... */
+                if (kibnal_data.kib_shutdown &&
+                    atomic_read(&kibnal_data.kib_nconns) == 0)
+                        break;
+
+                spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+                /* careful with the jiffy wrap... */
+                while ((timeout = (int)(deadline - jiffies)) <= 0) {
+                        const int n = 4;
+                        const int p = 1;
+                        int       chunk = kibnal_data.kib_peer_hash_size;
+                        
+                        /* Time to check for RDMA timeouts on a few more
+                         * peers: I do checks every 'p' seconds on a
+                         * proportion of the peer table and I need to check
+                         * every connection 'n' times within a timeout
+                         * interval, to ensure I detect a timeout on any
+                         * connection within (n+1)/n times the timeout
+                         * interval. */
+
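+                        /* For example: with p == 1, n == 4 and a 60 second
+                         * kib_io_timeout, each pass covers about 1/15 of the
+                         * peer table, so the whole table is swept roughly 4
+                         * times per timeout interval (sketch; actual values
+                         * depend on the tunables). */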
+                        if (kibnal_tunables.kib_io_timeout > n * p)
+                                chunk = (chunk * n * p) / 
+                                        kibnal_tunables.kib_io_timeout;
+                        if (chunk == 0)
+                                chunk = 1;
+
+                        for (i = 0; i < chunk; i++) {
+                                kibnal_check_conns (peer_index);
+                                peer_index = (peer_index + 1) % 
+                                             kibnal_data.kib_peer_hash_size;
+                        }
+
+                        deadline += p * HZ;
+                }
+
+                kibnal_data.kib_connd_waketime = jiffies + timeout;
+
+                set_current_state (TASK_INTERRUPTIBLE);
+                add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
+
+                if (!kibnal_data.kib_shutdown &&
+                    list_empty (&kibnal_data.kib_connd_conns) &&
+                    list_empty (&kibnal_data.kib_connd_peers))
+                        schedule_timeout (timeout);
+
+                set_current_state (TASK_RUNNING);
+                remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
+
+                spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+        }
+
+        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+        kibnal_thread_fini ();
+        return (0);
+}
+
+int
+kibnal_scheduler(void *arg)
+{
+        long            id = (long)arg;
+        char            name[16];
+        kib_rx_t       *rx;
+        kib_tx_t       *tx;
+        unsigned long   flags;
+        int             rc;
+        int             counter = 0;
+        int             did_something;
+
+        snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
+        kportal_daemonize(name);
+        kportal_blockallsigs();
+
+        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+
+        for (;;) {
+                did_something = 0;
+
+                while (!list_empty(&kibnal_data.kib_sched_txq)) {
+                        tx = list_entry(kibnal_data.kib_sched_txq.next,
+                                        kib_tx_t, tx_list);
+                        list_del(&tx->tx_list);
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+                                               flags);
+                        kibnal_tx_done(tx);
+
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+                                          flags);
+                }
+
+                if (!list_empty(&kibnal_data.kib_sched_rxq)) {
+                        rx = list_entry(kibnal_data.kib_sched_rxq.next,
+                                        kib_rx_t, rx_list);
+                        list_del(&rx->rx_list);
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+                                               flags);
+
+                        kibnal_rx(rx);
+
+                        did_something = 1;
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+                                          flags);
+                }
+
+                /* shut down and no receives to complete... */
+                if (kibnal_data.kib_shutdown &&
+                    atomic_read(&kibnal_data.kib_nconns) == 0)
+                        break;
+
+                /* nothing to do or hogging CPU */
+                if (!did_something || counter++ == IBNAL_RESCHED) {
+                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+                                               flags);
+                        counter = 0;
+
+                        if (!did_something) {
+                                rc = wait_event_interruptible(
+                                        kibnal_data.kib_sched_waitq,
+                                        !list_empty(&kibnal_data.kib_sched_txq) || 
+                                        !list_empty(&kibnal_data.kib_sched_rxq) || 
+                                        (kibnal_data.kib_shutdown &&
+                                         atomic_read (&kibnal_data.kib_nconns) == 0));
+                        } else {
+                                our_cond_resched();
+                        }
+
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+                                          flags);
+                }
+        }
+
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+
+        kibnal_thread_fini();
+        return (0);
+}
+
+
+lib_nal_t kibnal_lib = {
+        .libnal_data = &kibnal_data,      /* NAL private data */
+        .libnal_send = kibnal_send,
+        .libnal_send_pages = kibnal_send_pages,
+        .libnal_recv = kibnal_recv,
+        .libnal_recv_pages = kibnal_recv_pages,
+        .libnal_dist = kibnal_dist
+};
diff --git a/lnet/klnds/viblnd/vibnal_sa.c b/lnet/klnds/viblnd/vibnal_sa.c
new file mode 100644 (file)
index 0000000..c8ff098
--- /dev/null
@@ -0,0 +1,333 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Frank Zago <fzago@systemfabricworks.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "vibnal.h"
+
+/*--------------------------------------------------------------------------*/
+
+struct sa_request *alloc_sa_request(void)
+{
+        struct sa_request *request;
+        gsi_dtgrm_t *dtgrm;
+        vv_return_t retval;
+
+        PORTAL_ALLOC(request, sizeof(*request));
+        if (request == NULL)
+                return NULL;
+        
+        retval = gsi_dtgrm_pool_get(kibnal_data.gsi_pool_handle, &dtgrm);
+        if (retval) {
+                CERROR("cannot get a datagram: %d\n", retval);
+                PORTAL_FREE(request, sizeof(*request));
+                return NULL;
+        }
+
+        memset(request, 0, sizeof(*request));
+
+        request->dtgrm_req = dtgrm;
+        request->retry = GSI_RETRY;    /* retry the request up to GSI_RETRY times */
+
+        return request;
+}
+
+void free_sa_request(struct sa_request *request)
+{
+        if (request) {
+                if (request->dtgrm_req) {
+                        gsi_dtgrm_pool_put(request->dtgrm_req);        
+                }
+
+                if (request->dtgrm_resp) {
+                        gsi_dtgrm_pool_put(request->dtgrm_resp);
+                }
+
+                PORTAL_FREE(request, sizeof(*request));
+        }
+}
+
+/*--------------------------------------------------------------------------*/
+
+static void complete_sa_request(struct sa_request *request)
+{
+       if (request->callback) {
+               request->callback(request);
+       } else {
+               complete(&request->signal);
+       }
+}
+
+static void
+sa_request_timeout_handler(unsigned long context)
+{
+       struct sa_request *request = (struct sa_request *)context;
+       int ret;
+       vv_return_t retval;
+
+       if (request->retry--) {
+               /* Resend */
+
+                CDEBUG(D_NET, "timer expired for MAD TID "LPX64" - "
+                       "retrying (%d retries left)\n",
+                       request->mad->hdr.transact_id, request->retry);
+               retval = gsi_post_send_dtgrm(kibnal_data.gsi_handle, request->dtgrm_req);
+               if (retval) {
+                       CERROR("gsi_post_send_dtgrm failed: %d\n", retval);
+                       ret = -EIO;
+               } else {
+
+                       /* restart the timer */
+                       request->timer.expires = jiffies + (HZ * GSI_TIMEOUT);
+                       add_timer(&request->timer);
+                       
+                       ret = 0;
+               }
+       } else {
+                CDEBUG(D_NET, "timer expired for MAD TID "LPX64" - no more retries\n",
+                       request->mad->hdr.transact_id);
+                ret = -ETIMEDOUT;
+       }
+
+       if (ret) {
+               request->status = ret;
+               complete_sa_request(request);
+       }
+}
+
+/*--------------------------------------------------------------------------*/
+
+/* Send a SA request */
+int vibnal_start_sa_request(struct sa_request *request)
+{
+        vv_return_t retval;
+
+       CDEBUG (D_NET, "querying SA\n");
+
+        /* Put the request on the pending list; the transaction ID is set
+         * by the Voltaire stack. */
+       down(&kibnal_data.gsi_mutex);
+
+       list_add_tail(&request->list, &kibnal_data.gsi_pending);
+
+       up(&kibnal_data.gsi_mutex);
+
+       retval = gsi_post_send_dtgrm(kibnal_data.gsi_handle, request->dtgrm_req);
+       if (retval) {
+               CERROR("gsi_post_send_dtgrm failed: %d\n", retval);
+               return -EIO;
+       }
+
+       /* TODO: This might create a race condition if the response has
+        * already been received. */
+       init_timer(&request->timer);
+       request->timer.expires = jiffies + (HZ * GSI_TIMEOUT);
+       request->timer.data = (unsigned long)request;
+       request->timer.function = sa_request_timeout_handler;
+       add_timer(&request->timer);
+
+       CDEBUG(D_NET, "Posted MAD with TID= "LPX64"\n", request->mad->hdr.transact_id);
+       return 0;
+}
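+
+/*
+ * One possible way to close the window noted in the TODO above (a sketch
+ * only; it assumes the response callback can run as soon as
+ * gsi_post_send_dtgrm() returns): arm the timer before posting the
+ * datagram and tear it down if the post fails, so that
+ * vibnal_mad_received_cb() always finds an initialised, armed timer:
+ *
+ *      init_timer(&request->timer);
+ *      request->timer.expires = jiffies + (HZ * GSI_TIMEOUT);
+ *      request->timer.data = (unsigned long)request;
+ *      request->timer.function = sa_request_timeout_handler;
+ *      add_timer(&request->timer);
+ *
+ *      retval = gsi_post_send_dtgrm(kibnal_data.gsi_handle, request->dtgrm_req);
+ *      if (retval) {
+ *              del_timer_sync(&request->timer);
+ *              CERROR("gsi_post_send_dtgrm failed: %d\n", retval);
+ *              return -EIO;
+ *      }
+ */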
+
+/* Received a MAD */
+void
+vibnal_mad_received_cb(gsi_class_handle_t handle, void *context, gsi_dtgrm_t *dtgrm)
+{
+       sa_mad_v2_t *mad = (sa_mad_v2_t *) dtgrm->mad;
+       ib_service_record_v2_t *sr = (ib_service_record_v2_t *) mad->payload;
+       struct list_head *this;
+       struct sa_request *request;
+
+       CDEBUG(D_NET, "Received new MAD\n");
+
+       /* Validate the MAD */
+        if (mad->hdr.base_ver != MAD_IB_BASE_VERSION ||
+            mad->hdr.class != MAD_CLASS_SUBN_ADM ||
+            mad->hdr.class_ver != 2) {
+                CDEBUG(D_NET, "ignoring MAD (base_ver=%x, class=%x, class_ver=%x)\n",
+                       mad->hdr.base_ver, mad->hdr.class, mad->hdr.class_ver);
+                return;
+        }
+
+       /* We don't care about queries, only about responses */
+       if (mad->hdr.m.ms.r != 1) {
+               CDEBUG(D_NET, "ignoring MAD (response=%d)\n", mad->hdr.m.ms.r);
+               return;
+       }
+
+       /* We only care about service records and path records. */
+        if (mad->hdr.attrib_id != SA_SERVICE_RECORD &&
+            mad->hdr.attrib_id != SA_PATH_RECORD) {
+               CDEBUG(D_NET, "ignoring MAD (attrib_id=%x)\n", mad->hdr.attrib_id);
+               return;
+       }
+
+       /* Find the MAD request in our list */
+       request = NULL;
+
+       down(&kibnal_data.gsi_mutex);
+
+       list_for_each(this, &kibnal_data.gsi_pending) {
+               struct sa_request *_request = list_entry(this, struct sa_request, list);
+
+                CDEBUG(D_NET, "Comparing pending MAD TID "LPX64" with incoming "
+                       "MAD TID "LPX64"\n",
+                       _request->mad->hdr.transact_id, mad->hdr.transact_id);
+
+               if (_request->mad->hdr.transact_id == mad->hdr.transact_id) {
+                       CDEBUG(D_NET, "TIDs match\n");
+                       request = _request;
+                       break;
+               }
+       }
+
+       if (request == NULL) {
+               up(&kibnal_data.gsi_mutex);
+                CDEBUG(D_NET, "ignoring MAD (TID = "LPX64")\n", mad->hdr.transact_id);
+               return;
+       }
+
+       up(&kibnal_data.gsi_mutex);
+
+       /* Stop the timer and remove the request from the pending list of requests. */
+       del_timer_sync(&request->timer);
+
+       down(&kibnal_data.gsi_mutex);
+
+       list_del(&request->list);
+
+       up(&kibnal_data.gsi_mutex);
+
+       request->dtgrm_resp = dtgrm;
+
+       /* Depending on the response, update the status. This is not exact
+        * because a non-zero status is not always an error, but that
+        * should be good enough right now. */
+       /* TODO: fix. */
+       if (mad->hdr.u.ns.status.raw16) {
+               CDEBUG(D_NET, "MAD response has bad status: %x\n", mad->hdr.u.ns.status.raw16);
+               request->status = -EIO;
+       } else {
+               request->status = 0;
+       }
+
+       CDEBUG(D_NET, "incoming MAD successfully processed (status is %d)\n", request->status);
+
+       complete_sa_request(request);
+}
+
+/* MAD send completion */
+void
+vibnal_mad_sent_cb(gsi_class_handle_t handle, void *context, gsi_dtgrm_t * dtgrm)
+{
+       sa_mad_v2_t *mad = (sa_mad_v2_t *) dtgrm->mad;
+       
+       /* Don't do anything. We might have to resend the datagram later. */
+       CDEBUG(D_NET, "Datagram with TID "LPX64" sent.\n", mad->hdr.transact_id);
+}
+
+/*
+ * Fill in a path record query (SUBN_ADM_GET).  Tables are not supported.
+ * Note: dgid is in network order.
+ */
+static void fill_pathrecord_request(struct sa_request *request, vv_gid_t dgid)
+{
+        gsi_dtgrm_t *dtgrm = request->dtgrm_req;
+        sa_mad_v2_t *mad = (sa_mad_v2_t *) dtgrm->mad;
+        ib_path_record_v2_t *path = (ib_path_record_v2_t *) mad->payload;
+
+        memset(mad, 0, MAD_BLOCK_SIZE);
+
+        request->mad = mad;
+
+        dtgrm->rlid = kibnal_data.kib_port_attr.port_sma_address_info.sm_lid;
+        dtgrm->sl = kibnal_data.kib_port_attr.port_sma_address_info.service_level;
+
+        mad->hdr.base_ver = MAD_IB_BASE_VERSION;
+        mad->hdr.class = MAD_CLASS_SUBN_ADM;
+        mad->hdr.class_ver = 2;
+        mad->hdr.m.ms.method = SUBN_ADM_GET;
+        mad->hdr.attrib_id = SA_PATH_RECORD;   /* something(?) will swap that field */
+        mad->hdr.attrib_modifier = 0xFFFFFFFF; /* and that one too? */
+
+        /* Note: the transaction ID is set by the Voltaire stack if it is 0. */
+
+        /* TODO: replace these hardcoded values with something better */
+        mad->payload_len = cpu_to_be32(0x40 /*header size*/ + 0x35 /* PathRecord size */);
+
+        mad->component_mask = cpu_to_be64((1 << 2) |  /* DGID       */
+                                          (1 << 3) |  /* SGID       */
+                                          (1 << 12) | /* numb_paths */
+                                          (1 << 13)); /* P_key      */
+
+        path->pkey = cpu_to_be16(kibnal_data.kib_port_pkey);
+        path->sgid = kibnal_data.kib_port_gid;
+        gid_swap(&path->sgid);
+        path->dgid = dgid;              /* already in network order */
+        path->numb_path = 1;
+}
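+
+/*
+ * For reference, a worked value derived from the constants above: the
+ * component mask (1 << 2) | (1 << 3) | (1 << 12) | (1 << 13) evaluates to
+ * 0x300c, i.e. only the DGID, SGID, numb_paths and P_key fields of the
+ * PathRecord are significant in the query, and the advertised payload
+ * length is 0x40 + 0x35 = 0x75 bytes (header plus one PathRecord).
+ */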
+
+/* 
+ * Do a path record query
+ * If callback is NULL, the function is synchronous (and context is ignored).
+ * Note: dgid is in network order.
+ */
+/* TODO: passing a request is a bit of a hack, but since this function
+ * is called under interrupt, we cannot allocate memory here :(. */
+int kibnal_pathrecord_op(struct sa_request *request, vv_gid_t dgid, sa_request_cb_t callback, void *context)
+{
+        int ret;
+
+        LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+
+        fill_pathrecord_request(request, dgid);
+
+        if (callback) {
+                request->callback = callback;
+                request->context = context;
+        } else {
+                init_completion(&request->signal);
+        }
+
+        ret = vibnal_start_sa_request(request);
+        if (ret) {
+                CERROR("vibnal_start_sa_request failed: %d\n", ret);
+                free_sa_request(request);
+        } else {
+                if (callback) {
+                        /* Return. The callback will have to free the SA request. */
+                        ret = 0;
+                } else {
+                        wait_for_completion(&request->signal);
+
+                        ret = request->status;
+
+                        if (ret != 0) {
+                                CERROR ("Error %d in querying a path record\n", ret);
+                        }
+                        
+                        free_sa_request(request);
+                }
+        }
+
+        return ret;
+}
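+
+/*
+ * Example of how a process-context caller might use the query above (an
+ * illustrative sketch only; "peer_gid" and "my_done_cb" are hypothetical
+ * names):
+ *
+ *      struct sa_request *request = alloc_sa_request();
+ *      int rc;
+ *
+ *      if (request == NULL)
+ *              return -ENOMEM;
+ *
+ *      // Synchronous form: a NULL callback blocks until the SA replies or
+ *      // the retries run out; the request is freed internally either way.
+ *      rc = kibnal_pathrecord_op(request, peer_gid, NULL, NULL);
+ *
+ * With a non-NULL callback the call returns immediately; my_done_cb() is
+ * later invoked with the completed request, reads request->status and
+ * request->dtgrm_resp, and must call free_sa_request() itself.
+ */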
index b5286fc..85de4cf 100644 (file)
@@ -265,6 +265,7 @@ char *portals_nid2str(int nal, ptl_nid_t nid, char *str)
         case TCPNAL:
                 /* userspace NAL */
         case IIBNAL:
+        case VIBNAL:
         case OPENIBNAL:
         case RANAL:
         case SOCKNAL:
index 9c1537b..dbe264b 100644 (file)
@@ -78,6 +78,7 @@ static name2num_t nalnames[] = {
         {"gm",         GMNAL},
         {"openib",      OPENIBNAL},
         {"iib",         IIBNAL},
+        {"vib",         VIBNAL},
         {"lo",          LONAL},
         {"ra",          RANAL},
 #else
@@ -676,7 +677,8 @@ jt_ptl_print_peers (int argc, char **argv)
         int                      index;
         int                      rc;
 
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, RANAL, 0))
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, RANAL, 
+                                  OPENIBNAL, IIBNAL, VIBNAL, 0))
                 return -1;
 
         for (index = 0;;index++) {
@@ -693,6 +695,11 @@ jt_ptl_print_peers (int argc, char **argv)
                                 ptl_ipaddr_2_str (pcfg.pcfg_size, buffer[0], 1),
                                 ptl_ipaddr_2_str (pcfg.pcfg_id, buffer[1], 1),
                                 pcfg.pcfg_misc, pcfg.pcfg_count);
+                else if (g_nal_is_compatible(NULL, RANAL, 0))
+                        printf (LPX64"[%d]@%s:%d\n",
+                                pcfg.pcfg_nid, pcfg.pcfg_wait,
+                                ptl_ipaddr_2_str (pcfg.pcfg_id, buffer[1], 1),
+                                pcfg.pcfg_misc);
                 else
                         printf (LPX64"[%d]\n",
                                 pcfg.pcfg_nid, pcfg.pcfg_wait);
@@ -712,17 +719,18 @@ jt_ptl_add_peer (int argc, char **argv)
         int                      port = 0;
         int                      rc;
 
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, RANAL, 0))
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, RANAL, 
+                                  OPENIBNAL, IIBNAL, VIBNAL, 0))
                 return -1;
 
         if (g_nal_is_compatible(NULL, SOCKNAL, RANAL, 0)) {
                 if (argc != 4) {
-                        fprintf (stderr, "usage(tcp): %s nid ipaddr port\n", 
+                        fprintf (stderr, "usage(tcp,ra): %s nid ipaddr port\n", 
                                  argv[0]);
                         return 0;
                 }
         } else if (argc != 2) {
-                fprintf (stderr, "usage(openib,iib): %s nid\n", argv[0]);
+                fprintf (stderr, "usage(openib,iib,vib): %s nid\n", argv[0]);
                 return 0;
         }
 
@@ -769,7 +777,8 @@ jt_ptl_del_peer (int argc, char **argv)
         int                      argidx;
         int                      rc;
 
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, RANAL, 0))
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, RANAL, 
+                                  OPENIBNAL, IIBNAL, VIBNAL, 0))
                 return -1;
 
         if (g_nal_is_compatible(NULL, SOCKNAL, 0)) {
@@ -832,7 +841,8 @@ jt_ptl_print_connections (int argc, char **argv)
         int                      index;
         int                      rc;
 
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, RANAL, 0))
+        if (!g_nal_is_compatible (argv[0], SOCKNAL, RANAL, 
+                                  OPENIBNAL, IIBNAL, VIBNAL, 0))
                 return -1;
 
         for (index = 0;;index++) {
@@ -857,6 +867,10 @@ jt_ptl_print_connections (int argc, char **argv)
                                 pcfg.pcfg_count,        /* tx buffer size */
                                 pcfg.pcfg_size,         /* rx buffer size */
                                 pcfg.pcfg_wait ? "nagle" : "nonagle");
+                else if (g_nal_is_compatible (NULL, RANAL, 0))
+                        printf ("[%d]"LPX64"\n",
+                                pcfg.pcfg_id,       /* device id */
+                                pcfg.pcfg_nid);
                 else
                         printf (LPX64"\n",
                                 pcfg.pcfg_nid);
@@ -1023,7 +1037,8 @@ int jt_ptl_disconnect(int argc, char **argv)
                 return 0;
         }
 
-        if (!g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, IIBNAL, RANAL, 0))
+        if (!g_nal_is_compatible (NULL, SOCKNAL, RANAL, 
+                                  OPENIBNAL, IIBNAL, VIBNAL, 0))
                 return 0;
 
         if (argc >= 2 &&