Whamcloud - gitweb
LU-1419 lnet: Add support for Cray's Gemini interconnect
author James Simmons <uja.ornl@gmail.com>
Wed, 5 Dec 2012 18:54:39 +0000 (13:54 -0500)
committer Oleg Drokin <green@whamcloud.com>
Wed, 19 Dec 2012 22:31:17 +0000 (17:31 -0500)
This patch adds LNET support for Cray's Gemini
interconnect on their newer systems. The gnilnd was
originally based on the ralnd.

Signed-off-by: James Simmons <uja.ornl@gmail.com>
Signed-off-by: Chris Horn <hornc@cray.com>
Signed-off-by: Cory Spitz <spitzcor@cray.com>
Change-Id: Ia98a44f4f3d68773438d820c49fe554a3d551dc5
Reviewed-on: http://review.whamcloud.com/3381
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Isaac Huang <he.huang@intel.com>
Reviewed-by: Doug Oucharek <doug.s.oucharek@intel.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
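
The new LND is off by default: the LN_CONFIG_GNILND macro added below only probes for the GNI kernel headers when configure is run with --enable-gni, and it expects GNICPPFLAGS to point at those headers (per the comment in the macro, this is normally set by the spec file). A minimal sketch of enabling it, with a hypothetical header path:

    # hypothetical include path; GNICPPFLAGS normally comes from the Cray build spec file
    GNICPPFLAGS="-I/usr/include/gni" ./configure --enable-gni

If the compile test against gni_pub.h fails, configure aborts with "can't compile gnilnd with given GNICPPFLAGS"; if the optional krca_lib.h test passes as well, RCA support is enabled by adding -DGNILND_USE_RCA=1 to GNICPPFLAGS.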
19 files changed:
lnet/autoconf/lustre-lnet.m4
lnet/klnds/Makefile.in
lnet/klnds/autoMakefile.am
lnet/klnds/gnilnd/Makefile.in [new file with mode: 0644]
lnet/klnds/gnilnd/autoMakefile.am [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd.c [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd.h [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd_api_wrap.h [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd_cb.c [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd_conn.c [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd_debug.c [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd_hss_ops.h [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd_modparams.c [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd_proc.c [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd_stack.c [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd_sysctl.c [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd_version.h [new file with mode: 0644]
lnet/utils/debug.c
lnet/utils/portals.c

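The Makefile and autoconf changes below wire the new gnilnd directory into the klnds build so that, when GNI support is enabled, a kgnilnd kernel module is produced alongside the other LNDs (the module name comes from MODULES := kgnilnd in the new Makefile.in). A rough sketch of the resulting workflow follows; the load step and network-type name are assumptions about the usual LND usage, not something shown in this patch:

    # after a gni-enabled build and install, load the LND before bringing up LNet
    modprobe kgnilnd
    modprobe lnet networks=gni    # assumes the conventional "gni" network type name
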
index 4cd30eb..9d9ca11 100644
@@ -526,13 +526,88 @@ AC_SUBST(RACPPFLAGS)
 AC_SUBST(RALND)
 ])
 
+#
+# LN_CONFIG_GNILND
+#
+# check whether to use the Gemini Network Interface lnd
+#
+AC_DEFUN([LN_CONFIG_GNILND],
+[#### Gemini Network Interface
+AC_MSG_CHECKING([whether to enable GNI lnd])
+AC_ARG_ENABLE([gni],
+       AC_HELP_STRING([--enable-gni],
+                       [enable GNI lnd]),
+       [],[enable_gni='no'])
+AC_MSG_RESULT([$enable_gni])
+
+if test x$enable_gni = xyes ; then
+       AC_MSG_CHECKING([if GNI kernel headers are present])
+       # placeholder
+       # GNICPPFLAGS was set in spec file
+       EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
+       EXTRA_KCFLAGS="$EXTRA_KCFLAGS $GNICPPFLAGS"
+       LB_LINUX_TRY_COMPILE([
+               #include <linux/types.h>
+               #include <gni_pub.h>
+       ],[
+               gni_cdm_handle_t        kgni_domain;
+               gni_return_t            rc;
+               int                     rrc;
+
+               rc = gni_cdm_create(0, 1, 1, 0, &kgni_domain);
+
+               rrc = (rc == GNI_RC_SUCCESS) ? 0 : 1;
+
+               return rrc;
+       ],[
+               AC_MSG_RESULT([yes])
+               GNILND="gnilnd"
+       ],[
+               AC_MSG_RESULT([no])
+               AC_MSG_ERROR([can't compile gnilnd with given GNICPPFLAGS: $GNICPPFLAGS])
+       ])
+       # at this point, we have gnilnd basic support, now check for extra features
+       AC_MSG_CHECKING([to use RCA in gnilnd])
+       LB_LINUX_TRY_COMPILE([
+               #include <linux/types.h>
+               #include <gni_pub.h>
+               #include <krca_lib.h>
+       ],[
+               gni_cdm_handle_t        kgni_domain;
+               gni_return_t            rc;
+               krca_ticket_t           ticket = KRCA_NULL_TICKET;
+               int                     rrc;
+               __u32                   nid = 0, nic_addr;
+
+               rc = gni_cdm_create(0, 1, 1, 0, &kgni_domain);
+
+               rrc = (rc == GNI_RC_SUCCESS) ? 0 : 1;
+
+               rrc += krca_nid_to_nicaddrs(nid, 1, &nic_addr);
+
+               rrc += krca_register(&ticket, RCA_MAKE_SERVICE_INDEX(RCA_IO_CLASS, 9), 99, 0);
+
+               return rrc;
+       ],[
+               AC_MSG_RESULT([yes])
+               GNICPPFLAGS="$GNICPPFLAGS -DGNILND_USE_RCA=1"
+               GNILNDRCA="gnilndrca"
+       ],[
+               AC_MSG_RESULT([no])
+       ])
+       EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
+fi
+AC_SUBST(GNICPPFLAGS)
+AC_SUBST(GNILNDRCA)
+AC_SUBST(GNILND)
+])
 
 
 #
 #
 # LN_CONFIG_USERSPACE
 #
-# This is defined but empty because it is called from 
+# This is defined but empty because it is called from
 # build/autconf/lustre-build.m4 which is shared by all branches.
 #
 AC_DEFUN([LN_CONFIG_USERSPACE],
@@ -598,6 +673,7 @@ LN_CONFIG_BACKOFF
 LN_CONFIG_QUADRICS
 LN_CONFIG_O2IB
 LN_CONFIG_RALND
+LN_CONFIG_GNILND
 LN_CONFIG_PTLLND
 LN_CONFIG_MX
 # 2.6.32
@@ -740,6 +816,8 @@ AC_DEFUN([LN_CONDITIONALS],
 AM_CONDITIONAL(BUILD_MXLND, test x$MXLND = "xmxlnd")
 AM_CONDITIONAL(BUILD_O2IBLND, test x$O2IBLND = "xo2iblnd")
 AM_CONDITIONAL(BUILD_RALND, test x$RALND = "xralnd")
+AM_CONDITIONAL(BUILD_GNILND, test x$GNILND = "xgnilnd")
+AM_CONDITIONAL(BUILD_GNILND_RCA, test x$GNILNDRCA = "xgnilndrca")
 AM_CONDITIONAL(BUILD_PTLLND, test x$PTLLND = "xptllnd")
 AM_CONDITIONAL(BUILD_USOCKLND, test x$USOCKLND = "xusocklnd")
 ])
@@ -769,6 +847,8 @@ lnet/klnds/qswlnd/Makefile
 lnet/klnds/qswlnd/autoMakefile
 lnet/klnds/ralnd/Makefile
 lnet/klnds/ralnd/autoMakefile
+lnet/klnds/gnilnd/Makefile
+lnet/klnds/gnilnd/autoMakefile
 lnet/klnds/socklnd/Makefile
 lnet/klnds/socklnd/autoMakefile
 lnet/klnds/ptllnd/Makefile
index f0586ae..0d99a87 100644
@@ -1,5 +1,6 @@
 @BUILD_MXLND_TRUE@subdir-m += mxlnd
 @BUILD_RALND_TRUE@subdir-m += ralnd
+@BUILD_GNILND_TRUE@subdir-m += gnilnd
 @BUILD_O2IBLND_TRUE@subdir-m += o2iblnd
 @BUILD_QSWLND_TRUE@subdir-m += qswlnd
 @BUILD_PTLLND_TRUE@subdir-m += ptllnd
index 57d709c..1591d87 100644
@@ -34,4 +34,4 @@
 # Lustre is a trademark of Sun Microsystems, Inc.
 #
 
-SUBDIRS = socklnd qswlnd mxlnd ralnd ptllnd o2iblnd
+SUBDIRS = socklnd qswlnd mxlnd ralnd gnilnd ptllnd o2iblnd
diff --git a/lnet/klnds/gnilnd/Makefile.in b/lnet/klnds/gnilnd/Makefile.in
new file mode 100644
index 0000000..14e8c30
--- /dev/null
@@ -0,0 +1,9 @@
+MODULES := kgnilnd
+kgnilnd-objs := gnilnd.o gnilnd_cb.o gnilnd_modparams.o gnilnd_debug.o gnilnd_proc.o \
+               gnilnd_sysctl.o gnilnd_stack.o gnilnd_conn.o
+
+EXTRA_POST_CFLAGS := -D"SVN_CODE_REV=KBUILD_STR(${SVN_CODE_REV})" @GNICPPFLAGS@
+
+EXTRA_DIST = $(kgnilnd-objs:%.o=%.c) gnilnd.h gnilnd_api_wrap.h
+
+@INCLUDE_RULES@
diff --git a/lnet/klnds/gnilnd/autoMakefile.am b/lnet/klnds/gnilnd/autoMakefile.am
new file mode 100644
index 0000000..888b68e
--- /dev/null
@@ -0,0 +1,12 @@
+# Copyright (C) 2009  Cray, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+if MODULES
+if BUILD_GNILND
+modulenet_DATA = kgnilnd$(KMODEXT)
+endif
+endif
+
+MOSTLYCLEANFILES = @MOSTLYCLEANFILES@
diff --git a/lnet/klnds/gnilnd/gnilnd.c b/lnet/klnds/gnilnd/gnilnd.c
new file mode 100644
index 0000000..fcc05fa
--- /dev/null
@@ -0,0 +1,2698 @@
+/*
+ * Copyright (C) 2012 Cray, Inc.
+ *
+ *   Author: Igor Gorodetsky <iogordet@cray.com>
+ *   Author: Nic Henke <nic@cray.com>
+ *   Author: James Shimek <jshimek@cray.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#include "gnilnd.h"
+
+/* Primary entry points from LNET.  There are no guarantees against reentrance. */
+lnd_t the_kgnilnd = {
+       .lnd_type       = GNILND,
+       .lnd_startup    = kgnilnd_startup,
+       .lnd_shutdown   = kgnilnd_shutdown,
+       .lnd_ctl        = kgnilnd_ctl,
+       .lnd_send       = kgnilnd_send,
+       .lnd_recv       = kgnilnd_recv,
+       .lnd_eager_recv = kgnilnd_eager_recv,
+       .lnd_query      = kgnilnd_query,
+};
+
+kgn_data_t      kgnilnd_data;
+kgn_hssops_t   kgnilnd_hssops;
+
+/* needs write_lock on kgn_peer_conn_lock */
+int
+kgnilnd_close_stale_conns_locked(kgn_peer_t *peer, kgn_conn_t *newconn)
+{
+       kgn_conn_t         *conn;
+       struct list_head   *ctmp, *cnxt;
+       int                 loopback;
+       int                 count = 0;
+
+       loopback = peer->gnp_nid == peer->gnp_net->gnn_ni->ni_nid;
+
+       list_for_each_safe(ctmp, cnxt, &peer->gnp_conns) {
+               conn = list_entry(ctmp, kgn_conn_t, gnc_list);
+
+               if (conn->gnc_state != GNILND_CONN_ESTABLISHED)
+                       continue;
+
+               if (conn == newconn)
+                       continue;
+
+               if (conn->gnc_device != newconn->gnc_device)
+                       continue;
+
+               /* This is a two connection loopback - one talking to the other */
+               if (loopback &&
+                   newconn->gnc_my_connstamp == conn->gnc_peer_connstamp &&
+                   newconn->gnc_peer_connstamp == conn->gnc_my_connstamp) {
+                       CDEBUG(D_NET, "skipping prune of %p, "
+                               "loopback and matching stamps"
+                               " connstamp "LPU64"("LPU64")"
+                               " peerstamp "LPU64"("LPU64")\n",
+                               conn, newconn->gnc_my_connstamp,
+                               conn->gnc_peer_connstamp,
+                               newconn->gnc_peer_connstamp,
+                               conn->gnc_my_connstamp);
+                       continue;
+               }
+
+               if (conn->gnc_peerstamp != newconn->gnc_peerstamp) {
+                       LASSERTF(conn->gnc_peerstamp < newconn->gnc_peerstamp,
+                               "conn 0x%p peerstamp "LPU64" >= "
+                               "newconn 0x%p peerstamp "LPU64"\n",
+                               conn, conn->gnc_peerstamp,
+                               newconn, newconn->gnc_peerstamp);
+
+                       CDEBUG(D_NET, "Closing stale conn nid: %s "
+                              " peerstamp:"LPX64"("LPX64")\n",
+                              libcfs_nid2str(peer->gnp_nid),
+                              conn->gnc_peerstamp, newconn->gnc_peerstamp);
+               } else {
+
+                       LASSERTF(conn->gnc_peer_connstamp < newconn->gnc_peer_connstamp,
+                               "conn 0x%p peer_connstamp "LPU64" >= "
+                               "newconn 0x%p peer_connstamp "LPU64"\n",
+                               conn, conn->gnc_peer_connstamp,
+                               newconn, newconn->gnc_peer_connstamp);
+
+                       CDEBUG(D_NET, "Closing stale conn nid: %s"
+                              " connstamp:"LPU64"("LPU64")\n",
+                              libcfs_nid2str(peer->gnp_nid),
+                              conn->gnc_peer_connstamp, newconn->gnc_peer_connstamp);
+               }
+
+               count++;
+               kgnilnd_close_conn_locked(conn, -ESTALE);
+       }
+
+       if (count != 0) {
+               CWARN("Closed %d stale conns to %s\n", count, libcfs_nid2str(peer->gnp_nid));
+       }
+
+       RETURN(count);
+}
+
+int
+kgnilnd_conn_isdup_locked(kgn_peer_t *peer, kgn_conn_t *newconn)
+{
+       kgn_conn_t       *conn;
+       struct list_head *tmp;
+       int               loopback;
+       ENTRY;
+
+       loopback = peer->gnp_nid == peer->gnp_net->gnn_ni->ni_nid;
+
+       list_for_each(tmp, &peer->gnp_conns) {
+               conn = list_entry(tmp, kgn_conn_t, gnc_list);
+               CDEBUG(D_NET, "checking conn 0x%p for peer %s"
+                       " lo %d new "LPU64" existing "LPU64
+                       " new peer "LPU64" existing peer "LPU64
+                       " new dev %p existing dev %p\n",
+                       conn, libcfs_nid2str(peer->gnp_nid),
+                       loopback,
+                       newconn->gnc_peerstamp, conn->gnc_peerstamp,
+                       newconn->gnc_peer_connstamp, conn->gnc_peer_connstamp,
+                       newconn->gnc_device, conn->gnc_device);
+
+               /* conn is in the process of closing */
+               if (conn->gnc_state != GNILND_CONN_ESTABLISHED)
+                       continue;
+
+               /* 'newconn' is from an earlier version of 'peer'!!! */
+               if (newconn->gnc_peerstamp < conn->gnc_peerstamp)
+                       RETURN(1);
+
+               /* 'conn' is from an earlier version of 'peer': it will be
+                * removed when we cull stale conns later on... */
+               if (newconn->gnc_peerstamp > conn->gnc_peerstamp)
+                       continue;
+
+               /* Different devices are OK */
+               if (conn->gnc_device != newconn->gnc_device)
+                       continue;
+
+               /* It's me connecting to myself */
+               if (loopback &&
+                   newconn->gnc_my_connstamp == conn->gnc_peer_connstamp &&
+                   newconn->gnc_peer_connstamp == conn->gnc_my_connstamp)
+                       continue;
+
+               /* 'newconn' is an earlier connection from 'peer'!!! */
+               if (newconn->gnc_peer_connstamp < conn->gnc_peer_connstamp)
+                       RETURN(2);
+
+               /* 'conn' is an earlier connection from 'peer': it will be
+                * removed when we cull stale conns later on... */
+               if (newconn->gnc_peer_connstamp > conn->gnc_peer_connstamp)
+                       continue;
+
+               /* 'newconn' has the SAME connection stamp; 'peer' isn't
+                * playing the game... */
+               RETURN(3);
+       }
+
+       RETURN(0);
+}
+
+int
+kgnilnd_create_conn(kgn_conn_t **connp, kgn_device_t *dev)
+{
+       kgn_conn_t    *conn;
+       gni_return_t   rrc;
+       int            rc = 0;
+
+       LASSERT (!in_interrupt());
+       atomic_inc(&kgnilnd_data.kgn_nconns);
+
+       /* divide by 2 to allow for complete reset and immediate reconnect */
+       if (atomic_read(&kgnilnd_data.kgn_nconns) >= GNILND_MAX_CQID/2) {
+               CERROR("Too many conns are live: %d > %d\n",
+                       atomic_read(&kgnilnd_data.kgn_nconns), GNILND_MAX_CQID/2);
+               atomic_dec(&kgnilnd_data.kgn_nconns);
+               return -E2BIG;
+       }
+
+       LIBCFS_ALLOC(conn, sizeof(*conn));
+       if (conn == NULL) {
+               atomic_dec(&kgnilnd_data.kgn_nconns);
+               return -ENOMEM;
+       }
+
+       LIBCFS_ALLOC(conn->gnc_tx_ref_table, GNILND_MAX_MSG_ID * sizeof(void *));
+       if (conn->gnc_tx_ref_table == NULL) {
+               CERROR("Can't allocate conn tx_ref_table\n");
+               rc = -ENOMEM;
+               GOTO(failed, rc);
+       }
+
+       atomic_set(&conn->gnc_refcount, 1);
+       atomic_set(&conn->gnc_reaper_noop, 0);
+       atomic_set(&conn->gnc_sched_noop, 0);
+       INIT_LIST_HEAD(&conn->gnc_list);
+       INIT_LIST_HEAD(&conn->gnc_hashlist);
+       INIT_LIST_HEAD(&conn->gnc_schedlist);
+       INIT_LIST_HEAD(&conn->gnc_fmaq);
+       INIT_LIST_HEAD(&conn->gnc_mdd_list);
+       spin_lock_init(&conn->gnc_list_lock);
+       spin_lock_init(&conn->gnc_tx_lock);
+
+       /* set tx id to nearly the end to make sure we find wrapping
+        * issues soon */
+       conn->gnc_next_tx = (int) GNILND_MAX_MSG_ID - 10;
+
+       /* if this fails, we have conflicts and MAX_TX is too large */
+       CLASSERT(GNILND_MAX_MSG_ID < GNILND_MSGID_CLOSE);
+
+       /* get a new unique CQ id for this conn */
+       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       conn->gnc_my_connstamp = kgnilnd_data.kgn_connstamp++;
+       conn->gnc_cqid = kgnilnd_get_cqid_locked();
+       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       if (conn->gnc_cqid == 0) {
+               CERROR("Could not allocate unique CQ ID for conn 0x%p\n", conn);
+               rc = -E2BIG;
+               GOTO(failed, rc);
+       }
+
+       CDEBUG(D_NET, "alloc cqid %u for conn 0x%p\n",
+               conn->gnc_cqid, conn);
+
+       /* need to be set before gnc_ephandle to allow kgnilnd_destroy_conn_ep to
+        * check context */
+       conn->gnc_device = dev;
+
+       conn->gnc_timeout = MAX(*kgnilnd_tunables.kgn_timeout,
+                               GNILND_MIN_TIMEOUT);
+       kgnilnd_update_reaper_timeout(conn->gnc_timeout);
+
+       /* this is the ep_handle for doing SMSG & BTE */
+       mutex_lock(&dev->gnd_cq_mutex);
+       rrc = kgnilnd_ep_create(dev->gnd_handle, dev->gnd_snd_fma_cqh,
+                               &conn->gnc_ephandle);
+       mutex_unlock(&dev->gnd_cq_mutex);
+       if (rrc != GNI_RC_SUCCESS) {
+               rc = -ENETDOWN;
+               GOTO(failed, rc);
+       }
+
+       CDEBUG(D_NET, "created conn 0x%p ep_hndl 0x%p\n",
+              conn, conn->gnc_ephandle);
+
+       /* add ref for EP canceling */
+       kgnilnd_conn_addref(conn);
+       atomic_inc(&dev->gnd_neps);
+
+       *connp = conn;
+       return 0;
+
+failed:
+       atomic_dec(&kgnilnd_data.kgn_nconns);
+       LIBCFS_FREE(conn->gnc_tx_ref_table, GNILND_MAX_MSG_ID * sizeof(void *));
+       LIBCFS_FREE(conn, sizeof(*conn));
+       return rc;
+}
+
+/* needs to be called with kgn_peer_conn_lock held (read or write) */
+kgn_conn_t *
+kgnilnd_find_conn_locked(kgn_peer_t *peer)
+{
+       kgn_conn_t      *conn = NULL;
+       ENTRY;
+
+       /* if we are in reset, this conn is going to die soon */
+       if (unlikely(kgnilnd_data.kgn_in_reset)) {
+               RETURN(NULL);
+       }
+
+       /* just return the first ESTABLISHED connection */
+       list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
+               /* kgnilnd_finish_connect doesn't put connections on the
+                * peer list until they are actually established */
+               LASSERTF(conn->gnc_state >= GNILND_CONN_ESTABLISHED,
+                       "found conn %p state %s on peer %p (%s)\n",
+                       conn, kgnilnd_conn_state2str(conn), peer,
+                       libcfs_nid2str(peer->gnp_nid));
+               if (conn->gnc_state != GNILND_CONN_ESTABLISHED)
+                       continue;
+
+               RETURN(conn);
+       }
+       RETURN(NULL);
+}
+
+/* needs write_lock on kgn_peer_conn_lock held */
+kgn_conn_t *
+kgnilnd_find_or_create_conn_locked(kgn_peer_t *peer) {
+
+       kgn_device_t    *dev = peer->gnp_net->gnn_dev;
+       kgn_conn_t      *conn;
+
+       conn = kgnilnd_find_conn_locked(peer);
+
+       if (conn != NULL) {
+               return conn;
+       }
+
+       /* if the peer was previously connecting, check if we should
+        * trigger another connection attempt yet. */
+       if (time_before(jiffies, peer->gnp_reconnect_time)) {
+               return NULL;
+       }
+
+       /* This check prevents us from creating a new connection to a peer while we are
+        * still in the process of closing an existing connection to the peer.
+        */
+       list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
+               if (conn->gnc_ephandle != NULL) {
+                       CDEBUG(D_NET, "Not connecting non-null ephandle found peer 0x%p->%s\n", peer,
+                               libcfs_nid2str(peer->gnp_nid));
+                       return NULL;
+               }
+       }
+
+       if (peer->gnp_connecting != GNILND_PEER_IDLE) {
+               /* if we are anything but IDLE, don't start a new connection */
+               return NULL;
+       }
+
+       CDEBUG(D_NET, "starting connect to %s\n",
+               libcfs_nid2str(peer->gnp_nid));
+       peer->gnp_connecting = GNILND_PEER_CONNECT;
+       kgnilnd_peer_addref(peer); /* extra ref for connd */
+
+       spin_lock(&dev->gnd_connd_lock);
+       list_add_tail(&peer->gnp_connd_list, &dev->gnd_connd_peers);
+       spin_unlock(&dev->gnd_connd_lock);
+
+       kgnilnd_schedule_dgram(dev);
+       CDEBUG(D_NETTRACE, "scheduling new connect\n");
+
+       return NULL;
+}
+
+/* Caller is responsible for deciding if/when to call this */
+void
+kgnilnd_destroy_conn_ep(kgn_conn_t *conn)
+{
+       gni_return_t    rrc;
+       gni_ep_handle_t tmp_ep;
+
+       /* only tear down the EP if we actually initialized it;
+        * setting it to NULL tells kgnilnd_destroy_conn to leave it alone */
+
+       tmp_ep = xchg(&conn->gnc_ephandle, NULL);
+       if (tmp_ep != NULL) {
+               /* we never re-use the EP, so unbind is not needed */
+               mutex_lock(&conn->gnc_device->gnd_cq_mutex);
+               rrc = kgnilnd_ep_destroy(tmp_ep);
+
+               mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+
+               /* if this fails, it could hork up kgni smsg retransmit and others
+                * since we could free the SMSG mbox memory, etc. */
+               LASSERTF(rrc == GNI_RC_SUCCESS, "rrc %d conn 0x%p ep 0x%p\n",
+                        rrc, conn, conn->gnc_ephandle);
+
+               atomic_dec(&conn->gnc_device->gnd_neps);
+
+               /* clear out count added in kgnilnd_close_conn_locked
+                * conn will have a peer once it hits finish_connect, where it
+                * is the first spot we'll mark it ESTABLISHED as well */
+               if (conn->gnc_peer) {
+                       kgnilnd_admin_decref(conn->gnc_peer->gnp_dirty_eps);
+               }
+
+               /* drop ref for EP */
+               kgnilnd_conn_decref(conn);
+       }
+}
+
+void
+kgnilnd_destroy_conn(kgn_conn_t *conn)
+{
+       LASSERTF(!in_interrupt() &&
+               !conn->gnc_scheduled &&
+               !conn->gnc_in_purgatory &&
+               conn->gnc_ephandle == NULL &&
+               list_empty(&conn->gnc_list) &&
+               list_empty(&conn->gnc_hashlist) &&
+               list_empty(&conn->gnc_schedlist) &&
+               list_empty(&conn->gnc_mdd_list),
+               "conn 0x%p->%s IRQ %d sched %d purg %d ep 0x%p lists %d/%d/%d/%d\n",
+               conn, conn->gnc_peer ? libcfs_nid2str(conn->gnc_peer->gnp_nid)
+                                    : "<?>",
+               !!in_interrupt(), conn->gnc_scheduled,
+               conn->gnc_in_purgatory,
+               conn->gnc_ephandle,
+               list_empty(&conn->gnc_list),
+               list_empty(&conn->gnc_hashlist),
+               list_empty(&conn->gnc_schedlist),
+               list_empty(&conn->gnc_mdd_list));
+
+       /* Tripping these is especially bad, as it means we have items on the
+        *  lists that didn't keep their refcount on the connection - or
+        *  somebody evil released their own */
+       LASSERTF(list_empty(&conn->gnc_fmaq) &&
+                atomic_read(&conn->gnc_nlive_fma) == 0 &&
+                atomic_read(&conn->gnc_nlive_rdma) == 0,
+                "conn 0x%p fmaq %d@0x%p nfma %d nrdma %d\n",
+                conn, kgnilnd_count_list(&conn->gnc_fmaq), &conn->gnc_fmaq,
+                atomic_read(&conn->gnc_nlive_fma), atomic_read(&conn->gnc_nlive_rdma));
+
+       CDEBUG(D_NET, "destroying conn %p ephandle %p error %d\n",
+               conn, conn->gnc_ephandle, conn->gnc_error);
+
+       /* if there is an FMA blk left here, we'll tear it down */
+       if (conn->gnc_fma_blk) {
+               kgnilnd_release_mbox(conn, 0);
+       }
+
+       if (conn->gnc_peer != NULL)
+               kgnilnd_peer_decref(conn->gnc_peer);
+
+       if (conn->gnc_tx_ref_table != NULL) {
+               LIBCFS_FREE(conn->gnc_tx_ref_table,
+                           GNILND_MAX_MSG_ID * sizeof(void *));
+       }
+
+       LIBCFS_FREE(conn, sizeof(*conn));
+       atomic_dec(&kgnilnd_data.kgn_nconns);
+}
+
+/* peer_alive and peer_notify done in the style of the o2iblnd */
+void
+kgnilnd_peer_alive(kgn_peer_t *peer)
+{
+       set_mb(peer->gnp_last_alive, jiffies);
+}
+
+void
+kgnilnd_peer_notify(kgn_peer_t *peer, int error)
+{
+       int                     tell_lnet = 0;
+       int                     nnets = 0;
+       int                     rc;
+       int                     i, j;
+       kgn_conn_t             *conn;
+       kgn_net_t             **nets;
+       kgn_net_t              *net;
+
+
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DONT_NOTIFY))
+               return;
+
+       /* Tell LNet we are giving up on this peer - but only
+        * if it isn't already reconnected or trying to reconnect */
+       read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       /* use kgnilnd_find_conn_locked to avoid any conns in the process of being nuked
+        *
+        * don't tell LNet if we are in reset - we assume that everyone will be able to
+        * reconnect just fine
+        */
+       conn = kgnilnd_find_conn_locked(peer);
+
+       CDEBUG(D_NETTRACE, "peer 0x%p->%s ting %d conn 0x%p, rst %d error %d\n",
+              peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting, conn,
+              kgnilnd_data.kgn_in_reset, error);
+
+       if ((peer->gnp_connecting == GNILND_PEER_IDLE) &&
+           (conn == NULL) &&
+           (!kgnilnd_data.kgn_in_reset) &&
+           (!kgnilnd_conn_clean_errno(error))) {
+               tell_lnet = 1;
+       }
+
+       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       if (!tell_lnet) {
+               /* short circuit if we don't need to notify LNet */
+               return;
+       }
+
+       rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem);
+
+       if (rc) {
+               /* only proceed if we got the sem; if the trylock failed,
+                * LNet is in shutdown or something else is tearing down */
+
+               for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
+                       list_for_each_entry(net , &kgnilnd_data.kgn_nets[i], gnn_list) {
+                               /* if gnn_shutdown set for any net shutdown is in progress just return */
+                               if (net->gnn_shutdown) {
+                                       up_read(&kgnilnd_data.kgn_net_rw_sem);
+                                       return;
+                               }
+                               nnets++;
+                       }
+               }
+
+               if (nnets == 0) {
+                       /* shutdown in progress most likely */
+                       up_read(&kgnilnd_data.kgn_net_rw_sem);
+                       return;
+               }
+
+               LIBCFS_ALLOC(nets, nnets * sizeof(*nets));
+
+               if (nets == NULL) {
+                       up_read(&kgnilnd_data.kgn_net_rw_sem);
+                       CERROR("Failed to allocate nets[%d]\n", nnets);
+                       return;
+               }
+
+               j = 0;
+               for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
+                       list_for_each_entry(net, &kgnilnd_data.kgn_nets[i], gnn_list) {
+                               nets[j] = net;
+                               kgnilnd_net_addref(net);
+                               j++;
+                       }
+               }
+               up_read(&kgnilnd_data.kgn_net_rw_sem);
+
+               for (i = 0; i < nnets; i++) {
+                       lnet_nid_t peer_nid;
+
+                       net = nets[i];
+
+                       peer_nid = kgnilnd_lnd2lnetnid(net->gnn_ni->ni_nid,
+                                                                peer->gnp_nid);
+
+                       CDEBUG(D_NET, "peer 0x%p->%s last_alive %lu (%lus ago)\n",
+                               peer, libcfs_nid2str(peer_nid), peer->gnp_last_alive,
+                               cfs_duration_sec(jiffies - peer->gnp_last_alive));
+
+                       lnet_notify(net->gnn_ni, peer_nid, 0, peer->gnp_last_alive);
+
+
+                       kgnilnd_net_decref(net);
+               }
+
+               LIBCFS_FREE(nets, nnets * sizeof(*nets));
+       }
+}
+
+/* need write_lock on kgn_peer_conn_lock */
+void
+kgnilnd_close_conn_locked(kgn_conn_t *conn, int error)
+{
+       kgn_peer_t        *peer = conn->gnc_peer;
+       ENTRY;
+
+       LASSERT(!in_interrupt());
+
+       /* store error for tx completion */
+       conn->gnc_error = error;
+       peer->gnp_last_errno = error;
+
+       /* use real error from peer if possible */
+       if (error == -ECONNRESET) {
+               error = conn->gnc_peer_error;
+       }
+
+       /* if we NETERROR, make sure it is rate limited */
+       if (!kgnilnd_conn_clean_errno(error)) {
+               CNETERR("closing conn to %s: error %d\n",
+                      libcfs_nid2str(peer->gnp_nid), error);
+       } else {
+               CDEBUG(D_NET, "closing conn to %s: error %d\n",
+                      libcfs_nid2str(peer->gnp_nid), error);
+       }
+
+       LASSERTF(conn->gnc_state == GNILND_CONN_ESTABLISHED,
+               "conn %p to %s with bogus state %s\n", conn,
+               libcfs_nid2str(conn->gnc_peer->gnp_nid),
+               kgnilnd_conn_state2str(conn));
+       LASSERT(!list_empty(&conn->gnc_hashlist));
+       LASSERT(!list_empty(&conn->gnc_list));
+
+
+       /* mark peer count here so any place the EP gets destroyed will
+        * open up the peer count so that a new ESTABLISHED conn is then free
+        * to send new messages -- sending before the previous EPs are destroyed
+        * could end up with messages on the network for the old conn _after_
+        * the new conn and break the mbox safety protocol */
+       kgnilnd_admin_addref(conn->gnc_peer->gnp_dirty_eps);
+
+       /* Remove from conn hash table: no new callbacks */
+       list_del_init(&conn->gnc_hashlist);
+       kgnilnd_data.kgn_conn_version++;
+
+       /* if we are in reset, go right to CLOSED as there is no scheduler
+        * thread to move from CLOSING to CLOSED */
+       if (unlikely(kgnilnd_data.kgn_in_reset)) {
+               conn->gnc_state = GNILND_CONN_CLOSED;
+       } else {
+               conn->gnc_state = GNILND_CONN_CLOSING;
+       }
+
+       /* leave on peer->gnp_conns to make sure we don't let the reaper
+        * or others try to unlink this peer until the conn is fully
+        * processed for closing */
+
+       if (kgnilnd_check_purgatory_conn(conn)) {
+               kgnilnd_add_purgatory_locked(conn, conn->gnc_peer);
+       }
+
+       /* Reset RX timeout to ensure we wait for an incoming CLOSE
+        * for the full timeout.  If we get a CLOSE we know the
+        * peer has stopped all RDMA.  Otherwise if we wait for
+        * the full timeout we can also be sure all RDMA has stopped. */
+       conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
+       mb();
+
+       /* schedule sending CLOSE - if we are in quiesce, this adds to
+        * gnd_ready_conns and allows us to find it in quiesce processing */
+       kgnilnd_schedule_conn(conn);
+
+       /* lose peer's ref */
+       kgnilnd_conn_decref(conn);
+       /* -1 for conn table */
+       kgnilnd_conn_decref(conn);
+
+       EXIT;
+}
+
+void
+kgnilnd_close_conn(kgn_conn_t *conn, int error)
+{
+       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       /* need to check the state here - this call is racy and we don't
+        * know the state until after the lock is grabbed */
+       if (conn->gnc_state == GNILND_CONN_ESTABLISHED) {
+               kgnilnd_close_conn_locked(conn, error);
+       }
+       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+}
+
+void
+kgnilnd_complete_closed_conn(kgn_conn_t *conn)
+{
+       LIST_HEAD               (sinners);
+       kgn_tx_t               *tx, *txn;
+       int                     nlive = 0;
+       int                     nrdma = 0;
+       int                     nq_rdma = 0;
+       int                     logmsg;
+       ENTRY;
+
+       /* Dump log  on cksum error - wait until complete phase to let
+        * RX of error happen */
+       if (*kgnilnd_tunables.kgn_checksum_dump &&
+           (conn != NULL && conn->gnc_peer_error == -ENOKEY)) {
+               libcfs_debug_dumplog();
+       }
+
+       /* _CLOSED set in kgnilnd_process_fmaq once we decide to
+        * send the CLOSE or not */
+       LASSERTF(conn->gnc_state == GNILND_CONN_CLOSED,
+                "conn 0x%p->%s with bad state %s\n",
+                conn, conn->gnc_peer ?
+                       libcfs_nid2str(conn->gnc_peer->gnp_nid) :
+                       "<?>",
+                kgnilnd_conn_state2str(conn));
+
+       LASSERT(list_empty(&conn->gnc_hashlist));
+
+       /* we've sent the close, start nuking */
+
+       /* we don't use lists to track things that we can get out of the
+        * tx_ref table... */
+
+       /* need to hold locks for tx_list_state, sampling it is too racy:
+        * - the lock actually protects tx != NULL, but we can't take the proper
+        *   lock until we check tx_list_state, which would be too late and
+        *   we could have the TX change under us.
+        * gnd_rdmaq_lock and gnd_lock are not used together, so taking both
+        * should be fine */
+       spin_lock(&conn->gnc_device->gnd_rdmaq_lock);
+       spin_lock(&conn->gnc_device->gnd_lock);
+
+       for (nrdma = 0; nrdma < GNILND_MAX_MSG_ID; nrdma++) {
+               tx = conn->gnc_tx_ref_table[nrdma];
+
+               if (tx != NULL) {
+                       /* only print the first error and if not CLOSE, we often don't see
+                        * CQ events for that by the time we get here... and really don't care */
+                       if (nlive || tx->tx_msg.gnm_type == GNILND_MSG_CLOSE)
+                               tx->tx_state |= GNILND_TX_QUIET_ERROR;
+                       nlive++;
+                       GNIDBG_TX(D_NET, tx, "cleaning up on close, nlive %d", nlive);
+
+                       /* don't worry about gnc_lock here as nobody else should be
+                        * touching this conn */
+                       kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD);
+                       list_add_tail(&tx->tx_list, &sinners);
+               }
+       }
+       spin_unlock(&conn->gnc_device->gnd_lock);
+       spin_unlock(&conn->gnc_device->gnd_rdmaq_lock);
+
+       /* nobody should have marked this as needing scheduling after
+        * we called close - so only ref should be us handling it */
+       LASSERTF(conn->gnc_scheduled == GNILND_CONN_PROCESS,
+                "conn 0x%p scheduled %d\n", conn, conn->gnc_scheduled);
+
+       /* now reset a few to actual counters... */
+       nrdma = atomic_read(&conn->gnc_nlive_rdma);
+       nq_rdma = atomic_read(&conn->gnc_nq_rdma);
+
+       if (!list_empty(&sinners)) {
+               list_for_each_entry_safe(tx, txn, &sinners, tx_list) {
+                       /* clear tx_list to make tx_add_list_locked happy */
+                       list_del_init(&tx->tx_list);
+                       /* The error codes determine if we hold onto the MDD */
+                       kgnilnd_tx_done(tx, conn->gnc_error);
+               }
+       }
+
+       logmsg = (nlive + nrdma + nq_rdma);
+
+       if (logmsg) {
+               if (conn->gnc_peer_error != 0) {
+                       CNETERR("Closed conn 0x%p->%s (errno %d, peer errno %d): "
+                               "canceled %d TX, %d/%d RDMA\n",
+                               conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+                               conn->gnc_error, conn->gnc_peer_error,
+                               nlive, nq_rdma, nrdma);
+               } else {
+                       CNETERR("Closed conn 0x%p->%s (errno %d): "
+                               "canceled %d TX, %d/%d RDMA\n",
+                               conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+                               conn->gnc_error,
+                               nlive, nq_rdma, nrdma);
+               }
+       }
+
+       kgnilnd_destroy_conn_ep(conn);
+
+       /* Bug 765042 - race this with completing a new conn to same peer - we need
+        * finish_connect to detach purgatory before we can do it ourselves here */
+       CFS_RACE(CFS_FAIL_GNI_FINISH_PURG);
+
+       /* now it is safe to remove from peer list - anyone looking at
+        * gnp_conns now is free to unlink if not on purgatory */
+       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       conn->gnc_state = GNILND_CONN_DONE;
+
+       /* Decrement counter if we are marked by del_conn_or_peers for closing
+        */
+       if (conn->gnc_needs_closing)
+               kgnilnd_admin_decref(kgnilnd_data.kgn_npending_conns);
+
+       /* Remove from peer's list of valid connections if it's not in purgatory */
+       if (!conn->gnc_in_purgatory) {
+               list_del_init(&conn->gnc_list);
+       }
+
+       /* NB - only unlinking if we set pending in del_peer_locked from admin or
+        * shutdown */
+       if (kgnilnd_peer_active(conn->gnc_peer) &&
+           conn->gnc_peer->gnp_pending_unlink &&
+           kgnilnd_can_unlink_peer_locked(conn->gnc_peer)) {
+               kgnilnd_unlink_peer_locked(conn->gnc_peer);
+       }
+
+       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       /* I'm telling Mommy! - use peer_error if they initiated close */
+       kgnilnd_peer_notify(conn->gnc_peer,
+                           conn->gnc_error == -ECONNRESET ? conn->gnc_peer_error
+                                                          : conn->gnc_error);
+
+       EXIT;
+}
+
+int
+kgnilnd_set_conn_params(kgn_dgram_t *dgram)
+{
+       kgn_conn_t             *conn = dgram->gndg_conn;
+       kgn_connreq_t          *connreq = &dgram->gndg_conn_in;
+       kgn_gniparams_t        *rem_param = &connreq->gncr_gnparams;
+       gni_return_t            rrc;
+       int                     rc = 0;
+
+       /* set timeout vals in conn early so we can use them for the NAK */
+
+       /* use max of the requested and our timeout, peer will do the same */
+       conn->gnc_timeout = MAX(conn->gnc_timeout, connreq->gncr_timeout);
+
+       /* only ep_bind really mucks around with the CQ */
+       /* only ep_bind if we are not connecting to ourselves and the dstnid is not a wildcard. This check
+        * is necessary as you can only bind an ep once and we must make sure we don't bind when already bound.
+        */
+       if (connreq->gncr_dstnid != LNET_NID_ANY && dgram->gndg_conn_out.gncr_dstnid != connreq->gncr_srcnid) {
+               mutex_lock(&conn->gnc_device->gnd_cq_mutex);
+               rrc = kgnilnd_ep_bind(conn->gnc_ephandle,
+                       connreq->gncr_gnparams.gnpr_host_id,
+                       conn->gnc_cqid);
+               mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+               if (rrc != GNI_RC_SUCCESS) {
+                       rc = -ECONNABORTED;
+                       goto return_out;
+               }
+       }
+
+       rrc = kgnilnd_ep_set_eventdata(conn->gnc_ephandle, conn->gnc_cqid,
+                        connreq->gncr_gnparams.gnpr_cqid);
+       if (rrc != GNI_RC_SUCCESS) {
+               rc = -ECONNABORTED;
+               goto cleanup_out;
+       }
+
+       /* Initialize SMSG */
+       rrc = kgnilnd_smsg_init(conn->gnc_ephandle, &conn->gnpr_smsg_attr,
+                       &connreq->gncr_gnparams.gnpr_smsg_attr);
+       if (unlikely(rrc == GNI_RC_INVALID_PARAM)) {
+               gni_smsg_attr_t *local = &conn->gnpr_smsg_attr;
+               gni_smsg_attr_t *remote = &connreq->gncr_gnparams.gnpr_smsg_attr;
+               /* help folks figure out if there is a tunable off, etc. */
+               LCONSOLE_ERROR("SMSG attribute mismatch. Data from local/remote:"
+                              " type %d/%d msg_maxsize %u/%u"
+                              " mbox_maxcredit %u/%u. Please check kgni"
+                              " logs for further data\n",
+                              local->msg_type, remote->msg_type,
+                              local->msg_maxsize, remote->msg_maxsize,
+                              local->mbox_maxcredit, remote->mbox_maxcredit);
+       }
+       if (rrc != GNI_RC_SUCCESS) {
+               rc = -ECONNABORTED;
+               goto cleanup_out;
+       }
+
+       /* log this for help in debugging SMSG buffer re-use */
+       CDEBUG(D_NET, "conn %p src %s dst %s smsg %p acquired"
+               " local cqid %u SMSG %p->%u hndl "LPX64"."LPX64
+               " remote cqid %u SMSG %p->%u hndl "LPX64"."LPX64"\n",
+               conn, libcfs_nid2str(connreq->gncr_srcnid),
+               libcfs_nid2str(connreq->gncr_dstnid),
+               &conn->gnpr_smsg_attr,
+               conn->gnc_cqid,
+               conn->gnpr_smsg_attr.msg_buffer,
+               conn->gnpr_smsg_attr.mbox_offset,
+               conn->gnpr_smsg_attr.mem_hndl.qword1,
+               conn->gnpr_smsg_attr.mem_hndl.qword2,
+               rem_param->gnpr_cqid,
+               rem_param->gnpr_smsg_attr.msg_buffer,
+               rem_param->gnpr_smsg_attr.mbox_offset,
+               rem_param->gnpr_smsg_attr.mem_hndl.qword1,
+               rem_param->gnpr_smsg_attr.mem_hndl.qword2);
+
+       conn->gnc_peerstamp = connreq->gncr_peerstamp;
+       conn->gnc_peer_connstamp = connreq->gncr_connstamp;
+
+       /* We update the reaper timeout once we have a valid conn and timeout */
+       kgnilnd_update_reaper_timeout(GNILND_TO2KA(conn->gnc_timeout));
+
+       return 0;
+
+cleanup_out:
+       rrc = kgnilnd_ep_unbind(conn->gnc_ephandle);
+       /* not sure I can just let this fly */
+       LASSERTF(rrc == GNI_RC_SUCCESS,
+               "bad rc from gni_ep_unbind trying to cleanup: %d\n", rrc);
+
+return_out:
+       LASSERTF(rc != 0, "SOFTWARE BUG: rc == 0\n");
+       CERROR("Error setting connection params from %s: %d\n",
+              libcfs_nid2str(connreq->gncr_srcnid), rc);
+       return rc;
+}
+
+/* needs down_read on kgn_net_rw_sem held from before this call until
+ * after the write_lock on kgn_peer_conn_lock - this ensures we stay sane
+ * with kgnilnd_shutdown - it'll get the sem and set shutdown, then get the
+ * kgn_peer_conn_lock to start del_peer'ing. If we hold the sem until after
+ * kgn_peer_conn_lock is held, we guarantee that nobody calls
+ * kgnilnd_add_peer_locked without checking gnn_shutdown */
+int
+kgnilnd_create_peer_safe(kgn_peer_t **peerp, lnet_nid_t nid, kgn_net_t *net)
+{
+       kgn_peer_t    *peer;
+       int            rc;
+
+       LASSERT(nid != LNET_NID_ANY);
+
+       /* We don't pass the net around in the dgram anymore, so here is where we find it.
+        * This will work unless it's in shutdown or the nid has a net that is invalid;
+        * either way an error code needs to be returned in that case.
+        *
+        * If the net passed in is not NULL then we can use it; this avoids looking it up
+        * when the calling function already has access to the data.
+        */
+       if (net == NULL) {
+               rc = kgnilnd_find_net(nid, &net);
+               if (rc < 0)
+                       return rc;
+       } else {
+               /* find net adds a reference on the net if we are not using
+                * it we must do it manually so the net references are
+                * correct when tearing down the net
+                */
+               kgnilnd_net_addref(net);
+       }
+
+       LIBCFS_ALLOC(peer, sizeof(*peer));
+       if (peer == NULL) {
+               kgnilnd_net_decref(net);
+               return -ENOMEM;
+       }
+       peer->gnp_nid = nid;
+
+       /* translate from nid to nic addr & store */
+       rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(nid), 1, &peer->gnp_host_id);
+       if (rc <= 0) {
+               kgnilnd_net_decref(net);
+               LIBCFS_FREE(peer, sizeof(*peer));
+               return -ESRCH;
+       }
+       CDEBUG(D_NET, "peer 0x%p->%s -> NIC 0x%x\n", peer,
+               libcfs_nid2str(nid), peer->gnp_host_id);
+
+       atomic_set(&peer->gnp_refcount, 1);     /* 1 ref for caller */
+       atomic_set(&peer->gnp_dirty_eps, 0);
+
+       INIT_LIST_HEAD(&peer->gnp_list);
+       INIT_LIST_HEAD(&peer->gnp_connd_list);
+       INIT_LIST_HEAD(&peer->gnp_conns);
+       INIT_LIST_HEAD(&peer->gnp_tx_queue);
+
+       /* the first reconnect should happen immediately, so we leave
+        * gnp_reconnect_interval set to 0 */
+
+       LASSERTF(net != NULL, "peer 0x%p->%s with NULL net\n",
+                peer, libcfs_nid2str(nid));
+
+       /* must have kgn_net_rw_sem held for this...  */
+       if (net->gnn_shutdown) {
+               /* shutdown has started already */
+               kgnilnd_net_decref(net);
+               LIBCFS_FREE(peer, sizeof(*peer));
+               return -ESHUTDOWN;
+       }
+
+       peer->gnp_net = net;
+
+       atomic_inc(&kgnilnd_data.kgn_npeers);
+
+       *peerp = peer;
+       return 0;
+}
+
+void
+kgnilnd_destroy_peer(kgn_peer_t *peer)
+{
+       CDEBUG(D_NET, "peer %s %p deleted\n",
+              libcfs_nid2str(peer->gnp_nid), peer);
+       LASSERTF(atomic_read(&peer->gnp_refcount) == 0,
+                "peer 0x%p->%s refs %d\n",
+                peer, libcfs_nid2str(peer->gnp_nid),
+                atomic_read(&peer->gnp_refcount));
+       LASSERTF(atomic_read(&peer->gnp_dirty_eps) == 0,
+                "peer 0x%p->%s dirty eps %d\n",
+                peer, libcfs_nid2str(peer->gnp_nid),
+                atomic_read(&peer->gnp_dirty_eps));
+       LASSERTF(peer->gnp_net != NULL, "peer %p (%s) with NULL net\n",
+                peer, libcfs_nid2str(peer->gnp_nid));
+       LASSERTF(!kgnilnd_peer_active(peer),
+                "peer 0x%p->%s\n",
+               peer, libcfs_nid2str(peer->gnp_nid));
+       LASSERTF(peer->gnp_connecting == GNILND_PEER_IDLE || peer->gnp_connecting == GNILND_PEER_KILL,
+                "peer 0x%p->%s, connecting %d\n",
+               peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
+       LASSERTF(list_empty(&peer->gnp_conns),
+                "peer 0x%p->%s\n",
+               peer, libcfs_nid2str(peer->gnp_nid));
+       LASSERTF(list_empty(&peer->gnp_tx_queue),
+                "peer 0x%p->%s\n",
+               peer, libcfs_nid2str(peer->gnp_nid));
+       LASSERTF(list_empty(&peer->gnp_connd_list),
+                "peer 0x%p->%s\n",
+               peer, libcfs_nid2str(peer->gnp_nid));
+
+       /* NB a peer's connections keep a reference on their peer until
+        * they are destroyed, so we can be assured that _all_ state to do
+        * with this peer has been cleaned up when its refcount drops to
+        * zero. */
+
+       atomic_dec(&kgnilnd_data.kgn_npeers);
+       kgnilnd_net_decref(peer->gnp_net);
+
+       LIBCFS_FREE(peer, sizeof(*peer));
+}
+
+/* the conn might not have made it all the way through to a connected
+ * state - but we need to purgatory any conn that a remote peer might
+ * have seen through a posted dgram as well */
+void
+kgnilnd_add_purgatory_locked(kgn_conn_t *conn, kgn_peer_t *peer)
+{
+       kgn_mbox_info_t *mbox = NULL;
+       ENTRY;
+
+       /* NB - the caller should own conn by removing him from the
+        * scheduler thread when finishing the close */
+
+       LASSERTF(peer != NULL, "conn %p with NULL peer\n", conn);
+
+       /* If this is still true, need to add the calls to unlink back in and
+        * figure out how to close the hole on loopback conns */
+       LASSERTF(kgnilnd_peer_active(peer), "can't use inactive peer %s (%p)"
+               " we'll never recover the resources\n",
+               libcfs_nid2str(peer->gnp_nid), peer);
+
+       CDEBUG(D_NET, "conn %p peer %p dev %p\n", conn, peer,
+               conn->gnc_device);
+
+       /* add ref for mbox purgatory hold */
+       kgnilnd_peer_addref(peer);
+       kgnilnd_conn_addref(conn);
+       conn->gnc_in_purgatory = 1;
+
+       mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
+       mbox->mbx_prev_nid = peer->gnp_nid;
+       mbox->mbx_add_purgatory = jiffies;
+       kgnilnd_release_mbox(conn, 1);
+
+       LASSERTF(list_empty(&conn->gnc_mdd_list),
+               "conn 0x%p->%s with active purgatory hold MDD %d\n",
+               conn, libcfs_nid2str(peer->gnp_nid),
+               kgnilnd_count_list(&conn->gnc_mdd_list));
+
+       EXIT;
+}
+
+/* Instead of detaching everything from purgatory here we just mark the conn as needing
+ * detach, when the reaper checks the conn the next time it will detach it.
+ * Calling function requires write_lock held on kgn_peer_conn_lock
+ */
+void
+kgnilnd_mark_for_detach_purgatory_all_locked(kgn_peer_t *peer) {
+       kgn_conn_t       *conn;
+
+       list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
+               if (conn->gnc_in_purgatory && !conn->gnc_needs_detach) {
+                       conn->gnc_needs_detach = 1;
+                       kgnilnd_admin_addref(kgnilnd_data.kgn_npending_detach);
+               }
+       }
+}
+
+/* Calling function needs a write_lock held on kgn_peer_conn_lock */
+void
+kgnilnd_detach_purgatory_locked(kgn_conn_t *conn, struct list_head *conn_list)
+{
+       kgn_mbox_info_t *mbox = NULL;
+
+       /* if needed, add the conn purgatory data to the list passed in */
+       if (conn->gnc_in_purgatory) {
+               CDEBUG(D_NET, "peer %p->%s purg_conn %p@%s mdd_list #tx %d\n",
+                       conn->gnc_peer, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+                       conn, kgnilnd_conn_state2str(conn),
+                       kgnilnd_count_list(&conn->gnc_mdd_list));
+
+               mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
+               mbox->mbx_detach_of_purgatory = jiffies;
+
+               /* conn->gnc_list is the entry point on peer->gnp_conns, so detaching it
+                * here removes it from the list of 'valid' peer connections.
+                * We put the current conn onto a list of conns to call kgnilnd_release_purgatory_locked()
+                * and as such the caller of kgnilnd_detach_purgatory_locked() now owns that conn, since its not
+                * on the peer's conn_list anymore.
+                */
+
+               kgnilnd_peer_decref(conn->gnc_peer);
+               list_del_init(&conn->gnc_list);
+
+               /* NB - only unlinking if we set pending in del_peer_locked from admin or
+                * shutdown */
+               if (kgnilnd_peer_active(conn->gnc_peer) &&
+                   conn->gnc_peer->gnp_pending_unlink &&
+                   kgnilnd_can_unlink_peer_locked(conn->gnc_peer)) {
+                       kgnilnd_unlink_peer_locked(conn->gnc_peer);
+               }
+               /* The reaper will not call detach unless the conn is fully through kgnilnd_complete_closed_conn.
+                * If the conn is not in a DONE state somehow we are attempting to detach even though
+                * the conn has not been fully cleaned up. If we detach while the conn is still closing
+                * we will end up with an orphaned connection that has valid ep_handle, that is not on a
+                * peer.
+                */
+
+               LASSERTF(conn->gnc_state == GNILND_CONN_DONE, "Conn in invalid state  %p@%s \n",
+                               conn, kgnilnd_conn_state2str(conn));
+
+               /* move from peer to the delayed release list */
+               list_add_tail(&conn->gnc_list, conn_list);
+       }
+}
+
+void
+kgnilnd_release_purgatory_list(struct list_head *conn_list)
+{
+       kgn_device_t            *dev;
+       kgn_conn_t              *conn, *connN;
+       kgn_mdd_purgatory_t     *gmp, *gmpN;
+
+       list_for_each_entry_safe(conn, connN, conn_list, gnc_list) {
+               dev = conn->gnc_device;
+
+               kgnilnd_release_mbox(conn, -1);
+               conn->gnc_in_purgatory = 0;
+
+               list_del_init(&conn->gnc_list);
+
+               /* gnc_needs_detach is set in kgnilnd_del_conn_or_peer. It is used to keep track
+                * of conns that have been marked for detach by kgnilnd_del_conn_or_peer.
+                * The function uses kgn_npending_detach to verify the conn has
+                * actually been detached.
+                */
+
+               if (conn->gnc_needs_detach)
+                       kgnilnd_admin_decref(kgnilnd_data.kgn_npending_detach);
+
+               /* if this guy is really dead (we are doing release from reaper),
+                * make sure we tell LNet - if this is from other context,
+                * the checks in the function will prevent an errant
+                * notification */
+               kgnilnd_peer_notify(conn->gnc_peer, conn->gnc_error);
+
+               list_for_each_entry_safe(gmp, gmpN, &conn->gnc_mdd_list,
+                                        gmp_list) {
+                       CDEBUG(D_NET,
+                              "dev %p releasing held mdd "LPX64"."LPX64"\n",
+                              conn->gnc_device, gmp->gmp_map_key.qword1,
+                              gmp->gmp_map_key.qword2);
+
+                       atomic_dec(&dev->gnd_n_mdd_held);
+                       kgnilnd_mem_mdd_release(conn->gnc_device->gnd_handle,
+                                               &gmp->gmp_map_key);
+                       /* ignoring the return code - if kgni/ghal can't find it
+                        * it must be released already */
+
+                       list_del_init(&gmp->gmp_list);
+                       LIBCFS_FREE(gmp, sizeof(*gmp));
+               }
+               /* lose conn ref for purgatory */
+               kgnilnd_conn_decref(conn);
+       }
+}
+
+/* needs write_lock on kgnilnd_data.kgn_peer_conn_lock held */
+void
+kgnilnd_peer_increase_reconnect_locked(kgn_peer_t *peer)
+{
+       int current_to;
+
+       current_to = peer->gnp_reconnect_interval;
+
+       /* we'll try to reconnect fast the first time, then back-off */
+       if (current_to == 0) {
+               peer->gnp_reconnect_time = jiffies - 1;
+               current_to = *kgnilnd_tunables.kgn_min_reconnect_interval;
+       } else {
+               peer->gnp_reconnect_time = jiffies + cfs_time_seconds(current_to);
+               /* add 50% of min timeout & retry */
+               current_to += *kgnilnd_tunables.kgn_min_reconnect_interval / 2;
+       }
+
+       current_to = MIN(current_to,
+                               *kgnilnd_tunables.kgn_max_reconnect_interval);
+
+       peer->gnp_reconnect_interval = current_to;
+       CDEBUG(D_NET, "peer %s can reconnect at %lu interval %lu\n",
+              libcfs_nid2str(peer->gnp_nid), peer->gnp_reconnect_time,
+              peer->gnp_reconnect_interval);
+}
+
+/* needs kgnilnd_data.kgn_peer_conn_lock held */
+kgn_peer_t *
+kgnilnd_find_peer_locked(lnet_nid_t nid)
+{
+       struct list_head *peer_list = kgnilnd_nid2peerlist(nid);
+       kgn_peer_t       *peer;
+
+       /* Chopping nid down to only NIDADDR using LNET_NIDADDR so we only
+        * have a single peer per device instead of a peer per nid/net combo.
+        */
+
+       list_for_each_entry(peer, peer_list, gnp_list) {
+               if (LNET_NIDADDR(nid) != LNET_NIDADDR(peer->gnp_nid))
+                       continue;
+
+               CDEBUG(D_NET, "got peer [%p] -> %s c %d (%d)\n",
+                      peer, libcfs_nid2str(nid),
+                      peer->gnp_connecting,
+                      atomic_read(&peer->gnp_refcount));
+               return peer;
+       }
+       return NULL;
+}
+
+/* need write_lock on kgn_peer_conn_lock */
+void
+kgnilnd_unlink_peer_locked(kgn_peer_t *peer)
+{
+       LASSERTF(list_empty(&peer->gnp_conns),
+               "peer 0x%p->%s\n",
+                peer, libcfs_nid2str(peer->gnp_nid));
+       LASSERTF(list_empty(&peer->gnp_tx_queue),
+               "peer 0x%p->%s\n",
+                peer, libcfs_nid2str(peer->gnp_nid));
+       LASSERTF(kgnilnd_peer_active(peer),
+               "peer 0x%p->%s\n",
+                peer, libcfs_nid2str(peer->gnp_nid));
+       CDEBUG(D_NET, "unlinking peer 0x%p->%s\n",
+               peer, libcfs_nid2str(peer->gnp_nid));
+
+       list_del_init(&peer->gnp_list);
+       kgnilnd_data.kgn_peer_version++;
+       kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink);
+       /* lose peerlist's ref */
+       kgnilnd_peer_decref(peer);
+}
+
+int
+kgnilnd_get_peer_info(int index,
+                     kgn_peer_t **found_peer,
+                     lnet_nid_t *id, __u32 *nic_addr,
+                     int *refcount, int *connecting)
+{
+       struct list_head  *ptmp;
+       kgn_peer_t        *peer;
+       int               i;
+       int               rc = -ENOENT;
+
+       read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
+
+               list_for_each(ptmp, &kgnilnd_data.kgn_peers[i]) {
+                       peer = list_entry(ptmp, kgn_peer_t, gnp_list);
+
+                       if (peer->gnp_nid != *id)
+                               continue;
+
+                       if (index-- > 0)
+                               continue;
+
+                       CDEBUG(D_NET, "found peer %p (%s) at index %d\n",
+                              peer, libcfs_nid2str(peer->gnp_nid), index);
+
+                       *found_peer  = peer;
+                       *id          = peer->gnp_nid;
+                       *nic_addr    = peer->gnp_host_id;
+                       *refcount    = atomic_read(&peer->gnp_refcount);
+                       *connecting  = peer->gnp_connecting;
+
+                       rc = 0;
+                       goto out;
+               }
+       }
+out:
+       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+       if (rc)
+               CDEBUG(D_NET, "no gni peer at index %d\n", index);
+       return rc;
+}
+
+/* requires write_lock on kgn_peer_conn_lock held */
+void
+kgnilnd_add_peer_locked(lnet_nid_t nid, kgn_peer_t *new_stub_peer, kgn_peer_t **peerp)
+{
+       kgn_peer_t        *peer, *peer2;
+
+       LASSERTF(new_stub_peer != NULL, "bad stub peer for nid %s\n",
+                libcfs_nid2str(nid));
+
+       peer2 = kgnilnd_find_peer_locked(nid);
+       if (peer2 != NULL) {
+               /* A peer was created during the lock transition, so drop
+                * the new one we created */
+               kgnilnd_peer_decref(new_stub_peer);
+               peer = peer2;
+       } else {
+               peer = new_stub_peer;
+               /* peer table takes existing ref on peer */
+
+               LASSERTF(!kgnilnd_peer_active(peer),
+                       "peer 0x%p->%s already in peer table\n",
+                       peer, libcfs_nid2str(peer->gnp_nid));
+               list_add_tail(&peer->gnp_list,
+                             kgnilnd_nid2peerlist(nid));
+               kgnilnd_data.kgn_peer_version++;
+       }
+
+       LASSERTF(peer->gnp_net != NULL, "peer 0x%p->%s with NULL net\n",
+                peer, libcfs_nid2str(peer->gnp_nid));
+       *peerp = peer;
+}
+
+int
+kgnilnd_add_peer(kgn_net_t *net, lnet_nid_t nid, kgn_peer_t **peerp)
+{
+       kgn_peer_t        *peer;
+       int                rc;
+       ENTRY;
+
+       if (nid == LNET_NID_ANY)
+               return -EINVAL;
+
+       /* NB - this will not block during normal operations -
+        * the only writer of this is in the startup/shutdown path. */
+       rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem);
+       if (!rc) {
+               rc = -ESHUTDOWN;
+               RETURN(rc);
+       }
+       rc = kgnilnd_create_peer_safe(&peer, nid, net);
+       if (rc != 0) {
+               up_read(&kgnilnd_data.kgn_net_rw_sem);
+               RETURN(rc);
+       }
+
+       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       up_read(&kgnilnd_data.kgn_net_rw_sem);
+
+       kgnilnd_add_peer_locked(nid, peer, peerp);
+
+       CDEBUG(D_NET, "peer 0x%p->%s connecting %d\n",
+              *peerp, libcfs_nid2str((*peerp)->gnp_nid),
+              (*peerp)->gnp_connecting);
+
+       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+       RETURN(0);
+}
+
+/* needs write_lock on kgn_peer_conn_lock */
+void
+kgnilnd_cancel_peer_connect_locked(kgn_peer_t *peer, struct list_head *zombies)
+{
+       kgn_tx_t        *tx, *txn;
+
+       /* we do care about the state of gnp_connecting - we could be between
+        * reconnect attempts, so try to find the dgram and cancel the TX
+        * anyway. If we are in the process of posting, DON'T do anything;
+        * once it fails or succeeds we can nuke the connect attempt.
+        * We have no idea where in kgnilnd_post_dgram we are, so we can't
+        * attempt to cancel until that function is done.
+        */
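+       /* in short: a peer found POSTING (or already NEEDS_DEATH) is flagged
+        * NEEDS_DEATH and left for the posting path to cancel; any other state
+        * is moved to IDLE and its dgram is cancelled right here */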
+
+       /* make sure the peer isn't in the process of connecting or waiting for a connect */
+       spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
+       if (!(list_empty(&peer->gnp_connd_list))) {
+               list_del_init(&peer->gnp_connd_list);
+               /* remove connd ref */
+               kgnilnd_peer_decref(peer);
+       }
+       spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
+
+       if (peer->gnp_connecting == GNILND_PEER_POSTING || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
+               peer->gnp_connecting = GNILND_PEER_NEEDS_DEATH;
+               /* We are in the process of posting right now; setting NEEDS_DEATH
+                * arranges for the connect to be cancelled once the post completes,
+                * so we are finished here for now */
+       } else {
+               /* no need for an exchange - we hold the peer lock and it's ready for us to nuke */
+               LASSERTF(peer->gnp_connecting != GNILND_PEER_POSTING,
+                       "Peer in invalid state 0x%p->%s, connecting %d\n",
+                       peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
+               peer->gnp_connecting = GNILND_PEER_IDLE;
+               set_mb(peer->gnp_last_dgram_errno, -ETIMEDOUT);
+               kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev,
+                                                     peer->gnp_nid);
+       }
+
+       /* The least we can do is nuke the tx's no matter what.... */
+       list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) {
+               kgnilnd_tx_del_state_locked(tx, peer, NULL,
+                                          GNILND_TX_ALLOCD);
+               list_add_tail(&tx->tx_list, zombies);
+       }
+}
+
+/* needs write_lock on kgn_peer_conn_lock */
+void
+kgnilnd_del_peer_locked(kgn_peer_t *peer, int error)
+{
+       /* this peer could be passive and only held for purgatory,
+        * take a ref to ensure it doesn't disappear in this function */
+       kgnilnd_peer_addref(peer);
+
+       CFS_RACE(CFS_FAIL_GNI_FIND_TARGET);
+
+       /* if purgatory release cleared it out, don't try again */
+       if (kgnilnd_peer_active(peer)) {
+               /* always do this to allow kgnilnd_start_connect and
+                * kgnilnd_finish_connect to catch this before they
+                * wrap up their operations */
+               if (kgnilnd_can_unlink_peer_locked(peer)) {
+                       /* already released purgatory, so only active
+                        * conns hold it */
+                       kgnilnd_unlink_peer_locked(peer);
+               } else {
+                       kgnilnd_close_peer_conns_locked(peer, error);
+                       /* peer unlinks itself when last conn is closed */
+               }
+       }
+
+       /* we are done, release back to the wild */
+       kgnilnd_peer_decref(peer);
+}
+
+int
+kgnilnd_del_conn_or_peer(kgn_net_t *net, lnet_nid_t nid, int command,
+                         int error)
+{
+       LIST_HEAD               (souls);
+       LIST_HEAD               (zombies);
+       struct list_head        *ptmp, *pnxt;
+       kgn_peer_t              *peer;
+       int                     lo;
+       int                     hi;
+       int                     i;
+       int                     rc = -ENOENT;
+
+       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       if (nid != LNET_NID_ANY)
+               lo = hi = kgnilnd_nid2peerlist(nid) - kgnilnd_data.kgn_peers;
+       else {
+               lo = 0;
+               hi = *kgnilnd_tunables.kgn_peer_hash_size - 1;
+               /* wildcards always succeed */
+               rc = 0;
+       }
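+       /* e.g. a specific nid maps to exactly one hash bucket (lo == hi),
+        * while LNET_NID_ANY walks every bucket in the peer table */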
+
+       for (i = lo; i <= hi; i++) {
+               list_for_each_safe(ptmp, pnxt, &kgnilnd_data.kgn_peers[i]) {
+                       peer = list_entry(ptmp, kgn_peer_t, gnp_list);
+
+                       LASSERTF(peer->gnp_net != NULL,
+                               "peer %p (%s) with NULL net\n",
+                                peer, libcfs_nid2str(peer->gnp_nid));
+
+                       if (net != NULL && peer->gnp_net != net)
+                               continue;
+
+                       if (!(nid == LNET_NID_ANY || LNET_NIDADDR(peer->gnp_nid) == LNET_NIDADDR(nid)))
+                               continue;
+
+                       /* In both cases, we want to stop any in-flight
+                        * connect attempts */
+                       kgnilnd_cancel_peer_connect_locked(peer, &zombies);
+
+                       switch (command) {
+                       case GNILND_DEL_CONN:
+                               kgnilnd_close_peer_conns_locked(peer, error);
+                               break;
+                       case GNILND_DEL_PEER:
+                               peer->gnp_pending_unlink = 1;
+                               kgnilnd_admin_addref(kgnilnd_data.kgn_npending_unlink);
+                               kgnilnd_mark_for_detach_purgatory_all_locked(peer);
+                               kgnilnd_del_peer_locked(peer, error);
+                               break;
+                       case GNILND_CLEAR_PURGATORY:
+                               /* Mark everything ready for detach; the reaper will
+                                * clean up once we release the kgn_peer_conn_lock
+                                */
+                               kgnilnd_mark_for_detach_purgatory_all_locked(peer);
+                               peer->gnp_last_errno = -EISCONN;
+                               /* clear reconnect state so the peer can reconnect soon */
+                               peer->gnp_reconnect_time = 0;
+                               peer->gnp_reconnect_interval = 0;
+                               break;
+                       default:
+                               CERROR("bad command %d\n", command);
+                               LBUG();
+                       }
+                       /* we matched something */
+                       rc = 0;
+               }
+       }
+
+       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       /* release all of the souls found held in purgatory */
+       kgnilnd_release_purgatory_list(&souls);
+
+       /* nuke peer TX */
+       kgnilnd_txlist_done(&zombies, error);
+
+       /* This function does not return until the commands it initiated have completed,
+        * since they have to work their way through the other threads. In the case of
+        * shutdown, threads are not woken up until after this call is initiated, so we
+        * cannot wait - we just need to return. The same applies to stack reset: we
+        * shouldn't wait, as the reset thread handles the closing.
+        */
+
+       CFS_RACE(CFS_FAIL_GNI_RACE_RESET);
+
+       if (error == -ENOTRECOVERABLE || error == -ESHUTDOWN) {
+               return rc;
+       }
+
+       i = 4;
+       while (atomic_read(&kgnilnd_data.kgn_npending_conns)   ||
+              atomic_read(&kgnilnd_data.kgn_npending_detach)  ||
+              atomic_read(&kgnilnd_data.kgn_npending_unlink)) {
+
+               cfs_pause(cfs_time_seconds(1));
+               i++;
+
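+               /* (i & (-i)) == i only when i is a power of two, so the
+                * warning is logged with decreasing frequency */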
+               CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, "Waiting on %d peers %d closes %d detaches\n",
+                               atomic_read(&kgnilnd_data.kgn_npending_unlink),
+                               atomic_read(&kgnilnd_data.kgn_npending_conns),
+                               atomic_read(&kgnilnd_data.kgn_npending_detach));
+       }
+
+       return rc;
+}
+
+kgn_conn_t *
+kgnilnd_get_conn_by_idx(int index)
+{
+       kgn_peer_t        *peer;
+       struct list_head  *ptmp;
+       kgn_conn_t        *conn;
+       struct list_head  *ctmp;
+       int                i;
+
+
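+       /* note: kgn_peer_conn_lock is taken and dropped per hash bucket, so
+        * this index walk is only a best-effort snapshot of the conn table */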
+       for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
+               read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+               list_for_each(ptmp, &kgnilnd_data.kgn_peers[i]) {
+
+                       peer = list_entry(ptmp, kgn_peer_t, gnp_list);
+
+                       list_for_each(ctmp, &peer->gnp_conns) {
+                               conn = list_entry(ctmp, kgn_conn_t, gnc_list);
+
+                               if (conn->gnc_state != GNILND_CONN_ESTABLISHED)
+                                       continue;
+
+                               if (index-- > 0)
+                                       continue;
+
+                               CDEBUG(D_NET, "++conn[%p] -> %s (%d)\n", conn,
+                                      libcfs_nid2str(conn->gnc_peer->gnp_nid),
+                                      atomic_read(&conn->gnc_refcount));
+                               kgnilnd_conn_addref(conn);
+                               read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+                               return conn;
+                       }
+               }
+               read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+       }
+
+       return NULL;
+}
+
+int
+kgnilnd_get_conn_info(kgn_peer_t *peer,
+                     int *device_id, __u64 *peerstamp,
+                     int *tx_seq, int *rx_seq,
+                     int *fmaq_len, int *nfma, int *nrdma)
+{
+       kgn_conn_t        *conn;
+       int               rc = 0;
+
+       read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       conn = kgnilnd_find_conn_locked(peer);
+       if (conn == NULL) {
+               rc = -ENOENT;
+               goto out;
+       }
+
+       *device_id = conn->gnc_device->gnd_host_id;
+       *peerstamp = conn->gnc_peerstamp;
+       *tx_seq = conn->gnc_tx_seq;
+       *rx_seq = conn->gnc_rx_seq;
+       *fmaq_len = kgnilnd_count_list(&conn->gnc_fmaq);
+       *nfma = atomic_read(&conn->gnc_nlive_fma);
+       *nrdma = atomic_read(&conn->gnc_nlive_rdma);
+out:
+       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+       return rc;
+}
+
+/* needs write_lock on kgn_peer_conn_lock */
+int
+kgnilnd_close_peer_conns_locked(kgn_peer_t *peer, int why)
+{
+       kgn_conn_t         *conn;
+       struct list_head   *ctmp, *cnxt;
+       int                 count = 0;
+
+       list_for_each_safe(ctmp, cnxt, &peer->gnp_conns) {
+               conn = list_entry(ctmp, kgn_conn_t, gnc_list);
+
+               if (conn->gnc_state != GNILND_CONN_ESTABLISHED)
+                       continue;
+
+               count++;
+               /* we mark gnc_needs_closing and increment kgn_npending_conns so that
+                * kgnilnd_del_conn_or_peer can wait for the other threads to close
+                * and clean up the connection.
+                */
+               if (!conn->gnc_needs_closing) {
+                       conn->gnc_needs_closing = 1;
+                       kgnilnd_admin_addref(kgnilnd_data.kgn_npending_conns);
+               }
+               kgnilnd_close_conn_locked(conn, why);
+       }
+       return count;
+}
+
+int
+kgnilnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+       struct libcfs_ioctl_data *data = arg;
+       kgn_net_t                *net = ni->ni_data;
+       int                       rc = -EINVAL;
+
+       LASSERT(ni == net->gnn_ni);
+
+       switch (cmd) {
+       case IOC_LIBCFS_GET_PEER: {
+               lnet_nid_t   nid = 0;
+               kgn_peer_t  *peer = NULL;
+               __u32 nic_addr = 0;
+               __u64 peerstamp = 0;
+               int peer_refcount = 0, peer_connecting = 0;
+               int device_id = 0;
+               int tx_seq = 0, rx_seq = 0;
+               int fmaq_len = 0, nfma = 0, nrdma = 0;
+
+               rc = kgnilnd_get_peer_info(data->ioc_count, &peer,
+                                          &nid, &nic_addr, &peer_refcount,
+                                          &peer_connecting);
+               if (rc)
+                       break;
+
+               /* Barf */
+               /* LNET_MKNID is used to hide from LNet the multiplexing/demultiplexing of connections and peers.
+                * LNet assumes a conn and peer per net; LNET_MKNID/LNET_NIDADDR let us show LNet what it
+                * expects to see instead of the underlying network that is actually used to send the data.
+                */
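+               /* e.g. the address bits come from the peer's actual gni NID while
+                * the network bits come from this NI, so LNet sees the peer on the
+                * net it configured */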
+               data->ioc_nid    = LNET_MKNID(LNET_NIDNET(ni->ni_nid), LNET_NIDADDR(nid));
+               data->ioc_flags  = peer_connecting;
+               data->ioc_count  = peer_refcount;
+
+               rc = kgnilnd_get_conn_info(peer, &device_id, &peerstamp,
+                                          &tx_seq, &rx_seq, &fmaq_len,
+                                          &nfma, &nrdma);
+
+               /* This is allowable - a persistent peer may not
+                * have a connection */
+               if (rc) {
+                       /* flag to indicate we are not connected -
+                        * need to print as such */
+                       data->ioc_flags |= (1<<16);
+                       rc = 0;
+               } else {
+                       /* still barf */
+                       data->ioc_net = device_id;
+                       data->ioc_u64[0] = peerstamp;
+                       data->ioc_u32[0] = fmaq_len;
+                       data->ioc_u32[1] = nfma;
+                       data->ioc_u32[2] = tx_seq;
+                       data->ioc_u32[3] = rx_seq;
+                       data->ioc_u32[4] = nrdma;
+               }
+               break;
+       }
+       case IOC_LIBCFS_ADD_PEER: {
+               /* just a dummy value to allow using the common interface */
+               kgn_peer_t      *peer;
+               rc = kgnilnd_add_peer(net, data->ioc_nid, &peer);
+               break;
+       }
+       case IOC_LIBCFS_DEL_PEER: {
+               /* NULL is passed in so it affects all peers in existence, regardless of network,
+                * as the peer may not exist on the network LNet believes it to be on.
+                */
+               rc = kgnilnd_del_conn_or_peer(NULL, data->ioc_nid,
+                                             GNILND_DEL_PEER, -EUCLEAN);
+               break;
+       }
+       case IOC_LIBCFS_GET_CONN: {
+               kgn_conn_t *conn = kgnilnd_get_conn_by_idx(data->ioc_count);
+
+               if (conn == NULL)
+                       rc = -ENOENT;
+               else {
+                       rc = 0;
+                       /* LNET_MKNID is used to build the correct address based on what LNET wants to see instead of
+                        * the generic connection that is used to send the data
+                        */
+                       data->ioc_nid    = LNET_MKNID(LNET_NIDNET(ni->ni_nid), LNET_NIDADDR(conn->gnc_peer->gnp_nid));
+                       data->ioc_u32[0] = conn->gnc_device->gnd_id;
+                       kgnilnd_conn_decref(conn);
+               }
+               break;
+       }
+       case IOC_LIBCFS_CLOSE_CONNECTION: {
+               /* use error = -ENETRESET to indicate it was lctl disconnect */
+               /* NULL is passed in so it affects all the nets as the connection is virtual
+                * and may not exist on the network LNET believes it to be on.
+                */
+               rc = kgnilnd_del_conn_or_peer(NULL, data->ioc_nid,
+                                             GNILND_DEL_CONN, -ENETRESET);
+               break;
+       }
+       case IOC_LIBCFS_PUSH_CONNECTION: {
+               /* we use this to flush purgatory */
+               rc = kgnilnd_del_conn_or_peer(NULL, data->ioc_nid,
+                                             GNILND_CLEAR_PURGATORY, -EUCLEAN);
+               break;
+       }
+       case IOC_LIBCFS_REGISTER_MYNID: {
+               /* Ignore if this is a noop */
+               if (data->ioc_nid == ni->ni_nid) {
+                       rc = 0;
+               } else {
+                       CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+                              libcfs_nid2str(data->ioc_nid),
+                              libcfs_nid2str(ni->ni_nid));
+                       rc = -EINVAL;
+               }
+               break;
+       }
+       }
+
+       return rc;
+}
+
+void
+kgnilnd_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
+{
+       kgn_net_t               *net = ni->ni_data;
+       kgn_tx_t                *tx;
+       kgn_peer_t              *peer = NULL;
+       kgn_conn_t              *conn = NULL;
+       lnet_process_id_t       id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID};
+       ENTRY;
+
+       /* I expect to find him, so only take a read lock */
+       read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       peer = kgnilnd_find_peer_locked(nid);
+       if (peer != NULL) {
+               /* LIE if in a quiesce - we will update the timeouts after,
+                * but we don't want sends failing during it */
+               if (kgnilnd_data.kgn_quiesce_trigger) {
+                       *when = jiffies;
+                       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+                       GOTO(out, 0);
+               }
+
+               /* Update to best guess, might refine on later checks */
+               *when = peer->gnp_last_alive;
+
+               /* we have a peer, how about a conn? */
+               conn = kgnilnd_find_conn_locked(peer);
+
+               if (conn == NULL)  {
+                       /* if there is no conn, check peer last errno to see if clean disconnect
+                        * - if it was, we lie to LNet because we believe a TX would complete
+                        * on reconnect */
+                       if (kgnilnd_conn_clean_errno(peer->gnp_last_errno)) {
+                               *when = jiffies;
+                       }
+                       /* we still want to fire a TX and new conn in this case */
+               } else {
+                       /* gnp_last_alive is valid, run for the hills */
+                       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+                       GOTO(out, 0);
+               }
+       }
+       /* if we get here, either we have no peer or no conn for him, so fire off
+        * new TX to trigger conn setup */
+       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       /* if we couldn't find him, we'll fire up a TX and get connected -
+        * if we don't do this, after ni_peer_timeout, LNet will declare him dead.
+        * So really we treat kgnilnd_query as a bit of a 'connect now' type
+        * event because it'll only do this when it wants to send
+        *
+        * Use a real TX for this to get the proper gnp_tx_queue behavior, etc
+        * normally we'd use kgnilnd_send_ctlmsg for this, but we don't really
+        * care that this goes out quickly since we already know we need a new conn
+        * formed */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NOOP_SEND))
+               return;
+
+       tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, ni->ni_nid);
+       if (tx != NULL) {
+               kgnilnd_launch_tx(tx, net, &id);
+       }
+out:
+       CDEBUG(D_NETTRACE, "peer 0x%p->%s when %lu\n", peer,
+              libcfs_nid2str(nid), *when);
+       EXIT;
+}
+
+int
+kgnilnd_dev_init(kgn_device_t *dev)
+{
+       gni_return_t      rrc;
+       int               rc = 0;
+       unsigned int      cq_size;
+       ENTRY;
+
+       /* size of these CQs should be able to accommodate the outgoing
+        * RDMA and SMSG transactions.  Since we don't really know what we
+        * need here, we'll take credits * 2 * 3 to allow a bunch.
+        * We need to dig into this more with the performance work. */
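+       /* e.g. with 256 credits (an illustrative value, not necessarily the
+        * configured default) this gives a 1536 entry CQ */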
+       cq_size = *kgnilnd_tunables.kgn_credits * 2 * 3;
+
+       rrc = kgnilnd_cdm_create(dev->gnd_id, *kgnilnd_tunables.kgn_ptag,
+                                GNILND_COOKIE, 0,
+                                &dev->gnd_domain);
+       if (rrc != GNI_RC_SUCCESS) {
+               CERROR("Can't create CDM %d (%d)\n", dev->gnd_id, rrc);
+               rc = -ENODEV;
+               GOTO(failed, rc);
+       }
+
+       rrc = kgnilnd_cdm_attach(dev->gnd_domain, dev->gnd_id,
+                                &dev->gnd_host_id, &dev->gnd_handle);
+       if (rrc != GNI_RC_SUCCESS) {
+               CERROR("Can't attach CDM to device %d (%d)\n",
+                       dev->gnd_id, rrc);
+               rc = -ENODEV;
+               GOTO(failed, rc);
+       }
+
+       rc = kgnilnd_setup_nic_translation(dev->gnd_host_id);
+       if (rc != 0) {
+               rc = -ENODEV;
+               GOTO(failed, rc);
+       }
+
+       /* only dev 0 gets the errors - no need to reset the stack twice
+        * - this works because we have a single PTAG; if we had more,
+        * then we'd need to have multiple handlers */
+       if (dev->gnd_id == 0) {
+               rrc = kgnilnd_subscribe_errors(dev->gnd_handle, GNI_ERRMASK_CRITICAL,
+                                             0, NULL, kgnilnd_critical_error,
+                                             &dev->gnd_err_handle);
+               if (rrc != GNI_RC_SUCCESS) {
+                       CERROR("Can't subscribe for errors on device %d: rc %d\n",
+                               dev->gnd_id, rrc);
+                       rc = -ENODEV;
+                       GOTO(failed, rc);
+               }
+
+               rc = kgnilnd_set_quiesce_callback(dev->gnd_handle,
+                                                 kgnilnd_quiesce_end_callback);
+               if (rc != GNI_RC_SUCCESS) {
+                       CERROR("Can't subscribe for quiesce callback on device %d: rc %d\n",
+                               dev->gnd_id, rc);
+                       rc = -ENODEV;
+                       GOTO(failed, rc);
+               }
+       }
+
+       rc = kgnilnd_nicaddr_to_nid(dev->gnd_host_id, &dev->gnd_nid);
+       if (rc < 0) {
+               /* log messages during startup */
+               if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) {
+                       CERROR("couldn't translate host_id 0x%x to nid. rc %d\n",
+                               dev->gnd_host_id, rc);
+               }
+               rc = -ESRCH;
+               GOTO(failed, rc);
+       }
+       CDEBUG(D_NET, "NIC %x -> NID %d\n", dev->gnd_host_id, dev->gnd_nid);
+
+       rrc = kgnilnd_cq_create(dev->gnd_handle, cq_size,
+                               0, kgnilnd_device_callback,
+                               dev->gnd_id, &dev->gnd_snd_rdma_cqh);
+       if (rrc != GNI_RC_SUCCESS) {
+               CERROR("Can't create rdma send cq size %u for device "
+                      "%d (%d)\n", cq_size, dev->gnd_id, rrc);
+               rc = -EINVAL;
+               GOTO(failed, rc);
+       }
+
+       rrc = kgnilnd_cq_create(dev->gnd_handle, cq_size,
+                       0, kgnilnd_device_callback, dev->gnd_id,
+                       &dev->gnd_snd_fma_cqh);
+       if (rrc != GNI_RC_SUCCESS) {
+               CERROR("Can't create fma send cq size %u for device %d (%d)\n",
+                      cq_size, dev->gnd_id, rrc);
+               rc = -EINVAL;
+               GOTO(failed, rc);
+       }
+
+       /* This one we size differently - overflows are possible and it needs to be
+        * sized based on machine size */
+       rrc = kgnilnd_cq_create(dev->gnd_handle,
+                       *kgnilnd_tunables.kgn_fma_cq_size,
+                       0, kgnilnd_device_callback, dev->gnd_id,
+                       &dev->gnd_rcv_fma_cqh);
+       if (rrc != GNI_RC_SUCCESS) {
+               CERROR("Can't create fma cq size %d for device %d (%d)\n",
+                      *kgnilnd_tunables.kgn_fma_cq_size, dev->gnd_id, rrc);
+               rc = -EINVAL;
+               GOTO(failed, rc);
+       }
+
+       RETURN(0);
+
+failed:
+       kgnilnd_dev_fini(dev);
+       RETURN(rc);
+}
+
+void
+kgnilnd_dev_fini(kgn_device_t *dev)
+{
+       gni_return_t rrc;
+       ENTRY;
+
+       /* At quiesce or rest time, need to loop through and clear gnd_ready_conns ?*/
+       LASSERTF(list_empty(&dev->gnd_ready_conns) &&
+                list_empty(&dev->gnd_map_tx) &&
+                list_empty(&dev->gnd_rdmaq),
+                "dev 0x%p ready_conns %d@0x%p map_tx %d@0x%p rdmaq %d@0x%p\n",
+                dev, kgnilnd_count_list(&dev->gnd_ready_conns), &dev->gnd_ready_conns,
+                kgnilnd_count_list(&dev->gnd_map_tx), &dev->gnd_map_tx,
+                kgnilnd_count_list(&dev->gnd_rdmaq), &dev->gnd_rdmaq);
+
+       /* These should follow from tearing down all connections */
+       LASSERTF(dev->gnd_map_nphys == 0 && dev->gnd_map_physnop == 0,
+               "%d physical mappings of %d pages still mapped\n",
+                dev->gnd_map_nphys, dev->gnd_map_physnop);
+
+       LASSERTF(dev->gnd_map_nvirt == 0 && dev->gnd_map_virtnob == 0,
+               "%d virtual mappings of "LPU64" bytes still mapped\n",
+                dev->gnd_map_nvirt, dev->gnd_map_virtnob);
+
+       LASSERTF(atomic_read(&dev->gnd_n_mdd) == 0 &&
+                atomic_read(&dev->gnd_n_mdd_held) == 0 &&
+                atomic64_read(&dev->gnd_nbytes_map) == 0,
+               "%d SMSG mappings of %ld bytes still mapped or held %d\n",
+                atomic_read(&dev->gnd_n_mdd),
+                atomic64_read(&dev->gnd_nbytes_map), atomic_read(&dev->gnd_n_mdd_held));
+
+       LASSERT(list_empty(&dev->gnd_map_list));
+
+       /* What other assertions needed to ensure all connections torn down ? */
+
+       /* check all counters == 0 (EP, MDD, etc) */
+
+       /* if we are resetting due to quiesce (stack reset), don't check
+        * thread states */
+       LASSERTF(kgnilnd_data.kgn_quiesce_trigger ||
+               atomic_read(&kgnilnd_data.kgn_nthreads) == 0,
+               "tried to shutdown with threads active\n");
+
+       if (dev->gnd_rcv_fma_cqh) {
+               rrc = kgnilnd_cq_destroy(dev->gnd_rcv_fma_cqh);
+               LASSERTF(rrc == GNI_RC_SUCCESS,
+                       "bad rc from gni_cq_destroy on rcv_fma_cqh: %d\n", rrc);
+               dev->gnd_rcv_fma_cqh = NULL;
+       }
+
+       if (dev->gnd_snd_rdma_cqh) {
+               rrc = kgnilnd_cq_destroy(dev->gnd_snd_rdma_cqh);
+               LASSERTF(rrc == GNI_RC_SUCCESS,
+                       "bad rc from gni_cq_destroy on send_rdma_cqh: %d\n", rrc);
+               dev->gnd_snd_rdma_cqh = NULL;
+       }
+
+       if (dev->gnd_snd_fma_cqh) {
+               rrc = kgnilnd_cq_destroy(dev->gnd_snd_fma_cqh);
+               LASSERTF(rrc == GNI_RC_SUCCESS,
+                       "bad rc from gni_cq_destroy on snd_fma_cqh: %d\n", rrc);
+               dev->gnd_snd_fma_cqh = NULL;
+       }
+
+       if (dev->gnd_err_handle) {
+               rrc = kgnilnd_release_errors(dev->gnd_err_handle);
+               LASSERTF(rrc == GNI_RC_SUCCESS,
+                       "bad rc from gni_release_errors: %d\n", rrc);
+               dev->gnd_err_handle = NULL;
+       }
+
+       if (dev->gnd_domain) {
+               rrc = kgnilnd_cdm_destroy(dev->gnd_domain);
+               LASSERTF(rrc == GNI_RC_SUCCESS,
+                       "bad rc from gni_cdm_destroy: %d\n", rrc);
+               dev->gnd_domain = NULL;
+       }
+
+       EXIT;
+}
+
+
+int kgnilnd_base_startup(void)
+{
+       struct timeval       tv;
+       int                  pkmem = atomic_read(&libcfs_kmemory);
+       int                  rc;
+       int                  i;
+       kgn_device_t        *dev;
+       struct task_struct  *thrd;
+       ENTRY;
+
+       LASSERTF(kgnilnd_data.kgn_init == GNILND_INIT_NOTHING,
+               "init %d\n", kgnilnd_data.kgn_init);
+
+       /* zero pointers, flags etc */
+       memset(&kgnilnd_data, 0, sizeof(kgnilnd_data));
+       memset(&kgnilnd_hssops, 0, sizeof(kgnilnd_hssops));
+
+       /* CAVEAT EMPTOR: Every 'Fma' message includes the sender's NID and
+        * a unique (for all time) connstamp so we can uniquely identify
+        * the sender.  The connstamp is an incrementing counter
+        * initialised with seconds + microseconds at startup time.  So we
+        * rely on NOT creating connections more frequently on average than
+        * 1MHz to ensure we don't use old connstamps when we reboot. */
+       do_gettimeofday(&tv);
+       kgnilnd_data.kgn_connstamp =
+                kgnilnd_data.kgn_peerstamp =
+                       (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
+       init_rwsem(&kgnilnd_data.kgn_net_rw_sem);
+
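+       /* GNILND_MAXDEVS is currently 1, so this loop sets up a single device;
+        * the structure leaves room for more devices later */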
+       for (i = 0; i < GNILND_MAXDEVS; i++) {
+               kgn_device_t  *dev = &kgnilnd_data.kgn_devices[i];
+
+               dev->gnd_id = i;
+               INIT_LIST_HEAD(&dev->gnd_ready_conns);
+               INIT_LIST_HEAD(&dev->gnd_map_tx);
+               INIT_LIST_HEAD(&dev->gnd_fma_buffs);
+               mutex_init(&dev->gnd_cq_mutex);
+               sema_init(&dev->gnd_fmablk_sem, 1);
+               spin_lock_init(&dev->gnd_fmablk_lock);
+               init_waitqueue_head(&dev->gnd_waitq);
+               init_waitqueue_head(&dev->gnd_dgram_waitq);
+               init_waitqueue_head(&dev->gnd_dgping_waitq);
+               spin_lock_init(&dev->gnd_lock);
+               INIT_LIST_HEAD(&dev->gnd_map_list);
+               spin_lock_init(&dev->gnd_map_lock);
+               atomic_set(&dev->gnd_nfmablk, 0);
+               atomic_set(&dev->gnd_fmablk_vers, 1);
+               atomic_set(&dev->gnd_neps, 0);
+               atomic_set(&dev->gnd_canceled_dgrams, 0);
+               INIT_LIST_HEAD(&dev->gnd_connd_peers);
+               spin_lock_init(&dev->gnd_connd_lock);
+               spin_lock_init(&dev->gnd_dgram_lock);
+               spin_lock_init(&dev->gnd_rdmaq_lock);
+               INIT_LIST_HEAD(&dev->gnd_rdmaq);
+
+               /* alloc & setup nid based dgram table */
+               LIBCFS_ALLOC(dev->gnd_dgrams,
+                           sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size);
+
+               if (dev->gnd_dgrams == NULL) {
+                       rc = -ENOMEM;
+                       GOTO(failed, rc);
+               }
+
+               for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
+                       INIT_LIST_HEAD(&dev->gnd_dgrams[i]);
+               }
+               atomic_set(&dev->gnd_ndgrams, 0);
+
+               /* setup timer for RDMAQ processing */
+               setup_timer(&dev->gnd_rdmaq_timer, kgnilnd_schedule_device_timer,
+                           (unsigned long)dev);
+       }
+
+       /* CQID 0 isn't allowed, set to MAX_MSG_ID - 1 to check for conflicts early */
+       kgnilnd_data.kgn_next_cqid = GNILND_MAX_MSG_ID - 1;
+       kgnilnd_data.kgn_new_min_timeout = *kgnilnd_tunables.kgn_timeout;
+       init_waitqueue_head(&kgnilnd_data.kgn_reaper_waitq);
+       init_waitqueue_head(&kgnilnd_data.kgn_ruhroh_waitq);
+       spin_lock_init(&kgnilnd_data.kgn_reaper_lock);
+
+       sema_init(&kgnilnd_data.kgn_quiesce_sem, 1);
+       atomic_set(&kgnilnd_data.kgn_nquiesce, 0);
+       atomic_set(&kgnilnd_data.kgn_npending_conns, 0);
+       atomic_set(&kgnilnd_data.kgn_npending_unlink, 0);
+       atomic_set(&kgnilnd_data.kgn_npending_detach, 0);
+       /* OK to call kgnilnd_base_shutdown() to clean up now */
+       kgnilnd_data.kgn_init = GNILND_INIT_DATA;
+       PORTAL_MODULE_USE;
+
+       rwlock_init(&kgnilnd_data.kgn_peer_conn_lock);
+
+       LIBCFS_ALLOC(kgnilnd_data.kgn_peers,
+                   sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size);
+
+       if (kgnilnd_data.kgn_peers == NULL) {
+               rc = -ENOMEM;
+               GOTO(failed, rc);
+       }
+
+       for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
+               INIT_LIST_HEAD(&kgnilnd_data.kgn_peers[i]);
+       }
+
+       LIBCFS_ALLOC(kgnilnd_data.kgn_conns,
+                   sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size);
+
+       if (kgnilnd_data.kgn_conns == NULL) {
+               rc = -ENOMEM;
+               GOTO(failed, rc);
+       }
+
+       for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
+               INIT_LIST_HEAD(&kgnilnd_data.kgn_conns[i]);
+       }
+
+       LIBCFS_ALLOC(kgnilnd_data.kgn_nets,
+                   sizeof(struct list_head) * *kgnilnd_tunables.kgn_net_hash_size);
+
+       if (kgnilnd_data.kgn_nets == NULL) {
+               rc = -ENOMEM;
+               GOTO(failed, rc);
+       }
+
+       for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
+               INIT_LIST_HEAD(&kgnilnd_data.kgn_nets[i]);
+       }
+
+       kgnilnd_data.kgn_mbox_cache =
+               cfs_mem_cache_create("kgn_mbox_block",
+                                    KMALLOC_MAX_SIZE,
+                                    0,    /* offset */
+                                    SLAB_HWCACHE_ALIGN);   /* flags */
+       if (kgnilnd_data.kgn_mbox_cache == NULL) {
+               CERROR("Can't create slab for physical mbox blocks\n");
+               rc = -ENOMEM;
+               GOTO(failed, rc);
+       }
+
+       kgnilnd_data.kgn_rx_cache =
+               cfs_mem_cache_create("kgn_rx_t",
+                                    sizeof(kgn_rx_t),
+                                    0,    /* offset */
+                                    0);   /* flags */
+       if (kgnilnd_data.kgn_rx_cache == NULL) {
+               CERROR("Can't create slab for kgn_rx_t descriptors\n");
+               rc = -ENOMEM;
+               GOTO(failed, rc);
+       }
+
+       kgnilnd_data.kgn_tx_cache =
+               cfs_mem_cache_create("kgn_tx_t",
+                                    sizeof(kgn_tx_t),
+                                    0,    /* offset */
+                                    0);   /* flags */
+       if (kgnilnd_data.kgn_tx_cache == NULL) {
+               CERROR("Can't create slab for kgn_tx_t\n");
+               rc = -ENOMEM;
+               GOTO(failed, rc);
+       }
+
+       kgnilnd_data.kgn_tx_phys_cache =
+               cfs_mem_cache_create("kgn_tx_phys",
+                                    LNET_MAX_IOV * sizeof(gni_mem_segment_t),
+                                    0,    /* offset */
+                                    0);   /* flags */
+       if (kgnilnd_data.kgn_tx_phys_cache == NULL) {
+               CERROR("Can't create slab for kgn_tx_phys\n");
+               rc = -ENOMEM;
+               GOTO(failed, rc);
+       }
+
+       kgnilnd_data.kgn_dgram_cache =
+               cfs_mem_cache_create("kgn_dgram_t",
+                                    sizeof(kgn_dgram_t),
+                                    0,    /* offset */
+                                    0);   /* flags */
+       if (kgnilnd_data.kgn_dgram_cache == NULL) {
+               CERROR("Can't create slab for outgoing datagrams\n");
+               rc = -ENOMEM;
+               GOTO(failed, rc);
+       }
+
+       /* allocate an array of LNET_MAX_IOV page pointers for each cpu */
+       kgnilnd_data.kgn_cksum_map_pages = kmalloc(num_possible_cpus() * sizeof (struct page *),
+                                                  GFP_KERNEL);
+       if (kgnilnd_data.kgn_cksum_map_pages == NULL) {
+               CERROR("Can't allocate vmap cksum pages\n");
+               rc = -ENOMEM;
+               GOTO(failed, rc);
+       }
+       kgnilnd_data.kgn_cksum_npages = num_possible_cpus();
+       memset(kgnilnd_data.kgn_cksum_map_pages, 0,
+               kgnilnd_data.kgn_cksum_npages * sizeof (struct page *));
+
+       for (i = 0; i < kgnilnd_data.kgn_cksum_npages; i++) {
+               kgnilnd_data.kgn_cksum_map_pages[i] = kmalloc(LNET_MAX_IOV * sizeof (struct page *),
+                                                             GFP_KERNEL);
+               if (kgnilnd_data.kgn_cksum_map_pages[i] == NULL) {
+                       CERROR("Can't allocate vmap cksum pages for cpu %d\n", i);
+                       rc = -ENOMEM;
+                       GOTO(failed, rc);
+               }
+       }
+
+       LASSERT(kgnilnd_data.kgn_ndevs == 0);
+
+       /* Use all available GNI devices */
+       for (i = 0; i < GNILND_MAXDEVS; i++) {
+               dev = &kgnilnd_data.kgn_devices[kgnilnd_data.kgn_ndevs];
+
+               rc = kgnilnd_dev_init(dev);
+               if (rc == 0) {
+                       /* Increment here so base_shutdown cleans it up */
+                       kgnilnd_data.kgn_ndevs++;
+
+                       rc = kgnilnd_allocate_phys_fmablk(dev);
+                       if (rc) {
+                               GOTO(failed, rc);
+                       }
+               }
+       }
+
+       if (kgnilnd_data.kgn_ndevs == 0) {
+               CERROR("Can't initialise any GNI devices\n");
+               rc = -ENODEV;
+               GOTO(failed, rc);
+       }
+
+       rc = kgnilnd_thread_start(kgnilnd_reaper, NULL, "kgnilnd_rpr", 0);
+       if (rc != 0) {
+               CERROR("Can't spawn gnilnd reaper: %d\n", rc);
+               GOTO(failed, rc);
+       }
+
+       /*
+        * Start ruhroh thread.  We can't use kgnilnd_thread_start() because
+        * we don't want this thread included in kgnilnd_data.kgn_nthreads
+        * count.  This thread controls quiesce, so it mustn't
+        * quiesce itself.
+        */
+       thrd = kthread_run(kgnilnd_ruhroh_thread, NULL, "%s_%02d", "kgnilnd_rr", 0);
+       if (IS_ERR(thrd)) {
+               rc = PTR_ERR(thrd);
+               CERROR("Can't spawn gnilnd ruhroh thread: %d\n", rc);
+               GOTO(failed, rc);
+       }
+
+       /* threads will load balance across devs as they are available */
+       for (i = 0; i < *kgnilnd_tunables.kgn_sched_threads; i++) {
+               rc = kgnilnd_thread_start(kgnilnd_scheduler, (void *)((long)i),
+                                         "kgnilnd_sd", i);
+               if (rc != 0) {
+                       CERROR("Can't spawn gnilnd scheduler[%d]: %d\n",
+                              i, rc);
+                       GOTO(failed, rc);
+               }
+       }
+
+       for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+               dev = &kgnilnd_data.kgn_devices[i];
+               rc = kgnilnd_thread_start(kgnilnd_dgram_mover, dev,
+                                         "kgnilnd_dg", dev->gnd_id);
+               if (rc != 0) {
+                       CERROR("Can't spawn gnilnd dgram_mover[%d]: %d\n",
+                              dev->gnd_id, rc);
+                       GOTO(failed, rc);
+               }
+
+               rc = kgnilnd_thread_start(kgnilnd_dgram_waitq, dev,
+                                         "kgnilnd_dgn", dev->gnd_id);
+               if (rc != 0) {
+                       CERROR("Can't spawn gnilnd dgram_waitq[%d]: %d\n",
+                               dev->gnd_id, rc);
+                       GOTO(failed, rc);
+               }
+
+               rc = kgnilnd_setup_wildcard_dgram(dev);
+
+               if (rc != 0) {
+                       CERROR("Can't create wildcard dgrams[%d]: %d\n",
+                               dev->gnd_id, rc);
+                       GOTO(failed, rc);
+               }
+       }
+
+
+
+       /* flag everything initialised */
+       kgnilnd_data.kgn_init = GNILND_INIT_ALL;
+       /*****************************************************/
+
+       CDEBUG(D_MALLOC, "initial kmem %d\n", pkmem);
+       RETURN(0);
+
+failed:
+       kgnilnd_base_shutdown();
+       kgnilnd_data.kgn_init = GNILND_INIT_NOTHING;
+       RETURN(rc);
+}
+
+void
+kgnilnd_base_shutdown(void)
+{
+       int           i;
+       ENTRY;
+
+       while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_SHUTDOWN, 1)) {};
+
+       kgnilnd_data.kgn_wc_kill = 1;
+
+       for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+               kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
+               kgnilnd_cancel_wc_dgrams(dev);
+               kgnilnd_del_conn_or_peer(NULL, LNET_NID_ANY, GNILND_DEL_PEER, -ESHUTDOWN);
+               kgnilnd_wait_for_canceled_dgrams(dev);
+       }
+
+       /* Peer state all cleaned up BEFORE setting shutdown, so threads don't
+        * have to worry about shutdown races.  NB connections may be created
+        * while there are still active connds, but these will be temporary
+        * since peer creation always fails after the listener has started to
+        * shut down.
+        * All peers should have been cleared out on the nets. */
+       LASSERTF(atomic_read(&kgnilnd_data.kgn_npeers) == 0,
+               "peers left %d\n", atomic_read(&kgnilnd_data.kgn_npeers));
+
+       /* Wait for the ruhroh thread to shut down. */
+       kgnilnd_data.kgn_ruhroh_shutdown = 1;
+       wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
+       i = 2;
+       while (kgnilnd_data.kgn_ruhroh_running != 0) {
+               i++;
+               CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+                      "Waiting for ruhroh thread to terminate\n");
+               cfs_pause(cfs_time_seconds(1));
+       }
+
+       /* Flag threads to terminate */
+       kgnilnd_data.kgn_shutdown = 1;
+
+       for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+               kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
+
+               /* should clear all the MDDs */
+               kgnilnd_unmap_phys_fmablk(dev);
+
+               kgnilnd_schedule_device(dev);
+               wake_up_all(&dev->gnd_dgram_waitq);
+               wake_up_all(&dev->gnd_dgping_waitq);
+               LASSERT(list_empty(&dev->gnd_connd_peers));
+       }
+
+       spin_lock(&kgnilnd_data.kgn_reaper_lock);
+       wake_up_all(&kgnilnd_data.kgn_reaper_waitq);
+       spin_unlock(&kgnilnd_data.kgn_reaper_lock);
+
+       /* Wait for threads to exit */
+       i = 2;
+       while (atomic_read(&kgnilnd_data.kgn_nthreads) != 0) {
+               i++;
+               CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                      "Waiting for %d threads to terminate\n",
+                      atomic_read(&kgnilnd_data.kgn_nthreads));
+               cfs_pause(cfs_time_seconds(1));
+       }
+
+       LASSERTF(atomic_read(&kgnilnd_data.kgn_npeers) == 0,
+               "peers left %d\n", atomic_read(&kgnilnd_data.kgn_npeers));
+
+       if (kgnilnd_data.kgn_peers != NULL) {
+               for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++)
+                       LASSERT(list_empty(&kgnilnd_data.kgn_peers[i]));
+
+               LIBCFS_FREE(kgnilnd_data.kgn_peers,
+                           sizeof (struct list_head) *
+                           *kgnilnd_tunables.kgn_peer_hash_size);
+       }
+
+       down_write(&kgnilnd_data.kgn_net_rw_sem);
+       if (kgnilnd_data.kgn_nets != NULL) {
+               for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++)
+                       LASSERT(list_empty(&kgnilnd_data.kgn_nets[i]));
+
+               LIBCFS_FREE(kgnilnd_data.kgn_nets,
+                           sizeof (struct list_head) *
+                           *kgnilnd_tunables.kgn_net_hash_size);
+       }
+       up_write(&kgnilnd_data.kgn_net_rw_sem);
+
+       LASSERTF(atomic_read(&kgnilnd_data.kgn_nconns) == 0,
+               "conns left %d\n", atomic_read(&kgnilnd_data.kgn_nconns));
+
+       if (kgnilnd_data.kgn_conns != NULL) {
+               for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++)
+                       LASSERT(list_empty(&kgnilnd_data.kgn_conns[i]));
+
+               LIBCFS_FREE(kgnilnd_data.kgn_conns,
+                           sizeof (struct list_head) *
+                           *kgnilnd_tunables.kgn_peer_hash_size);
+       }
+
+       for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+               kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
+               kgnilnd_dev_fini(dev);
+
+               LASSERTF(atomic_read(&dev->gnd_ndgrams) == 0,
+                       "dgrams left %d\n", atomic_read(&dev->gnd_ndgrams));
+
+               if (dev->gnd_dgrams != NULL) {
+                       for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++)
+                               LASSERT(list_empty(&dev->gnd_dgrams[i]));
+
+                       LIBCFS_FREE(dev->gnd_dgrams,
+                                   sizeof (struct list_head) *
+                                   *kgnilnd_tunables.kgn_peer_hash_size);
+               }
+
+               kgnilnd_free_phys_fmablk(dev);
+       }
+
+       if (kgnilnd_data.kgn_mbox_cache != NULL) {
+               i = cfs_mem_cache_destroy(kgnilnd_data.kgn_mbox_cache);
+               LASSERTF(i == 0, "rc %d destroying kgn_mbox_cache\n", i);
+       }
+
+       if (kgnilnd_data.kgn_rx_cache != NULL) {
+               i = cfs_mem_cache_destroy(kgnilnd_data.kgn_rx_cache);
+               LASSERTF(i == 0, "rc %d destroying kgn_rx_cache\n", i);
+       }
+
+       if (kgnilnd_data.kgn_tx_cache != NULL) {
+               i = cfs_mem_cache_destroy(kgnilnd_data.kgn_tx_cache);
+               LASSERTF(i == 0, "rc %d destroying kgn_tx_cache\n", i);
+       }
+
+       if (kgnilnd_data.kgn_tx_phys_cache != NULL) {
+               i = cfs_mem_cache_destroy(kgnilnd_data.kgn_tx_phys_cache);
+               LASSERTF(i == 0, "rc %d destroying kgn_tx_phys_cache\n", i);
+       }
+
+       if (kgnilnd_data.kgn_dgram_cache != NULL) {
+               i = cfs_mem_cache_destroy(kgnilnd_data.kgn_dgram_cache);
+               LASSERTF(i == 0, "rc %d destroying kgn_dgram_cache\n", i);
+       }
+
+       if (kgnilnd_data.kgn_cksum_map_pages != NULL) {
+               for (i = 0; i < kgnilnd_data.kgn_cksum_npages; i++) {
+                       if (kgnilnd_data.kgn_cksum_map_pages[i] != NULL) {
+                               kfree(kgnilnd_data.kgn_cksum_map_pages[i]);
+                       }
+               }
+               kfree(kgnilnd_data.kgn_cksum_map_pages);
+       }
+
+       CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+              atomic_read(&libcfs_kmemory));
+
+       kgnilnd_data.kgn_init = GNILND_INIT_NOTHING;
+       PORTAL_MODULE_UNUSE;
+
+       EXIT;
+}
+
+int
+kgnilnd_startup(lnet_ni_t *ni)
+{
+       int               rc, devno;
+       kgn_net_t        *net;
+       ENTRY;
+
+       LASSERTF(ni->ni_lnd == &the_kgnilnd,
+               "bad LND 0x%p != the_kgnilnd @ 0x%p\n",
+               ni->ni_lnd, &the_kgnilnd);
+
+       if (kgnilnd_data.kgn_init == GNILND_INIT_NOTHING) {
+               rc = kgnilnd_base_startup();
+               if (rc != 0)
+                       RETURN(rc);
+       }
+
+       /* Serialize with shutdown. */
+       down(&kgnilnd_data.kgn_quiesce_sem);
+
+       LIBCFS_ALLOC(net, sizeof(*net));
+       if (net == NULL) {
+               CERROR("could not allocate net for new interface instance\n");
+               rc = -ENOMEM;
+               /* no need to cleanup the CDM... */
+               GOTO(failed, rc);
+       }
+       INIT_LIST_HEAD(&net->gnn_list);
+       ni->ni_data = net;
+       net->gnn_ni = ni;
+       ni->ni_maxtxcredits = *kgnilnd_tunables.kgn_credits;
+       ni->ni_peertxcredits = *kgnilnd_tunables.kgn_peer_credits;
+
+       if (*kgnilnd_tunables.kgn_peer_health) {
+               int     fudge;
+
+               /* give this a bit of leeway - we don't have a hard timeout
+                * as we only check timeouts periodically - see comment in kgnilnd_reaper */
+               fudge = (GNILND_TO2KA(*kgnilnd_tunables.kgn_timeout) / GNILND_REAPER_NCHECKS);
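+               /* e.g. assuming a 60s kgn_timeout: GNILND_TO2KA gives 29, and
+                * dividing by GNILND_REAPER_NCHECKS (4) yields a fudge of 7,
+                * so LNet would see a peer timeout of about 67s */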
+
+               ni->ni_peertimeout = *kgnilnd_tunables.kgn_timeout + fudge;
+
+               LCONSOLE_INFO("Enabling LNet peer health for gnilnd, timeout %ds\n",
+                             ni->ni_peertimeout);
+       }
+
+       atomic_set(&net->gnn_refcount, 1);
+
+       /* if we have multiple devices, spread the nets around */
+       net->gnn_netnum = LNET_NETNUM(LNET_NIDNET(ni->ni_nid));
+
+       devno = LNET_NIDNET(ni->ni_nid) % GNILND_MAXDEVS;
+       net->gnn_dev = &kgnilnd_data.kgn_devices[devno];
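+       /* e.g. with a single device every net lands on device 0; with more
+        * devices the LNet network number would spread nets across them */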
+
+       /* allocate a 'dummy' cdm for datagram use. We can only have a single
+        * datagram between a nid:inst_id and nid2:inst_id. The fake cdm
+        * gives us an additional inst_id to use, allowing the datagrams to flow
+        * like rivers of honey and beer */
+
+       /* the instance id for the cdm is the NETNUM offset by MAXDEVS -
+        * ensuring we'll have a unique id */
+
+
+       ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), net->gnn_dev->gnd_nid);
+       CDEBUG(D_NET, "adding net %p nid=%s on dev %d \n",
+               net, libcfs_nid2str(ni->ni_nid), net->gnn_dev->gnd_id);
+       /* until the gnn_list is set, we need to clean up ourselves, as
+        * kgnilnd_shutdown would just get confused */
+
+       down_write(&kgnilnd_data.kgn_net_rw_sem);
+       list_add_tail(&net->gnn_list, kgnilnd_netnum2netlist(net->gnn_netnum));
+       up_write(&kgnilnd_data.kgn_net_rw_sem);
+
+       /* we need a separate thread to call probe_wait_by_id until
+        * we get a function callback notifier from kgni */
+       up(&kgnilnd_data.kgn_quiesce_sem);
+       RETURN(0);
+ failed:
+       up(&kgnilnd_data.kgn_quiesce_sem);
+       kgnilnd_shutdown(ni);
+       RETURN(rc);
+}
+
+void
+kgnilnd_shutdown(lnet_ni_t *ni)
+{
+       kgn_net_t     *net = ni->ni_data;
+       int           i;
+       int           rc;
+       ENTRY;
+
+       CFS_RACE(CFS_FAIL_GNI_SR_DOWN_RACE);
+
+       LASSERTF(kgnilnd_data.kgn_init == GNILND_INIT_ALL,
+               "init %d\n", kgnilnd_data.kgn_init);
+
+       /* Serialize with startup. */
+       down(&kgnilnd_data.kgn_quiesce_sem);
+       CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+              atomic_read(&libcfs_kmemory));
+
+       if (net == NULL) {
+               CERROR("got NULL net for ni %p\n", ni);
+               rc = -EINVAL;
+               GOTO(out, rc);
+       }
+
+       LASSERTF(ni == net->gnn_ni,
+               "ni %p gnn_ni %p\n", ni, net->gnn_ni);
+
+       ni->ni_data = NULL;
+
+       LASSERT(!net->gnn_shutdown);
+       LASSERTF(atomic_read(&net->gnn_refcount) != 0,
+               "net %p refcount %d\n",
+                net, atomic_read(&net->gnn_refcount));
+
+       if (!list_empty(&net->gnn_list)) {
+               /* serialize with peer creation */
+               down_write(&kgnilnd_data.kgn_net_rw_sem);
+               net->gnn_shutdown = 1;
+               up_write(&kgnilnd_data.kgn_net_rw_sem);
+
+               kgnilnd_cancel_net_dgrams(net);
+
+               kgnilnd_del_conn_or_peer(net, LNET_NID_ANY, GNILND_DEL_PEER, -ESHUTDOWN);
+
+               /* if we are quiesced, need to wake up - we need those threads
+                * alive to release peers, etc */
+               if (GNILND_IS_QUIESCED) {
+                       set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
+                       kgnilnd_quiesce_wait("shutdown");
+               }
+
+               kgnilnd_wait_for_canceled_dgrams(net->gnn_dev);
+
+               /* We wait until the net's refcount is 1, then release the final ref, which is
+                * ours; this allows us to make sure everything else is done before we free the
+                * net.
+                */
+               i = 4;
+               while (atomic_read(&net->gnn_refcount) != 1) {
+                       i++;
+                       CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+                               "Waiting for %d references to clear on net %d\n",
+                               atomic_read(&net->gnn_refcount),
+                               net->gnn_netnum);
+                       cfs_pause(cfs_time_seconds(1));
+               }
+
+               /* release ref from kgnilnd_startup */
+               kgnilnd_net_decref(net);
+               /* serialize with reaper and conn_task looping */
+               down_write(&kgnilnd_data.kgn_net_rw_sem);
+               list_del_init(&net->gnn_list);
+               up_write(&kgnilnd_data.kgn_net_rw_sem);
+
+       }
+
+       /* not locking, this can't race with writers */
+       LASSERTF(atomic_read(&net->gnn_refcount) == 0,
+               "net %p refcount %d\n",
+                net, atomic_read(&net->gnn_refcount));
+       LIBCFS_FREE(net, sizeof(*net));
+
+out:
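+       /* walk the net hash: if any net is still registered, leave the base
+        * state alone; if every bucket is empty this was the last net, so
+        * tear down the base state as well */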
+       down_read(&kgnilnd_data.kgn_net_rw_sem);
+       for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
+               if (!list_empty(&kgnilnd_data.kgn_nets[i])) {
+                       up_read(&kgnilnd_data.kgn_net_rw_sem);
+                       break;
+               }
+
+               if (i == *kgnilnd_tunables.kgn_net_hash_size - 1) {
+                       up_read(&kgnilnd_data.kgn_net_rw_sem);
+                       kgnilnd_base_shutdown();
+               }
+       }
+       CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+              atomic_read(&libcfs_kmemory));
+
+       up(&kgnilnd_data.kgn_quiesce_sem);
+       EXIT;
+       return;
+}
+
+void __exit
+kgnilnd_module_fini(void)
+{
+       lnet_unregister_lnd(&the_kgnilnd);
+       kgnilnd_proc_fini();
+       kgnilnd_remove_sysctl();
+       kgnilnd_tunables_fini();
+}
+
+int __init
+kgnilnd_module_init(void)
+{
+       int    rc;
+
+       rc = kgnilnd_tunables_init();
+       if (rc != 0)
+               return rc;
+
+       printk(KERN_INFO "Lustre: kgnilnd build version: "KGNILND_BUILD_REV"\n");
+
+       kgnilnd_insert_sysctl();
+       kgnilnd_proc_init();
+
+       lnet_register_lnd(&the_kgnilnd);
+
+       return 0;
+}
+
+MODULE_AUTHOR("Cray, Inc. <nic@cray.com>");
+MODULE_DESCRIPTION("Kernel Gemini LND v"KGNILND_BUILD_REV);
+MODULE_LICENSE("GPL");
+
+module_init(kgnilnd_module_init);
+module_exit(kgnilnd_module_fini);
diff --git a/lnet/klnds/gnilnd/gnilnd.h b/lnet/klnds/gnilnd/gnilnd.h
new file mode 100644 (file)
index 0000000..de43728
--- /dev/null
@@ -0,0 +1,1790 @@
+/*
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * Copyright (C) 2009-2012 Cray, Inc.
+ *
+ *   Derived from work by: Eric Barton <eric@bartonsoftware.com>
+ *   Author: Nic Henke <nic@cray.com>
+ *   Author: James Shimek <jshimek@cray.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#ifndef _GNILND_GNILND_H_
+#define _GNILND_GNILND_H_
+
+#define DEBUG_SUBSYSTEM S_LND
+
+#include <libcfs/libcfs.h>
+#include <lnet/lnet.h>
+#include <lnet/lib-lnet.h>
+#include <lnet/lnet-sysctl.h>
+
+#include <gni_pub.h>
+#include "gnilnd_version.h"
+#include "gnilnd_hss_ops.h"
+
+/* tunables determined at compile time */
+#define GNILND_MIN_TIMEOUT     5               /* minimum timeout interval (seconds) */
+#define GNILND_BASE_TIMEOUT    60              /* default sane timeout */
+#define GNILND_TO2KA(t)                (((t)-1)/2)     /* timeout -> keepalive interval */
+#define GNILND_MIN_RECONNECT_TO        (GNILND_BASE_TIMEOUT/4)
+#define GNILND_MAX_RECONNECT_TO        GNILND_BASE_TIMEOUT
+#define GNILND_HARDWARE_TIMEOUT        15              /* maximum time for data to travel between nodes */
+#define GNILND_MDD_TIMEOUT     15              /* MDD hold timeout in minutes */
+
+/* reaper thread wakeup interval */
+#define GNILND_REAPER_THREAD_WAKE  1
+/* reaper thread checks each conn NCHECKS times every kgnilnd_data.kgn_new_min_timeout */
+#define GNILND_REAPER_NCHECKS      4
+
+/* fixed constants */
+#define GNILND_MAXDEVS         1               /* max # of GNI devices currently supported */
+#define GNILND_MBOX_CREDITS    256             /* number of credits per mailbox */
+#define GNILND_COOKIE          0xa3579         /* cookie used along with the ptag by GNI */
+
+/* checksum values */
+#define GNILND_CHECKSUM_OFF            0       /* checksum turned off */
+#define GNILND_CHECKSUM_SMSG_HEADER    1       /* Only checksum SMSG header */
+#define GNILND_CHECKSUM_SMSG           2       /* checksum entire SMSG packet */
+#define GNILND_CHECKSUM_SMSG_BTE       3       /* Full checksum support */
+
+/* tune down some options on COMPUTE nodes as they won't see the same number of
+ * connections and don't need the throughput of multiple threads by default */
+#if defined(CONFIG_CRAY_COMPUTE)
+#define GNILND_SCHED_THREADS      1             /* default # of kgnilnd_scheduler threads */
+#define GNILND_FMABLK             64            /* default number of mboxes per fmablk */
+#else
+#define GNILND_SCHED_THREADS      3             /* default # of kgnilnd_scheduler threads */
+#define GNILND_FMABLK             1024          /* default number of mboxes per fmablk */
+#endif
+
+/* EXTRA_BITS are there to allow us to hide NOOP/CLOSE and anything else out of band */
+#define GNILND_EXTRA_BITS         1
+/* maximum number of conns & bits for cqid in the SMSG event data */
+#define GNILND_CQID_NBITS         (21 - GNILND_EXTRA_BITS)
+#define GNILND_MSGID_TX_NBITS     (32 - GNILND_CQID_NBITS)
+#define GNILND_MAX_CQID           (1 << GNILND_CQID_NBITS)
+#define GNILND_MAX_MSG_ID         (1 << GNILND_MSGID_TX_NBITS)
+#define GNILND_MAX_MSG_SIZE       (*kgnilnd_tunables.kgn_max_immediate + sizeof(kgn_msg_t))
+
+/* need sane upper bound to limit copy overhead */
+#define GNILND_MAX_IMMEDIATE      (64<<10)
+
+/* payload size to add to the base mailbox size.
+ * We subtract 2 from concurrent_sends because 4 messages are already included
+ * in the size gni_smsg_buff_size_needed calculates; this payload is then added
+ * to the value returned by that function. */
+#define GNILND_MBOX_PAYLOAD     \
+         (GNILND_MAX_MSG_SIZE * \
+         ((*kgnilnd_tunables.kgn_concurrent_sends - 2) * 2));
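+
+/* Worked example (illustrative only; the tunable values are assumed, not
+ * defaults from this patch): with *kgn_concurrent_sends == 8 and
+ * GNILND_MAX_MSG_SIZE == 2048, GNILND_MBOX_PAYLOAD evaluates to
+ * 2048 * ((8 - 2) * 2) == 24576 bytes of extra payload space on top of the
+ * size gni_smsg_buff_size_needed() returns. */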
+
+/* timeout -> deadman timer for kgni mdd holds */
+#define GNILND_TIMEOUT2DEADMAN   ((*kgnilnd_tunables.kgn_mdd_timeout) * 1000 * 60)
+
+/* timeout for failing sends; t is in jiffies */
+#define GNILND_TIMEOUTRX(t)     (t + cfs_time_seconds(*kgnilnd_tunables.kgn_hardware_timeout))
+
+/* time (in jiffies) at which the reaper thread releases a conn from purgatory */
+#define GNILND_PURG_RELEASE(t)   (GNILND_TIMEOUTRX(t) * 3)
+
+/* Macro for finding last_rx: the two datapoints are compared
+ * and the most recent one (in jiffies) is returned.
+ */
+#define GNILND_LASTRX(conn) (time_after(conn->gnc_last_rx, conn->gnc_last_rx_cq) \
+                               ? conn->gnc_last_rx : conn->gnc_last_rx_cq)
+
+/************************************************************************
+ * Enum, flag and tag data
+ */
+#define GNILND_INIT_NOTHING         0
+#define GNILND_INIT_DATA            1
+#define GNILND_INIT_ALL             2
+
+/* If you change the ordering away from MAPPED = UNMAPPED + 1, things break */
+#define GNILND_BUF_NONE           0              /* buffer type not set */
+#define GNILND_BUF_IMMEDIATE      1              /* immediate data */
+#define GNILND_BUF_IMMEDIATE_KIOV 2              /* immediate data in a kiov */
+#define GNILND_BUF_PHYS_UNMAPPED  3              /* physical: not mapped yet */
+#define GNILND_BUF_PHYS_MAPPED    4              /* physical: mapped already */
+#define GNILND_BUF_VIRT_UNMAPPED  5              /* virtual: not mapped yet */
+#define GNILND_BUF_VIRT_MAPPED    6              /* virtual: mapped already */
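+
+/* Illustrative sketch (an assumed reason for the ordering requirement, not
+ * code from this patch; kgnilnd_map_buffer() is hypothetical here): keeping
+ * *_MAPPED == *_UNMAPPED + 1 lets the mapping path promote either buffer
+ * class with a single increment:
+ *
+ *     rc = kgnilnd_map_buffer(tx);
+ *     if (rc == 0)
+ *             tx->tx_buftype++;        // *_UNMAPPED -> *_MAPPED
+ */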
+
+#define GNILND_TX_WAITING_REPLY      (1<<1)     /* expecting to receive reply */
+#define GNILND_TX_WAITING_COMPLETION (1<<2)     /* waiting for smsg_send to complete */
+#define GNILND_TX_PENDING_RDMA       (1<<3)     /* RDMA transaction pending until we get prev. completion */
+#define GNILND_TX_QUIET_ERROR        (1<<4)     /* don't print error on tx_done */
+#define GNILND_TX_FAIL_SMSG          (1<<5)     /* pass down error injection for SMSG fail */
+
+/* stash above max CQID to avoid any collision */
+#define GNILND_MSGID_NOOP           (GNILND_MAX_CQID + 128)
+#define GNILND_MSGID_CLOSE          (GNILND_MSGID_NOOP + 1)
+
+/* kgn_msg_t::gnm_type */
+#define GNILND_MSG_NONE              0x00        /* illegal message */
+#define GNILND_MSG_NOOP              0x01        /* empty gnm_u (keepalive) */
+#define GNILND_MSG_IMMEDIATE         0x02        /* gnm_u.immediate */
+#define GNILND_MSG_PUT_REQ           0x03        /* gnm_u.putreq (src->sink) */
+#define GNILND_MSG_PUT_NAK           0x04        /* gnm_u.completion (no PUT match: sink->src) */
+#define GNILND_MSG_PUT_ACK           0x05        /* gnm_u.putack (PUT matched: sink->src) */
+#define GNILND_MSG_PUT_DONE          0x06        /* gnm_u.completion (src->sink) */
+#define GNILND_MSG_GET_REQ           0x07        /* gnm_u.get (sink->src) */
+#define GNILND_MSG_GET_NAK           0x08        /* gnm_u.completion (no GET match: src->sink) */
+#define GNILND_MSG_GET_DONE          0x09        /* gnm_u.completion (src->sink) */
+#define GNILND_MSG_CLOSE             0x0a        /* empty gnm_u */
+
+/* defines for gnc_*scheduled states */
+#define GNILND_CONN_IDLE             0
+#define GNILND_CONN_SCHED            1
+#define GNILND_CONN_WANTS_SCHED      2
+#define GNILND_CONN_PROCESS          3
+
+#define GNILND_DEV_IDLE              0
+#define GNILND_DEV_IRQ               1
+#define GNILND_DEV_LOOP              2
+
+#define GNILND_DGRAM_IDLE            0
+#define GNILND_DGRAM_SCHED           1
+#define GNILND_DGRAM_PROCESS         2
+
+#define GNILND_PEER_IDLE             0
+#define GNILND_PEER_CONNECT          1
+#define GNILND_PEER_POSTING          2
+#define GNILND_PEER_POSTED           3
+#define GNILND_PEER_NEEDS_DEATH      4
+#define GNILND_PEER_KILL             5
+
+/* for gnc_close_recvd */
+#define GNILND_CLOSE_RX              1
+#define GNILND_CLOSE_INJECT1         2
+#define GNILND_CLOSE_INJECT2         3
+#define GNILND_CLOSE_EARLY           4
+
+/* defines for why quiesce trigger set */
+#define GNILND_QUIESCE_IDLE          0
+#define GNILND_QUIESCE_ADMIN         1
+#define GNILND_QUIESCE_RESET         2
+#define GNILND_QUIESCE_HW_QUIESCE    3
+
+#define GNILND_PEER_CLEAN            0
+#define GNILND_PEER_PERSISTING       1
+
+#define GNILND_DEL_CONN              0
+#define GNILND_DEL_PEER              1
+#define GNILND_CLEAR_PURGATORY       2
+
+typedef enum kgn_fmablk_state {
+       GNILND_FMABLK_IDLE = 0, /* is allocated or ready to be freed */
+       GNILND_FMABLK_PHYS,     /* allocated out of slab of physical memory */
+       GNILND_FMABLK_VIRT,     /* 'standard' vmalloc hunk */
+       GNILND_FMABLK_FREED,    /* after free */
+} kgn_fmablk_state_t;
+
+typedef enum kgn_tx_list_state {
+       GNILND_TX_IDLE = 0,     /* TX is on the idle list, kgn_idle_txs */
+       GNILND_TX_ALLOCD,       /* TX has been alloced (off of idle), could be in any state transition */
+       GNILND_TX_PEERQ,        /* TX on peer->gnp_tx_queue (no live conn) */
+       GNILND_TX_MAPQ,         /* TX on dev:gnd_map_tx for buffer mapping */
+       GNILND_TX_FMAQ,         /* TX waiting to be sent on conn FMA */
+       GNILND_TX_LIVE_FMAQ,    /* TX live on the FMA wire, waiting for completion or reply */
+       GNILND_TX_RDMAQ,        /* TX waiting to send FMA confirmation to auth RDMA PUT */
+       GNILND_TX_LIVE_RDMAQ,   /* TX live on the RDMA wire, waiting for completion */
+       GNILND_TX_DYING,        /* TX got caught on MAPQ or RDMAQ while conn was closing, needs someone to call tx_done */
+       GNILND_TX_FREED         /* TX is free! */
+} kgn_tx_list_state_t;
+
+typedef enum kgn_conn_state {
+       /* don't start @ 0 - prevent memset(0) badness */
+       GNILND_CONN_DUMMY = 0,
+       GNILND_CONN_LISTEN,
+       GNILND_CONN_CONNECTING,
+       GNILND_CONN_ESTABLISHED,
+       GNILND_CONN_CLOSING,
+       GNILND_CONN_CLOSED,
+       GNILND_CONN_DONE,
+       GNILND_CONN_DESTROY_EP
+} kgn_conn_state_t;
+
+/* changing these requires a change to GNILND_CONNREQ_VERSION and
+ * will result in dropped packets instead of NAKs. Adding to this is
+ * acceptable without changing the CONNREQ_VERSION, but code should
+ * be ready to handle NAKs on version mismatch  */
+typedef enum kgn_connreq_type {
+       GNILND_CONNREQ_REQ = 1,         /* how YOU doin' ? */
+       GNILND_CONNREQ_NAK,             /* NO soup for you! */
+       GNILND_CONNREQ_CLOSE,           /* we should see other people */
+} kgn_connreq_type_t;
+
+typedef enum kgn_dgram_state {
+       /* don't use 0 to avoid thinking a memset of zero is valid data */
+       GNILND_DGRAM_USED = 1,
+       GNILND_DGRAM_POSTING,
+       GNILND_DGRAM_POSTED,
+       GNILND_DGRAM_PROCESSING,
+       GNILND_DGRAM_CANCELED,
+       GNILND_DGRAM_DONE,
+} kgn_dgram_state_t;
+
+typedef enum kgn_dgram_type {
+       GNILND_DGRAM_REQ = 1,         /* how YOU doin' ? */
+       GNILND_DGRAM_WC_REQ,          /* you talkin' to ME? */
+       GNILND_DGRAM_NAK,             /* NO soup for you! */
+       GNILND_DGRAM_CLOSE,           /* we should see other people */
+} kgn_dgram_type_t;
+
+/************************************************************************
+ * Wire message structs.  These are sent in sender's byte order
+ * (i.e. receiver checks magic and flips if required).
+ */
+
+#define GNILND_MSG_MAGIC     LNET_PROTO_GNI_MAGIC /* unique magic */
+#define GNILND_DGRAM_MAGIC   0x0DDBA11
+
+/*  kgn_msg_t - FMA/SMSG wire struct
+  v2:
+   * - added checksum to FMA
+   * moved seq before payload
+   * WIRE_ATTR added for alignment
+  v3:
+   * added gnm_payload_len for FMA payload size
+  v4:
+   * added gncm_retval to completion, allowing return code transmission
+     on RDMA NAKs
+  v5:
+   * changed how CQID and TX ids are assigned
+  v6:
+   * added retval on CLOSE
+  v7:
+   * added payload checksumming
+  v8:
+   * reworked checksumming a bit, changed payload checksums
+*/
+#define GNILND_MSG_VERSION              8
+/* kgn_connreq_t connection request datagram wire struct
+  v2:
+   * added NAKs
+*/
+
+#define GNILND_CONNREQ_VERSION          2
+
+typedef struct kgn_gniparams {
+       __u32            gnpr_host_id;          /* ph. host ID of the NIC */
+       __u32            gnpr_cqid;             /* cqid I want peer to use when sending events to me */
+       gni_smsg_attr_t  gnpr_smsg_attr;        /* my short msg. attributes */
+} WIRE_ATTR kgn_gniparams_t;
+
+typedef struct kgn_nak_data {
+       __s32            gnnd_errno;            /* errno reason for NAK */
+
+} WIRE_ATTR kgn_nak_data_t;
+
+/* the first bits of the connreq struct CANNOT CHANGE FORM EVER
+ * without breaking the ability for us to properly NAK someone */
+typedef struct kgn_connreq {                    /* connection request/response */
+       __u32             gncr_magic;           /* I'm a gnilnd connreq */
+       __u32             gncr_cksum;           /* checksum (0 == disabled) */
+       __u16             gncr_type;            /* REQ, NAK, etc */
+       __u16             gncr_version;         /* this is my version number */
+       __u32             gncr_timeout;         /* sender's timeout */
+       __u64             gncr_srcnid;          /* sender's NID */
+       __u64             gncr_dstnid;          /* who sender expects to listen */
+       __u64             gncr_peerstamp;       /* sender's instance stamp */
+       __u64             gncr_connstamp;       /* sender's connection stamp */
+
+       /* everything before this needs to stay static, adding after should
+        * result in a change to GNILND_CONNREQ_VERSION */
+
+       union {
+               kgn_gniparams_t   gncr_gnparams;        /* sender's endpoint info */
+               kgn_nak_data_t    gncr_nakdata;         /* data (rc, etc) for NAK */
+       };
+} WIRE_ATTR kgn_connreq_t;
+
+typedef struct {
+       gni_mem_handle_t  gnrd_key;
+       __u64             gnrd_addr;
+       __u32             gnrd_nob;
+} WIRE_ATTR kgn_rdma_desc_t;
+
+typedef struct {
+       lnet_hdr_t        gnim_hdr;             /* LNet header */
+       /* LNet payload is in FMA "Message Data" */
+} WIRE_ATTR kgn_immediate_msg_t;
+
+typedef struct {
+       lnet_hdr_t        gnprm_hdr;            /* LNet header */
+       __u64             gnprm_cookie;         /* opaque completion cookie */
+} WIRE_ATTR kgn_putreq_msg_t;
+
+typedef struct {
+       __u64             gnpam_src_cookie;     /* reflected completion cookie */
+       __u64             gnpam_dst_cookie;     /* opaque completion cookie */
+       kgn_rdma_desc_t   gnpam_desc;           /* sender's sink buffer */
+} WIRE_ATTR kgn_putack_msg_t;
+
+typedef struct {
+       lnet_hdr_t        gngm_hdr;             /* LNet header */
+       __u64             gngm_cookie;          /* opaque completion cookie */
+       kgn_rdma_desc_t   gngm_desc;            /* sender's sink buffer */
+} WIRE_ATTR kgn_get_msg_t;
+
+typedef struct {
+       int               gncm_retval;          /* error on NAK, size on REQ */
+       __u64             gncm_cookie;          /* reflected completion cookie */
+} WIRE_ATTR kgn_completion_msg_t;
+
+typedef struct {                                /* NB must fit in FMA "Prefix" */
+       __u32             gnm_magic;            /* I'm a gni message */
+       __u16             gnm_version;          /* this is my version number */
+       __u16             gnm_type;             /* msg type */
+       __u64             gnm_srcnid;           /* sender's NID */
+       __u64             gnm_connstamp;        /* sender's connection stamp */
+       __u32             gnm_seq;              /* incrementing sequence number */
+       __u16             gnm_cksum;            /* checksum (0 == no checksum) */
+       __u16             gnm_payload_cksum;    /* payload checksum (0 == no checksum) */
+       __u32             gnm_payload_len;      /* size of the FMA payload sent */
+       union {
+               kgn_immediate_msg_t   immediate;
+               kgn_putreq_msg_t      putreq;
+               kgn_putack_msg_t      putack;
+               kgn_get_msg_t         get;
+               kgn_completion_msg_t  completion;
+       } gnm_u;
+} WIRE_ATTR kgn_msg_t;
+
+/************************************************************************
+ * runtime tunable data
+ */
+
+typedef struct kgn_tunables {
+       int              *kgn_min_reconnect_interval; /* connreq starting timeout & retransmit interval */
+       int              *kgn_max_reconnect_interval; /* ...exponentially increasing to this */
+       int              *kgn_credits;          /* # concurrent sends */
+       int              *kgn_fma_cq_size;      /* # entries in receive CQ */
+       int              *kgn_peer_credits;     /* # LNet peer credits */
+       int              *kgn_concurrent_sends; /* max # of max_immediate in mbox */
+       int              *kgn_timeout;          /* comms timeout (seconds) */
+       int              *kgn_max_immediate;    /* immediate payload breakpoint */
+       int              *kgn_checksum;         /* checksum data */
+       int              *kgn_checksum_dump;    /* dump raw data to D_INFO log when checksumming */
+       int              *kgn_bte_hash;         /* hashing on BTE transfers */
+       int              *kgn_bte_adapt;        /* adaptive routing on BTE transfers */
+       int              *kgn_bte_relaxed_ordering; /* relaxed ordering (PASSPW) on BTE transfers */
+       int              *kgn_ptag;             /* PTAG for cdm_create */
+       int              *kgn_max_retransmits;  /* max number of FMA retransmits */
+       int              *kgn_nwildcard;        /* # wildcard per net to post */
+       int              *kgn_nice;             /* nice value for kgnilnd threads */
+       int              *kgn_rdmaq_intervals;  /* # intervals per second for rdmaq throttle */
+       int              *kgn_loops;            /* # of loops sched does before flush/heartbeat tickle */
+       int              *kgn_peer_hash_size;   /* size of kgn_peers */
+       int              *kgn_peer_health;      /* enable/disable peer health */
+       int              *kgn_vmap_cksum;       /* enable/disable vmap of kiov checksums */
+       int              *kgn_mbox_per_block;   /* mailboxes per fmablk */
+       int              *kgn_nphys_mbox;       /* # mailboxes to preallocate with physical memory */
+       int              *kgn_mbox_credits;     /* max credits per fma */
+       int              *kgn_sched_threads;    /* number of kgnilnd_scheduler threads */
+       int              *kgn_net_hash_size;    /* size of kgn_net_ht */
+       int              *kgn_hardware_timeout; /* max time for a message to get across the network */
+       int              *kgn_mdd_timeout;      /* max time for ghal to hold an mdd in minutes */
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+       cfs_sysctl_table_header_t *kgn_sysctl;  /* sysctl interface */
+#endif
+} kgn_tunables_t;
+
+typedef struct kgn_mbox_info {
+       lnet_nid_t mbx_prev_nid;
+       unsigned long mbx_create_conn_memset;
+       unsigned long mbx_add_purgatory;
+       unsigned long mbx_detach_of_purgatory;
+       unsigned long mbx_release_from_purgatory;
+       unsigned long mbx_release_purg_active_dgram;
+} kgn_mbox_info_t;
+
+typedef struct kgn_fma_memblock {
+       struct list_head    gnm_bufflist;                          /* memblock is part of device's gnd_fma_buffs */
+       kgn_fmablk_state_t  gnm_state;                             /* how this memory was allocated & its state */
+       int                 gnm_hold_timeout;                      /* hold_timeout if used at unmap time */
+       int                 gnm_num_mboxs;                         /* total mboxes allocated */
+       int                 gnm_avail_mboxs;                       /* number of available mailboxes in the block */
+       int                 gnm_held_mboxs;                        /* number of purgatory held  mailboxes */
+       int                 gnm_mbox_size;                         /* size of the single mailbox */
+       int                 gnm_next_avail_mbox;                   /* next available mailbox */
+       long                gnm_max_timeout;                       /* max timeout for possible purgatory hold */
+       unsigned int        gnm_blk_size;                          /* how big is our hunk o memory ? */
+       void               *gnm_block;                             /* pointer to mem. block */
+       gni_mem_handle_t    gnm_hndl;                              /* mem. handle of the block */
+       unsigned long      *gnm_bit_array;                         /* bit array tracking allocation of mailboxes */
+       kgn_mbox_info_t    *gnm_mbox_info;                         /* array of mbox_information about each mbox */
+} kgn_fma_memblock_t;
+
+typedef struct kgn_device {
+       gni_nic_handle_t        gnd_handle;       /* device handle */
+       gni_cdm_handle_t        gnd_domain;       /* GNI communication domain */
+       gni_err_handle_t        gnd_err_handle;   /* device error handle */
+       unsigned long           gnd_sched_alive;  /* scheduler thread alive stamp */
+       gni_cq_handle_t         gnd_rcv_fma_cqh;  /* FMA rcv. completion queue handle */
+       gni_cq_handle_t         gnd_snd_rdma_cqh; /* rdma send completion queue handle */
+       gni_cq_handle_t         gnd_snd_fma_cqh;  /* fma send completion queue handle */
+       struct mutex            gnd_cq_mutex;     /* CQ access serialization */
+       __u32                   gnd_host_id;      /* ph. host ID of the NIC */
+       int                     gnd_id;           /* device id, also index in kgn_devices */
+       __u32                   gnd_nid;          /* ph host ID translated to NID */
+       struct list_head        gnd_fma_buffs;    /* list of FMA memory blocks */
+       struct semaphore        gnd_fmablk_sem;   /* semaphore for FMA block memory alloc/free */
+       spinlock_t              gnd_fmablk_lock;  /* lock for mbox alloc/release */
+       atomic_t                gnd_nfmablk;      /* # of fmablk live */
+       atomic_t                gnd_fmablk_vers;  /* gnd_fma_bufs stamp */
+       atomic_t                gnd_neps;         /* # EP allocated to conns */
+       short                   gnd_ready;        /* stuff to do in scheduler thread */
+       struct list_head        gnd_ready_conns;  /* connections ready to tx/rx */
+       struct list_head        gnd_map_tx;       /* TX: needing buffer mapping */
+       wait_queue_head_t       gnd_waitq;        /* scheduler wakeup */
+       spinlock_t              gnd_lock;         /* serialise gnd_ready_conns */
+       struct list_head        gnd_connd_peers;  /* peers waiting for a connection */
+       spinlock_t              gnd_connd_lock;   /* serialise connd_peers */
+       wait_queue_head_t       gnd_dgram_waitq;  /* dgram_mover thread wakeup */
+       wait_queue_head_t       gnd_dgping_waitq; /* dgram thread ping-pong */
+       int                     gnd_dgram_ready;  /* dgrams need movin' */
+       struct list_head       *gnd_dgrams;       /* nid hash to dgrams */
+       atomic_t                gnd_ndgrams;      /* # dgrams extant */
+       spinlock_t              gnd_dgram_lock;   /* serialize gnd_dgrams */
+       struct list_head        gnd_map_list;     /* list of all mapped regions */
+       int                     gnd_map_version;  /* version flag for map list */
+       atomic_t                gnd_n_mdd;        /* number of total MDD - fma, tx, etc */
+       atomic_t                gnd_n_mdd_held;   /* number of total MDD held - fma, tx, etc */
+       atomic_t                gnd_nq_map;       /* # queued waiting for mapping (MDD/GART) */
+       atomic64_t              gnd_nbytes_map;   /* bytes of total GART maps - fma, tx, etc */
+       __u32                   gnd_map_nphys;    /* # TX phys mappings */
+       __u32                   gnd_map_physnop;  /* # TX phys pages mapped */
+       __u32                   gnd_map_nvirt;    /* # TX virt mappings */
+       __u64                   gnd_map_virtnob;  /* # TX virt bytes mapped */
+       spinlock_t              gnd_map_lock;     /* serialize gnd_map_XXX */
+       struct list_head        gnd_rdmaq;        /* RDMA to be sent */
+       spinlock_t              gnd_rdmaq_lock;   /* play nice with others */
+       atomic64_t              gnd_rdmaq_bytes_out; /* # bytes authorized */
+       atomic64_t              gnd_rdmaq_bytes_ok;  /* # bytes allowed until deadline */
+       atomic_t                gnd_rdmaq_nstalls;   /* # stalls due to throttle */
+       unsigned long           gnd_rdmaq_deadline;  /* when does bucket roll over ? */
+       struct timer_list       gnd_rdmaq_timer;     /* wakey-wakey */
+       atomic_t                gnd_short_ntx;      /* TX stats: short messages */
+       atomic64_t              gnd_short_txbytes;  /* TX stats: short message payload */
+       atomic_t                gnd_rdma_ntx;       /* TX stats: rdma messages */
+       atomic64_t              gnd_rdma_txbytes;   /* TX stats: rdma message payload */
+       atomic_t                gnd_short_nrx;      /* RX stats: short messages */
+       atomic64_t              gnd_short_rxbytes;  /* RX stats: short message payload */
+       atomic_t                gnd_rdma_nrx;       /* RX stats: rdma messages */
+       atomic64_t              gnd_rdma_rxbytes;   /* RX stats: rdma message payload */
+       atomic_t                gnd_fast_try;       /* # of times fast send tried */
+       atomic_t                gnd_fast_ok;        /* # of times fast send ok */
+       atomic_t                gnd_fast_block;     /* # of times fast send blocked */
+       unsigned long           gnd_mutex_delay;
+       atomic_t                gnd_n_yield;
+       atomic_t                gnd_n_schedule;
+       atomic_t                gnd_canceled_dgrams; /* # of outstanding cancels */
+} kgn_device_t;
+
+typedef struct kgn_net {
+       struct list_head    gnn_list;           /* chain on kgnilnd_data::kgn_nets */
+       kgn_device_t       *gnn_dev;            /* device for this net */
+       lnet_ni_t          *gnn_ni;             /* network interface instance */
+       atomic_t            gnn_refcount;       /* # current references */
+       int                 gnn_shutdown;       /* lnd_shutdown set */
+       __u16               gnn_netnum;         /* stash netnum for quicker lookup */
+} kgn_net_t;
+
+static inline lnet_nid_t
+kgnilnd_lnd2lnetnid(lnet_nid_t ni_nid, lnet_nid_t kgnilnd_nid)
+{
+       return LNET_MKNID(LNET_NIDNET(ni_nid), LNET_NIDADDR(kgnilnd_nid));
+}
+
+static inline lnet_nid_t
+kgnilnd_lnet2lndnid(lnet_nid_t lnet_nid, lnet_nid_t kgnilnd_nid)
+{
+       return LNET_MKNID(LNET_NIDNET(kgnilnd_nid), LNET_NIDADDR(lnet_nid));
+}
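+
+/* Worked example (values assumed, illustrative only): with
+ * ni_nid == LNET_MKNID(net, 7) and kgnilnd_nid == LNET_MKNID(other_net, 42),
+ * kgnilnd_lnd2lnetnid(ni_nid, kgnilnd_nid) yields LNET_MKNID(net, 42) and
+ * kgnilnd_lnet2lndnid(ni_nid, kgnilnd_nid) yields LNET_MKNID(other_net, 7),
+ * i.e. one keeps the LNet network number and swaps in the device address,
+ * the other does the reverse. */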
+
+/* The code for this is a bit ugly - but really this just boils down to a __u64
+ * that can have various parts accessed separately.
+ *
+ * The lower 32 bits are the ID we give to SMSG for our completion event - it
+ * needs to be globally unique across all TXs currently in flight. We separate
+ * that out into the CQID so that we can reference the connection
+ * (kgnilnd_cqid2conn_locked) and then the msg_id to pull the actual TX out of
+ * the per-connection gnc_tx_ref_table.
+ *
+ * The upper 32 bits are extra stuff we put into the cookie so that this TX has
+ * a unique value to send with RDMA setup messages, ensuring the completion for
+ * those is unique across the wire. The extra 32 bits are there to keep TX id
+ * reuse separated.
+ */
+
+typedef struct kgn_tx_ev_id {
+       union {
+               __u64             txe_cookie;    /* are you my mommy ? */
+               struct {
+                       __u32     txe_chips;     /* extra bits to ensure ID unique across reuse */
+                       union {
+                               __u32     txe_smsg_id;      /* ID for SMSG CQ event */
+                               /* N.B: Never ever ever ever use the bit shifts directly,
+                                * you are just asking for a world of pain and are at the
+                                * mercy of the compiler layouts */
+                               struct {
+                                       __u32     txe_cqid :GNILND_CQID_NBITS;
+                                       __u32     txe_idx :GNILND_MSGID_TX_NBITS;
+                               };
+                       };
+               };
+       };
+} kgn_tx_ev_id_t;
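+
+/* Illustrative use (a sketch, not code from this patch; the exact in-memory
+ * bit layout is compiler-dependent, which is why the comment above forbids
+ * raw shifts): with GNILND_EXTRA_BITS == 1, GNILND_CQID_NBITS is 20 and
+ * GNILND_MSGID_TX_NBITS is 12. txe_smsg_id packs the conn's CQ id together
+ * with the TX's slot in gnc_tx_ref_table, while txe_chips only guards against
+ * reuse of the same SMSG id ("tx_slot" and "reuse_salt" are hypothetical):
+ *
+ *     kgn_tx_ev_id_t id;
+ *
+ *     id.txe_cqid  = conn->gnc_cqid;   // key for kgnilnd_cqid2conn_locked()
+ *     id.txe_idx   = tx_slot;          // index into gnc_tx_ref_table
+ *     id.txe_chips = reuse_salt;       // extra bits to separate id reuse
+ */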
+
+typedef struct kgn_dgram {
+       struct list_head     gndg_list;          /* on hash dev::gnd_dgrams */
+       kgn_dgram_state_t    gndg_state;         /* state of this dgram */
+       kgn_dgram_type_t     gndg_type;          /* REQ, NAK, etc */
+       __u32                gndg_magic;         /* safety word */
+       unsigned long        gndg_post_time;     /* time when we posted */
+       struct kgn_conn     *gndg_conn;          /* unbound conn with ep & smsg */
+       kgn_connreq_t        gndg_conn_out;      /* connreq from local node */
+       kgn_connreq_t        gndg_conn_in;       /* connreq from remote node */
+} kgn_dgram_t;
+
+typedef struct kgn_tx {                         /* message descriptor */
+       struct list_head          tx_list;      /* TX queues - peer, conn, rdma */
+       kgn_tx_list_state_t       tx_list_state;/* where in state machine is this TX ? */
+       struct list_head         *tx_list_p;    /* pointer to current list */
+       struct kgn_conn          *tx_conn;      /* owning conn */
+       lnet_msg_t               *tx_lntmsg[2]; /* ptl msgs to finalize on completion */
+       unsigned long             tx_qtime;     /* when tx started to wait for something (jiffies) */
+       unsigned long             tx_cred_wait; /* time spent waiting for smsg creds */
+       struct list_head          tx_map_list;  /* list entry on device map list */
+       unsigned int              tx_nob;       /* # bytes of payload */
+       int                       tx_buftype;   /* payload buffer type */
+       int                       tx_phys_npages; /* # physical pages */
+       gni_mem_handle_t          tx_map_key;   /* mapping key */
+       gni_mem_segment_t        *tx_phys;      /* page descriptors */
+       kgn_msg_t                 tx_msg;       /* FMA message buffer */
+       kgn_tx_ev_id_t            tx_id;        /* who are you, who ? who ? */
+       __u8                      tx_state;     /* state of the descriptor */
+       int                       tx_retrans;   /* retrans count of RDMA */
+       int                       tx_rc;        /* if we need to stash the ret code until we see completion */
+       void                     *tx_buffer;    /* source/sink buffer */
+       union {
+               gni_post_descriptor_t     tx_rdma_desc; /* rdma descriptor */
+               struct page              *tx_imm_pages[GNILND_MAX_IMMEDIATE/PAGE_SIZE];  /* page array to map kiov for immediate send */
+       };
+
+       /* we only use one or the other */
+       union {
+               kgn_putack_msg_t  tx_putinfo;   /* data for deferred rdma & re-try */
+               kgn_get_msg_t     tx_getinfo;   /* data for rdma re-try */
+       };
+} kgn_tx_t;
+
+typedef struct kgn_conn {
+       kgn_device_t       *gnc_device;         /* which device */
+       struct kgn_peer    *gnc_peer;           /* owning peer */
+       struct list_head    gnc_list;           /* stash on peer's conn list - or pending purgatory lists as we clear them */
+       struct list_head    gnc_hashlist;       /* stash in connection hash table */
+       struct list_head    gnc_schedlist;      /* schedule (on gnd_?_conns) for attention */
+       struct list_head    gnc_fmaq;           /* txs queued for FMA */
+       struct list_head    gnc_mdd_list;       /* hold list for MDD on hard conn reset */
+       __u64               gnc_peerstamp;      /* peer's unique stamp */
+       __u64               gnc_peer_connstamp; /* peer's unique connection stamp */
+       __u64               gnc_my_connstamp;   /* my unique connection stamp */
+       unsigned long       gnc_first_rx;       /* when I first received an FMA message (jiffies) */
+       unsigned long       gnc_last_tx;        /* when I last sent an FMA message (jiffies) */
+       unsigned long       gnc_last_rx;        /* when I last received an FMA message (jiffies) */
+       unsigned long       gnc_last_tx_cq;     /* when I last got a CQ event for a sent FMA (jiffies) */
+       unsigned long       gnc_last_rx_cq;     /* when I last got a CQ event for a received FMA (jiffies) */
+       unsigned long       gnc_last_noop_want; /* time I wanted to send NOOP */
+       unsigned long       gnc_last_noop_sent; /* time I did gni_smsg_send on NOOP */
+       unsigned long       gnc_last_noop_cq;   /* time when NOOP completed */
+       unsigned long       gnc_last_sched_ask; /* time when conn added to ready_conns */
+       unsigned long       gnc_last_sched_do;  /* time when conn processed from ready_conns */
+       atomic_t            gnc_reaper_noop;    /* # reaper triggered NOOP */
+       atomic_t            gnc_sched_noop;     /* # sched triggered NOOP */
+       unsigned int        gnc_timeout;        /* infer peer death if no rx for this many seconds */
+       __u32               gnc_cqid;           /* my completion callback id (non-unique) */
+       __u32               gnc_tx_seq;         /* tx msg sequence number */
+       __u32               gnc_rx_seq;         /* rx msg sequence number */
+       __u64               gnc_tx_retrans;     /* # retrans on SMSG */
+       atomic_t            gnc_nlive_fma;      /* # live FMA */
+       atomic_t            gnc_nq_rdma;        /* # queued (on device) RDMA */
+       atomic_t            gnc_nlive_rdma;     /* # live RDMA */
+       short               gnc_close_sent;     /* I've sent CLOSE */
+       short               gnc_close_recvd;    /* I've received CLOSE */
+       short               gnc_in_purgatory;   /* in the sin bin */
+       int                 gnc_error;          /* errno when conn being closed due to error */
+       int                 gnc_peer_error;     /* errno peer sent us on CLOSE */
+       kgn_conn_state_t    gnc_state;          /* connection state */
+       int                 gnc_scheduled;      /* being attended to */
+       atomic_t            gnc_refcount;       /* # users */
+       spinlock_t          gnc_list_lock;      /* serialise tx lists, max_rx_age */
+       gni_ep_handle_t     gnc_ephandle;       /* GNI endpoint */
+       kgn_fma_memblock_t *gnc_fma_blk;        /* pointer to fma block for our mailbox */
+       gni_smsg_attr_t     gnpr_smsg_attr;     /* my short msg. attributes */
+       spinlock_t          gnc_tx_lock;        /* protect tx alloc/free */
+       __u8                gnc_tx_bits[GNILND_MAX_MSG_ID/8]; /* bit table for tx id */
+       int                 gnc_next_tx;        /* next tx to use in tx_ref_table */
+       kgn_tx_t          **gnc_tx_ref_table;   /* table of TX descriptors for this conn */
+       int                 gnc_mbox_id;        /* id of mbox in fma_blk                 */
+       short               gnc_needs_detach;   /* flag set in detach_purgatory_all_locked so reaper will clear out purgatory */
+       short               gnc_needs_closing;  /* flag set in del_conns when called from kgnilnd_del_peer_or_conn */
+} kgn_conn_t;
+
+typedef struct kgn_mdd_purgatory {
+       gni_mem_handle_t    gmp_map_key;        /* mapping key */
+       struct list_head    gmp_list;           /* entry point for purgatory list */
+} kgn_mdd_purgatory_t;
+
+typedef struct kgn_peer {
+       struct list_head    gnp_list;                   /* stash on global peer list */
+       struct list_head    gnp_connd_list;             /* schedule on kgn_connd_peers */
+       struct list_head    gnp_conns;                  /* all active connections and all conns in purgatory for the peer */
+       struct list_head    gnp_tx_queue;               /* msgs waiting for a conn */
+       kgn_net_t          *gnp_net;                    /* net instance for this peer */
+       lnet_nid_t          gnp_nid;                    /* who's on the other end(s) */
+       atomic_t            gnp_refcount;               /* # users */
+       __u32               gnp_host_id;                /* ph. host ID of the peer */
+       short               gnp_connecting;             /* connection forming */
+       short               gnp_pending_unlink;         /* need last conn close to trigger unlink */
+       int                 gnp_last_errno;             /* last error conn saw */
+       unsigned long       gnp_last_alive;             /* last time I had valid comms */
+       int                 gnp_last_dgram_errno;       /* last error dgrams saw */
+       unsigned long       gnp_last_dgram_time;        /* last time I tried to connect */
+       unsigned long       gnp_reconnect_time;         /* CURRENT_SECONDS when reconnect OK */
+       unsigned long       gnp_reconnect_interval;     /* exponential backoff */
+       atomic_t            gnp_dirty_eps;              /* # of old but yet to be destroyed EPs from conns */
+} kgn_peer_t;
+
+/* kgn_rx_t is the struct handed to LNET as the private pointer for things
+ * like lnet_parse. It allows a single pointer to carry enough
+ * information for _recv and friends */
+typedef struct kgn_rx {
+       kgn_conn_t              *grx_conn;      /* connection */
+       kgn_msg_t               *grx_msg;       /* message */
+       lnet_msg_t              *grx_lntmsg;    /* lnet msg for this rx (eager only) */
+       int                      grx_eager;     /* if eager, we copied msg to somewhere */
+       struct timespec          grx_received;  /* time this msg received */
+} kgn_rx_t;
+
+typedef struct kgn_data {
+       int                     kgn_init;             /* initialisation state */
+       int                     kgn_shutdown;         /* shut down? */
+       int                     kgn_wc_kill;          /* Should I repost the WC */
+       atomic_t                kgn_nthreads;         /* # live threads */
+       int                     kgn_nresets;          /* number of stack resets */
+       int                     kgn_in_reset;         /* are we in stack reset ? */
+
+       kgn_device_t            kgn_devices[GNILND_MAXDEVS]; /* device/ptag/cq etc */
+       int                     kgn_ndevs;            /* # devices */
+
+       int                     kgn_ruhroh_running;   /* ruhroh thread is running */
+       int                     kgn_ruhroh_shutdown;  /* ruhroh thread should or is shut down */
+       wait_queue_head_t       kgn_ruhroh_waitq;     /* ruhroh thread wakeup */
+       int                     kgn_quiesce_trigger;  /* should we quiesce ? */
+       atomic_t                kgn_nquiesce;         /* how many quiesced ? */
+       struct semaphore        kgn_quiesce_sem;      /* serialize ruhroh task, startup and shutdown */
+       int                     kgn_needs_reset;      /* we need stack reset */
+
+       /* These next three members implement communication from gnilnd into
+        * the ruhroh task.  To ensure correct operation of the task, code that
+        * writes into them must use memory barriers to ensure that the changes
+        * are visible to other cores in the order the members appear below.  */
+       __u32                   kgn_quiesce_secs;     /* seconds to bump timeouts */
+       int                     kgn_bump_info_rdy;    /* we have info needed to bump */
+       int                     kgn_needs_pause;      /* we need to pause for network quiesce */
+
+       struct list_head       *kgn_nets;             /* hashtable of kgn_net instances */
+       struct rw_semaphore     kgn_net_rw_sem;       /* serialise gnn_shutdown, kgn_nets */
+
+       rwlock_t                kgn_peer_conn_lock;   /* stabilize peer/conn ops */
+       struct list_head       *kgn_peers;            /* hash table of all my known peers */
+       atomic_t                kgn_npeers;           /* # peers extant */
+       int                     kgn_peer_version;     /* version flag for peer tables */
+
+       struct list_head       *kgn_conns;            /* conns hashed by cqid */
+       atomic_t                kgn_nconns;           /* # connections extant */
+       __u64                   kgn_peerstamp;        /* when I started up */
+       __u64                   kgn_connstamp;        /* conn stamp generator */
+       int                     kgn_conn_version;     /* version flag for conn tables */
+       int                     kgn_next_cqid;        /* cqid generator */
+
+       long                    kgn_new_min_timeout;  /* minimum timeout on any new conn */
+       wait_queue_head_t       kgn_reaper_waitq;     /* reaper sleeps here */
+       spinlock_t              kgn_reaper_lock;      /* serialise */
+
+       cfs_mem_cache_t        *kgn_rx_cache;         /* rx descriptor space */
+       cfs_mem_cache_t        *kgn_tx_cache;         /* tx descriptor memory */
+       cfs_mem_cache_t        *kgn_tx_phys_cache;    /* tx phys descriptor memory */
+       atomic_t                kgn_ntx;              /* # tx in use */
+       cfs_mem_cache_t        *kgn_dgram_cache;      /* outgoing datagrams */
+
+       struct page          ***kgn_cksum_map_pages;  /* page arrays for mapping pages on checksum */
+       __u64                   kgn_cksum_npages;     /* Number of pages allocated for checksumming */
+       atomic_t                kgn_nvmap_cksum;      /* # times we vmapped for checksums */
+       atomic_t                kgn_nvmap_short;      /* # times we vmapped for short kiov */
+
+       atomic_t                kgn_nkmap_short;      /* # time we kmapped for a short kiov */
+       long                    kgn_rdmaq_override;   /* bytes per second override */
+
+       struct kmem_cache      *kgn_mbox_cache;       /* mailboxes from not-GART */
+
+       atomic_t                kgn_npending_unlink;  /* # of peers pending unlink */
+       atomic_t                kgn_npending_conns;   /* # of conns with pending closes */
+       atomic_t                kgn_npending_detach;  /* # of conns with a pending detach */
+
+} kgn_data_t;
+
+extern kgn_data_t         kgnilnd_data;
+extern kgn_tunables_t     kgnilnd_tunables;
+
+extern void kgnilnd_destroy_peer(kgn_peer_t *peer);
+extern void kgnilnd_destroy_conn(kgn_conn_t *conn);
+extern void kgnilnd_schedule_conn(kgn_conn_t *conn);
+
+static inline int
+kgnilnd_thread_start(int(*fn)(void *arg), void *arg, char *name, int id)
+{
+       struct task_struct *thrd = kthread_run(fn, arg, "%s_%02d", name, id);
+       if (IS_ERR(thrd))
+               return PTR_ERR(thrd);
+
+       atomic_inc(&kgnilnd_data.kgn_nthreads);
+       return 0;
+}
+
+static inline void
+kgnilnd_thread_fini(void)
+{
+       atomic_dec(&kgnilnd_data.kgn_nthreads);
+}
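+
+/* Usage sketch (illustrative; "kgnilnd_scheduler" and the thread name are
+ * assumptions, not taken from this hunk): the spawner bumps kgn_nthreads via
+ * kgnilnd_thread_start() and the thread itself calls kgnilnd_thread_fini()
+ * just before it exits:
+ *
+ *     rc = kgnilnd_thread_start(kgnilnd_scheduler, (void *)((long)i),
+ *                               "kgnilnd_sd", i);
+ *     if (rc != 0)
+ *             CERROR("can't spawn scheduler thread: %d\n", rc);
+ */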
+
+/* like mutex_trylock but with a jiffies spinner. This is to allow certain
+ * parts of the code to avoid a scheduler trip when the mutex is held
+ *
+ * Try to acquire the mutex atomically for 1 jiffy. Returns 1 if the mutex
+ * has been acquired successfully, and 0 on contention.
+ *
+ * NOTE: this function follows the spin_trylock() convention, so
+ * it is negated to the down_trylock() return values! Be careful
+ * about this when converting semaphore users to mutexes.
+ *
+ * This function must not be used in interrupt context. The
+ * mutex must be released by the same task that acquired it.
+ */
+static inline int kgnilnd_mutex_trylock(struct mutex *lock)
+{
+       int             ret;
+       unsigned long   timeout;
+
+       LASSERT(!in_interrupt());
+
+       for (timeout = jiffies + 1; time_before(jiffies, timeout);) {
+
+               ret = mutex_trylock(lock);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
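+
+/* Usage sketch (illustrative only): with the spin_trylock()-style return,
+ * a non-zero result means the mutex is held:
+ *
+ *     if (kgnilnd_mutex_trylock(&dev->gnd_cq_mutex)) {
+ *             // poke the CQ without risking a scheduler trip
+ *             mutex_unlock(&dev->gnd_cq_mutex);
+ *     } else {
+ *             // contended - fall back to the blocking path
+ *             mutex_lock(&dev->gnd_cq_mutex);
+ *             mutex_unlock(&dev->gnd_cq_mutex);
+ *     }
+ */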
+
+/* Copied from DEBUG_REQ in Lustre - the dance is needed to save stack space */
+
+extern void
+_kgnilnd_debug_msg(kgn_msg_t *msg,
+               struct libcfs_debug_msg_data *data, const char *fmt, ... );
+
+#define kgnilnd_debug_msg(msgdata, mask, cdls, msg, fmt, a...)                \
+do {                                                                          \
+       CFS_CHECK_STACK(msgdata, mask, cdls);                                 \
+                                                                             \
+       if (((mask) & D_CANTMASK) != 0 ||                                     \
+           ((libcfs_debug & (mask)) != 0 &&                                  \
+            (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0))                \
+               _kgnilnd_debug_msg((msg), msgdata, fmt, ##a);                 \
+} while(0)
+
+/* for most callers (level is a constant) this is resolved at compile time */
+#define GNIDBG_MSG(level, msg, fmt, args...)                                  \
+do {                                                                          \
+       if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) {                   \
+           static cfs_debug_limit_state_t cdls;                              \
+           LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls);                \
+           kgnilnd_debug_msg(&msgdata, level, &cdls, msg,                    \
+                             "$$ "fmt" from %s ", ## args,                   \
+                             libcfs_nid2str((msg)->gnm_srcnid));             \
+       } else {                                                              \
+           LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL);                 \
+           kgnilnd_debug_msg(&msgdata, level, NULL, msg,                     \
+                             "$$ "fmt" from %s ", ## args,                   \
+                             libcfs_nid2str((msg)->gnm_srcnid));             \
+       }                                                                     \
+} while (0)
+
+/* user puts 'to nid' in msg for us */
+#define GNIDBG_TOMSG(level, msg, fmt, args...)                                \
+do {                                                                          \
+       if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) {                   \
+           static cfs_debug_limit_state_t cdls;                              \
+           LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls);                \
+           kgnilnd_debug_msg(&msgdata, level, &cdls, msg,                    \
+                             "$$ "fmt" ", ## args);                          \
+       } else {                                                              \
+           LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL);                 \
+           kgnilnd_debug_msg(&msgdata, level, NULL, msg,                     \
+                             "$$ "fmt" ", ## args);                          \
+       }                                                                     \
+} while (0)
+
+extern void
+_kgnilnd_debug_conn(kgn_conn_t *conn,
+               struct libcfs_debug_msg_data *data, const char *fmt, ... );
+
+#define kgnilnd_debug_conn(msgdata, mask, cdls, conn, fmt, a...)               \
+do {                                                                           \
+       CFS_CHECK_STACK(msgdata, mask, cdls);                                  \
+                                                                              \
+       if (((mask) & D_CANTMASK) != 0 ||                                      \
+           ((libcfs_debug & (mask)) != 0 &&                                   \
+            (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0))                 \
+               _kgnilnd_debug_conn((conn), msgdata, fmt, ##a);                \
+} while(0)
+
+/* for most callers (level is a constant) this is resolved at compile time */
+#define GNIDBG_CONN(level, conn, fmt, args...)                                  \
+do {                                                                            \
+       if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) {                     \
+           static cfs_debug_limit_state_t cdls;                                \
+           LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls);                  \
+           kgnilnd_debug_conn(&msgdata, level, &cdls, conn,                    \
+                              "$$ "fmt" ", ## args);                           \
+       } else {                                                                \
+           LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL);                   \
+           kgnilnd_debug_conn(&msgdata, level, NULL, conn,                     \
+                              "$$ "fmt" ", ## args);                           \
+       }                                                                       \
+} while (0)
+
+extern void
+_kgnilnd_debug_tx(kgn_tx_t *tx,
+               struct libcfs_debug_msg_data *data, const char *fmt, ... );
+
+#define kgnilnd_debug_tx(msgdata, mask, cdls, tx, fmt, a...)                   \
+do {                                                                           \
+       CFS_CHECK_STACK(msgdata, mask, cdls);                                  \
+                                                                              \
+       if (((mask) & D_CANTMASK) != 0 ||                                      \
+           ((libcfs_debug & (mask)) != 0 &&                                   \
+            (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0))                 \
+               _kgnilnd_debug_tx((tx), msgdata, fmt, ##a);                    \
+} while(0)
+
+/* for most callers (level is a constant) this is resolved at compile time */
+#define GNIDBG_TX(level, tx, fmt, args...)                                      \
+do {                                                                            \
+       if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) {                     \
+           static cfs_debug_limit_state_t cdls;                                \
+           LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls);                  \
+           kgnilnd_debug_tx(&msgdata, level, &cdls, tx,                        \
+                             "$$ "fmt" ", ## args);                            \
+       } else {                                                                \
+           LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL);                   \
+           kgnilnd_debug_tx(&msgdata, level, NULL, tx,                         \
+                             "$$ "fmt" ", ## args);                            \
+       }                                                                       \
+} while (0)
+
+#define GNITX_ASSERTF(tx, cond, fmt, a...)                                      \
+({                                                                              \
+       if (unlikely(!(cond))) {                                                \
+               GNIDBG_TX(D_EMERG, tx, "ASSERTION(" #cond ") failed:" fmt, a);  \
+               LBUG();                                                         \
+       }                                                                       \
+})
+
+#define GNILND_IS_QUIESCED                                                      \
+       (atomic_read(&kgnilnd_data.kgn_nquiesce) ==                             \
+               atomic_read(&kgnilnd_data.kgn_nthreads))
+
+#define KGNILND_SPIN_QUIESCE                                                 \
+do {                                                                         \
+       /* E.T phone home */                                                 \
+       atomic_inc(&kgnilnd_data.kgn_nquiesce);                              \
+       CDEBUG(D_NET, "Waiting for thread pause to be over...\n");           \
+       while (kgnilnd_data.kgn_quiesce_trigger) {                           \
+               set_current_state(TASK_INTERRUPTIBLE);                       \
+               cfs_schedule_timeout_and_set_state(TASK_INTERRUPTIBLE,       \
+                       cfs_time_seconds(1));                                \
+       }                                                                    \
+       /* Mom, my homework is done */                                       \
+       CDEBUG(D_NET, "Waking up from thread pause\n");                      \
+       atomic_dec(&kgnilnd_data.kgn_nquiesce);                              \
+} while(0)
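+
+/* Illustrative pattern (a sketch of the intended use, not code from this
+ * hunk): long-running threads check the trigger at the top of their loop and
+ * park themselves until the pause is over:
+ *
+ *     while (!kgnilnd_data.kgn_shutdown) {
+ *             if (unlikely(kgnilnd_data.kgn_quiesce_trigger))
+ *                     KGNILND_SPIN_QUIESCE;
+ *             // ... do one unit of work ...
+ *     }
+ *
+ * Once every counted thread has bumped kgn_nquiesce, GNILND_IS_QUIESCED
+ * becomes true, which the shutdown path above checks before clearing the
+ * trigger to wake everyone back up. */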
+
+/* use macros for addref/decref to get the calling function name in the CDEBUG */
+#ifndef LIBCFS_DEBUG
+#error "this code uses actions inside LASSERT for ref counting"
+#endif
+
+#define kgnilnd_admin_addref(atomic)                                     \
+do {                                                                            \
+       int     val = atomic_inc_return(&atomic);                               \
+       LASSERTF(val > 0,  #atomic " refcount %d\n", val);                       \
+       CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val);                       \
+} while (0)
+
+#define kgnilnd_admin_decref(atomic)                                     \
+do {                                                                            \
+       int     val = atomic_dec_return(&atomic);                               \
+       LASSERTF(val >= 0, #atomic " refcount %d\n", val);                       \
+       CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val);                       \
+} while (0)
+
+#define kgnilnd_net_addref(net)                                                 \
+do {                                                                            \
+       int     val = atomic_inc_return(&net->gnn_refcount);                    \
+       LASSERTF(val > 1, "net %p refcount %d\n", net, val);                    \
+       CDEBUG(D_NETTRACE, "net %p->%s++ (%d)\n", net,                          \
+               libcfs_nid2str(net->gnn_ni->ni_nid), val);                      \
+} while (0)
+
+#define kgnilnd_net_decref(net)                                                 \
+do {                                                                            \
+       int     val = atomic_dec_return(&net->gnn_refcount);                    \
+       LASSERTF(val >= 0, "net %p refcount %d\n", net, val);                   \
+       CDEBUG(D_NETTRACE, "net %p->%s-- (%d)\n", net,                          \
+              libcfs_nid2str(net->gnn_ni->ni_nid), val);                       \
+} while (0)
+
+#define kgnilnd_peer_addref(peer)                                               \
+do {                                                                            \
+       int     val = atomic_inc_return(&peer->gnp_refcount);                   \
+       LASSERTF(val > 1, "peer %p refcount %d\n", peer, val);                  \
+       CDEBUG(D_NETTRACE, "peer %p->%s++ (%d)\n", peer,                        \
+              libcfs_nid2str(peer->gnp_nid), val);                             \
+} while (0)
+
+#define kgnilnd_peer_decref(peer)                                               \
+do {                                                                            \
+       int     val = atomic_dec_return(&peer->gnp_refcount);                   \
+       LASSERTF(val >= 0, "peer %p refcount %d\n", peer, val);                 \
+       CDEBUG(D_NETTRACE, "peer %p->%s--(%d)\n", peer,                         \
+              libcfs_nid2str(peer->gnp_nid), val);                             \
+       if (atomic_read(&peer->gnp_refcount) == 0)                              \
+               kgnilnd_destroy_peer(peer);                                     \
+} while(0)
+
+#define kgnilnd_conn_addref(conn)                                       \
+do {                                                                    \
+       int     val;                                                    \
+                                                                       \
+       smp_wmb();                                                      \
+       val = atomic_inc_return(&conn->gnc_refcount);                   \
+       LASSERTF(val >= 0, "conn %p refc %d to %s\n",                   \
+               conn, val,                                              \
+               conn->gnc_peer                                          \
+                       ? libcfs_nid2str(conn->gnc_peer->gnp_nid)       \
+                       : "<?>");                                       \
+       CDEBUG(D_NETTRACE, "conn %p->%s++ (%d)\n", conn,                \
+               conn->gnc_peer                                          \
+                       ? libcfs_nid2str(conn->gnc_peer->gnp_nid)       \
+                       : "<?>",                                        \
+               val);                                                   \
+} while (0)
+
+/* we hijack conn_decref && gnc_refcount = 1 to allow us to push the conn
+ * through the scheduler thread to get the EP destroyed. This avoids some
+ * messy semaphore business and allows us to reuse the connd_list and existing
+ * linkage and avoid creating extra lists just for destroying EPs */
+
+/* Safety Disclaimer:
+ * Q: If we decrement the refcount and then check it again, is it possible that
+ *    another caller could have passed through this macro concurrently? If so,
+ *    then it is possible that both will attempt to call kgnilnd_destroy_conn().
+ *
+ * A: Yes, entirely possible in most cases, but we can't get concurrent users
+ * once we are refcount <= 2. It hinges around gnc_state and membership of
+ * gnc_hashlist. There are two ways to find a connection - either ask for
+ * it from the peer, kgnilnd_find_conn_locked(peer) or from the CQ id,
+ * kgnilnd_cqid2conn_locked(id). While a conn is live, we'll have at least
+ * 4 refcounts
+ *
+ * - #1 from create (kgnilnd_create_conn)
+ * - #2 for EP (kgnilnd_create_conn)
+ * - #3 - living on peer (gnc_list, kgnilnd_finish_connect)
+ * - #4 living in global hash (gnc_hashlist, kgnilnd_finish_connect).
+ *
+ * Actually, only 3 live, as at the end of kgnilnd_finish_connect, we drop:
+ * - #1 - the ref the dgram inherited from kgnilnd_create_conn.
+ *
+ * There could be more from TX descriptors during the lifetime of a live
+ * conn.
+ *
+ * If we nuke the conn before finish_connect, we won't have parallel paths
+ * because nobody besides the dgram handler for the single outstanding
+ * dgram can find the connection as it isn't in any searchable tables yet.
+ *
+ * This leaves connection close, we'll drop 2 refs (#4 and #3) but only
+ * after calling kgnilnd_schedule_conn, which would add a new ref (#5). At
+ * this point gnc_refcount=2 (#2, #5). We have a 'maybe' send of the CLOSE
+ * now on the next scheduler loop, this could be #6 (schedule_conn again)
+ * and #7 (TX on gnc_fmaq). Both would be cleared quickly as that TX is
+ * sent. Now the gnc_state == CLOSED, so we hit
+ * kgnilnd_complete_closed_conn. At this point, nobody can 'find' this conn
+ * - we've nuked them from the peer and CQ id tables, so we own them and
+ * are guaranteed serial access - hence the complete lack of conn list
+ * locking in kgnilnd_complete_closed_conn. We are free then to mark the
+ * conn DESTROY_EP (add #6 for schedule_conn), then lose #5 in
+ * kgnilnd_process_conns. Then the next scheduler loop would call
+ * kgnilnd_destroy_conn_ep (drop #2 for EP) and lose #6 (refcount=0) in
+ * kgnilnd_process_conns.
+ *
+ * Clearly, we are totally safe. Clearly.
+ */
+
+#define kgnilnd_conn_decref(conn)                                       \
+do {                                                                    \
+       int     val;                                                    \
+                                                                       \
+       smp_wmb();                                                      \
+       val = atomic_dec_return(&conn->gnc_refcount);                   \
+       LASSERTF(val >= 0, "conn %p refc %d to %s\n",                   \
+               conn, val,                                              \
+               conn->gnc_peer                                          \
+                       ? libcfs_nid2str(conn->gnc_peer->gnp_nid)       \
+                       : "<?>");                                       \
+       CDEBUG(D_NETTRACE, "conn %p->%s-- (%d)\n", conn,                \
+               conn->gnc_peer                                          \
+                       ? libcfs_nid2str(conn->gnc_peer->gnp_nid)       \
+                       : "<?>",                                        \
+               val);                                                   \
+       smp_rmb();                                                      \
+       if ((atomic_read(&conn->gnc_refcount) == 1) &&                  \
+           (conn->gnc_ephandle != NULL) &&                             \
+           (conn->gnc_state != GNILND_CONN_DESTROY_EP)) {              \
+               set_mb(conn->gnc_state, GNILND_CONN_DESTROY_EP);        \
+               kgnilnd_schedule_conn(conn);                            \
+       } else if (atomic_read(&conn->gnc_refcount) == 0) {             \
+               kgnilnd_destroy_conn(conn);                             \
+       }                                                               \
+} while (0)
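+
+/* A minimal sketch of the pairing described above (the helper name
+ * kgnilnd_sample_peek_conn is hypothetical): every kgnilnd_conn_addref()
+ * is matched by a kgnilnd_conn_decref(), and the final decref either
+ * schedules the conn for EP destruction or frees it outright. */
+#if 0
+static inline void
+kgnilnd_sample_peek_conn(kgn_conn_t *conn)
+{
+       kgnilnd_conn_addref(conn);      /* pin conn while we look at it */
+       CDEBUG(D_NET, "conn %p error %d\n", conn, conn->gnc_error);
+       kgnilnd_conn_decref(conn);      /* may schedule DESTROY_EP or free */
+}
+#endif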
+
+static inline struct list_head *
+kgnilnd_nid2peerlist(lnet_nid_t nid)
+{
+       unsigned int hash = ((unsigned int)LNET_NIDADDR(nid)) % *kgnilnd_tunables.kgn_peer_hash_size;
+
+       RETURN(&kgnilnd_data.kgn_peers[hash]);
+}
+
+static inline struct list_head *
+kgnilnd_netnum2netlist(__u16 netnum)
+{
+       unsigned int hash = ((unsigned int) netnum) % *kgnilnd_tunables.kgn_net_hash_size;
+
+       RETURN(&kgnilnd_data.kgn_nets[hash]);
+}
+
+static inline int
+kgnilnd_peer_active(kgn_peer_t *peer)
+{
+       /* Am I in the peer hash table? */
+       return (!list_empty(&peer->gnp_list));
+}
+
+/* need write_lock on kgn_peer_conn_lock */
+static inline int
+kgnilnd_can_unlink_peer_locked(kgn_peer_t *peer)
+{
+       CDEBUG(D_NET, "peer 0x%p->%s conns? %d tx? %d\n",
+               peer, libcfs_nid2str(peer->gnp_nid),
+               !list_empty(&peer->gnp_conns),
+               !list_empty(&peer->gnp_tx_queue));
+
+       /* kgn_peer_conn_lock protects us from conflict with
+        * kgnilnd_peer_notify and gnp_persistent */
+       RETURN ((list_empty(&peer->gnp_conns)) &&
+               (list_empty(&peer->gnp_tx_queue)));
+}
+
+/* returns positive if error was for a clean shutdown of conn */
+static inline int
+kgnilnd_conn_clean_errno(int errno)
+{
+       /*  - ESHUTDOWN - LND is unloading
+        *  - EUCLEAN - admin requested via "lctl del_peer"
+        *  - ENETRESET - admin requested via "lctl disconnect"
+        *  - ENOTRECOVERABLE - stack reset
+        *  - EISCONN - cleared via "lctl push"
+        *  not doing ESTALE - that isn't clean */
+       RETURN ((errno == 0) ||
+               (errno == -ESHUTDOWN) ||
+               (errno == -EUCLEAN) ||
+               (errno == -ENETRESET) ||
+               (errno == -EISCONN) ||
+               (errno == -ENOTRECOVERABLE));
+}
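+
+/* Hypothetical caller fragment: close paths can use the helper above to
+ * pick how loudly to report a disconnect (the real call sites live in the
+ * .c files; this is only an illustration). */
+#if 0
+       if (kgnilnd_conn_clean_errno(conn->gnc_error))
+               CDEBUG(D_NET, "conn %p closed cleanly: %d\n",
+                      conn, conn->gnc_error);
+       else
+               CNETERR("conn %p closed with error: %d\n",
+                       conn, conn->gnc_error);
+#endif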
+
+/* returns positive if error results in purgatory hold */
+static inline int
+kgnilnd_check_purgatory_errno(int errno)
+{
+       /* We don't want to save to the purgatory lists in these cases:
+        *  - EUCLEAN - admin requested via "lctl del_peer"
+        *  - ESHUTDOWN - LND is unloading
+        */
+       RETURN ((errno != -ESHUTDOWN) &&
+               (errno != -EUCLEAN));
+}
+
+/* returns positive if a purgatory hold is needed */
+static inline int
+kgnilnd_check_purgatory_conn(kgn_conn_t *conn)
+{
+       int loopback = 0;
+
+       if (conn->gnc_peer) {
+               loopback = conn->gnc_peer->gnp_nid ==
+                      conn->gnc_peer->gnp_net->gnn_ni->ni_nid;
+       } else {
+               /* short circuit - a conn that didn't complete
+                * setup never needs a purgatory hold */
+               RETURN(0);
+       }
+       CDEBUG(D_NETTRACE, "conn 0x%p->%s loopback %d close_recvd %d\n",
+               conn, conn->gnc_peer ?
+                               libcfs_nid2str(conn->gnc_peer->gnp_nid) :
+                               "<?>",
+               loopback, conn->gnc_close_recvd);
+
+       /* we only use a purgatory hold if we've not received the CLOSE msg
+        * from our peer - without that message, we can't know the state of
+        * the other end of this connection and must put it into purgatory
+        * to prevent reuse and corruption.
+        * The theory is that a TX error can be communicated in all other cases
+        */
+       RETURN(likely(!loopback) && !conn->gnc_close_recvd &&
+               kgnilnd_check_purgatory_errno(conn->gnc_error));
+}
+
+static inline const char *
+kgnilnd_tx_state2str(kgn_tx_list_state_t state);
+
+static inline struct list_head *
+kgnilnd_tx_state2list(kgn_peer_t *peer, kgn_conn_t *conn,
+                       kgn_tx_list_state_t to_state)
+{
+       switch (to_state) {
+       case GNILND_TX_PEERQ:
+               return &peer->gnp_tx_queue;
+       case GNILND_TX_FMAQ:
+               return &conn->gnc_fmaq;
+       case GNILND_TX_LIVE_FMAQ:
+       case GNILND_TX_LIVE_RDMAQ:
+       case GNILND_TX_DYING:
+               return NULL;
+       case GNILND_TX_MAPQ:
+               return &conn->gnc_device->gnd_map_tx;
+       case GNILND_TX_RDMAQ:
+               return &conn->gnc_device->gnd_rdmaq;
+       default:
+               /* IDLE, FREED or ALLOCD is not valid "on list" state */
+               CERROR("invalid state requested: %s\n",
+                       kgnilnd_tx_state2str(to_state));
+               LBUG();
+               break;
+       }
+}
+
+/* should hold tx, conn or peer lock when calling */
+static inline void
+kgnilnd_tx_add_state_locked(kgn_tx_t *tx, kgn_peer_t *peer,
+                       kgn_conn_t *conn, kgn_tx_list_state_t state,
+                       int add_tail)
+{
+       struct list_head        *list = NULL;
+
+       /* make sure we have a sane TX state to start */
+       GNITX_ASSERTF(tx, (tx->tx_list_p == NULL &&
+                 tx->tx_list_state == GNILND_TX_ALLOCD) &&
+               list_empty(&tx->tx_list),
+               "bad state with tx_list %s",
+               list_empty(&tx->tx_list) ? "empty" : "not empty");
+
+       /* WTF - you are already on that state buttmunch */
+       GNITX_ASSERTF(tx, state != tx->tx_list_state,
+                     "already at %s", kgnilnd_tx_state2str(state));
+
+       /* get proper list from the state requested */
+       list = kgnilnd_tx_state2list(peer, conn, state);
+
+       /* add refcount */
+       switch (state) {
+       case GNILND_TX_PEERQ:
+               kgnilnd_peer_addref(peer);
+               break;
+       case GNILND_TX_ALLOCD:
+               /* no refs needed */
+               break;
+       case GNILND_TX_FMAQ:
+               kgnilnd_conn_addref(conn);
+               break;
+       case GNILND_TX_MAPQ:
+               atomic_inc(&conn->gnc_device->gnd_nq_map);
+               kgnilnd_conn_addref(conn);
+               break;
+       case GNILND_TX_LIVE_FMAQ:
+               atomic_inc(&conn->gnc_nlive_fma);
+               kgnilnd_conn_addref(conn);
+               break;
+       case GNILND_TX_LIVE_RDMAQ:
+               atomic_inc(&conn->gnc_nlive_rdma);
+               kgnilnd_conn_addref(conn);
+               break;
+       case GNILND_TX_RDMAQ:
+               atomic_inc(&conn->gnc_nq_rdma);
+               kgnilnd_conn_addref(conn);
+               break;
+       case GNILND_TX_DYING:
+               kgnilnd_conn_addref(conn);
+               break;
+       default:
+               CERROR("invalid state requested: %s\n",
+                       kgnilnd_tx_state2str(state));
+               LBUG();
+               break;
+       }
+
+       /* if this changes, change kgnilnd_alloc_tx */
+       tx->tx_list_state = state;
+
+       /* some states don't have lists - we track them in the per conn
+        * TX table instead. Waste not, want not! */
+       if (list != NULL) {
+               tx->tx_list_p = list;
+               if (add_tail)
+                       list_add_tail(&tx->tx_list, list);
+               else
+                       list_add(&tx->tx_list, list);
+       } else {
+               /* set dummy list_p to make bookkeeping happy and let debugging
+                * be a hair easier */
+               tx->tx_list_p = (void *)state;
+       }
+
+       GNIDBG_TX(D_NET, tx, "onto %s->0x%p",
+                 kgnilnd_tx_state2str(state), list);
+}
+
+static inline void
+kgnilnd_tx_del_state_locked(kgn_tx_t *tx, kgn_peer_t *peer,
+                       kgn_conn_t *conn, kgn_tx_list_state_t new_state)
+{
+       /* There is only one "off-list" state */
+       GNITX_ASSERTF(tx, new_state == GNILND_TX_ALLOCD,
+                     "invalid new_state %s", kgnilnd_tx_state2str(new_state));
+
+       /* new_state == ALLOCD means we are deallocating this tx,
+        * so make sure it was on a valid list to start with */
+       GNITX_ASSERTF(tx, (tx->tx_list_p != NULL) &&
+                     (((tx->tx_list_state == GNILND_TX_LIVE_FMAQ) ||
+                       (tx->tx_list_state == GNILND_TX_LIVE_RDMAQ) ||
+                       (tx->tx_list_state == GNILND_TX_DYING)) == list_empty(&tx->tx_list)),
+                     "bad state", NULL);
+
+       GNIDBG_TX(D_NET, tx, "off %p", tx->tx_list_p);
+
+       /* drop refcount */
+       switch (tx->tx_list_state) {
+       case GNILND_TX_PEERQ:
+               kgnilnd_peer_decref(peer);
+               break;
+       case GNILND_TX_FREED:
+       case GNILND_TX_IDLE:
+       case GNILND_TX_ALLOCD:
+               /* no refs needed */
+               break;
+       case GNILND_TX_DYING:
+               kgnilnd_conn_decref(conn);
+               break;
+       case GNILND_TX_FMAQ:
+               kgnilnd_conn_decref(conn);
+               break;
+       case GNILND_TX_MAPQ:
+               atomic_dec(&conn->gnc_device->gnd_nq_map);
+               kgnilnd_conn_decref(conn);
+               break;
+       case GNILND_TX_LIVE_FMAQ:
+               atomic_dec(&conn->gnc_nlive_fma);
+               kgnilnd_conn_decref(conn);
+               break;
+       case GNILND_TX_LIVE_RDMAQ:
+               atomic_dec(&conn->gnc_nlive_rdma);
+               kgnilnd_conn_decref(conn);
+               break;
+       case GNILND_TX_RDMAQ:
+               atomic_dec(&conn->gnc_nq_rdma);
+               kgnilnd_conn_decref(conn);
+               break;
+       /* don't need to assert on default, already did in set */
+       }
+
+       /* for ALLOCD, this might already be true, but no harm doing it again */
+       list_del_init(&tx->tx_list);
+       tx->tx_list_p = NULL;
+       tx->tx_list_state = new_state;
+}
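+
+/* Illustrative fragment of the add/del pairing: a TX is always brought back
+ * to GNILND_TX_ALLOCD before it is placed on its next list, satisfying the
+ * assertions in both helpers (real callers hold the relevant locks). */
+#if 0
+       /* move tx from the peer queue onto the conn's FMA queue */
+       kgnilnd_tx_del_state_locked(tx, peer, conn, GNILND_TX_ALLOCD);
+       kgnilnd_tx_add_state_locked(tx, peer, conn, GNILND_TX_FMAQ, 1);
+#endif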
+
+static inline int
+kgnilnd_tx_mapped(kgn_tx_t *tx)
+{
+       return (tx->tx_buftype == GNILND_BUF_VIRT_MAPPED ||
+               tx->tx_buftype == GNILND_BUF_PHYS_MAPPED);
+}
+
+static inline struct list_head *
+kgnilnd_cqid2connlist(__u32 cqid)
+{
+       unsigned int hash = cqid % *kgnilnd_tunables.kgn_peer_hash_size;
+
+       return (&kgnilnd_data.kgn_conns[hash]);
+}
+
+static inline kgn_conn_t *
+kgnilnd_cqid2conn_locked(__u32 cqid)
+{
+       struct list_head *conns = kgnilnd_cqid2connlist(cqid);
+       struct list_head *tmp;
+       kgn_conn_t       *conn;
+
+       list_for_each(tmp, conns) {
+               conn = list_entry(tmp, kgn_conn_t, gnc_hashlist);
+
+               if (conn->gnc_cqid == cqid)
+                       return conn;
+       }
+
+       return NULL;
+}
+
+/* returns 1..GNILND_MAX_CQID on success, 0 on failure */
+static inline __u32
+kgnilnd_get_cqid_locked(void)
+{
+       int     looped = 0;
+       __u32   cqid;
+
+       do {
+               cqid = kgnilnd_data.kgn_next_cqid++;
+               if (kgnilnd_data.kgn_next_cqid >= GNILND_MAX_CQID) {
+                       if (looped) {
+                               return 0;
+                       }
+                       kgnilnd_data.kgn_next_cqid = 1;
+                       looped = 1;
+               }
+       } while (kgnilnd_cqid2conn_locked(cqid) != NULL);
+
+       return cqid;
+}
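+
+/* Hypothetical fragment: callers treat a return of 0 as "CQ id space
+ * exhausted" and fail the connection setup; the error code shown here is
+ * illustrative only. */
+#if 0
+       conn->gnc_cqid = kgnilnd_get_cqid_locked();
+       if (conn->gnc_cqid == 0)
+               rc = -E2BIG;    /* no free CQ ids */
+#endif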
+
+static inline void
+kgnilnd_validate_tx_ev_id(kgn_tx_ev_id_t *ev_id, kgn_tx_t **txp, kgn_conn_t **connp)
+{
+       kgn_tx_t        *tx = NULL;
+       kgn_conn_t      *conn = NULL;
+
+       /* set to NULL so any early return is an error */
+       *txp = NULL;
+       *connp = NULL;
+
+       LASSERTF((ev_id->txe_idx > 0) &&
+                (ev_id->txe_idx < GNILND_MAX_MSG_ID),
+               "bogus txe_idx %d >= %d\n",
+               ev_id->txe_idx, GNILND_MAX_MSG_ID);
+
+       LASSERTF((ev_id->txe_cqid > 0) &&
+                (ev_id->txe_cqid < GNILND_MAX_CQID),
+               "bogus txe_cqid %d >= %d\n",
+               ev_id->txe_cqid, GNILND_MAX_CQID);
+
+       read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       conn = kgnilnd_cqid2conn_locked(ev_id->txe_cqid);
+
+       if (conn == NULL) {
+               /* Conn was destroyed? */
+               read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+               CDEBUG(D_NET, "CQID %d lookup failed\n", ev_id->txe_cqid);
+               return;
+       }
+       /* just insurance */
+       kgnilnd_conn_addref(conn);
+       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       /* we know this is safe - as the TX won't be reused until AFTER
+        * the conn is unlinked from the cqid hash, so we can use the TX
+        * (serializing to avoid any cache oddness) freely from the conn tx ref table */
+
+       spin_lock(&conn->gnc_tx_lock);
+       tx = conn->gnc_tx_ref_table[ev_id->txe_idx];
+       spin_unlock(&conn->gnc_tx_lock);
+
+       /* We could have a tx that was cleared out by other forces
+        * lctl disconnect or del_peer. */
+       if (tx == NULL) {
+               CNETERR("txe_idx %d is gone, ignoring event\n", ev_id->txe_idx);
+               kgnilnd_conn_decref(conn);
+               return;
+       }
+
+       /* check tx->tx_msg magic to make sure kgni didn't eat it */
+       GNITX_ASSERTF(tx, tx->tx_msg.gnm_magic == GNILND_MSG_MAGIC,
+                     "came back from kgni with bad magic %x", tx->tx_msg.gnm_magic);
+
+       GNITX_ASSERTF(tx, tx->tx_id.txe_idx == ev_id->txe_idx,
+                     "conn 0x%p->%s tx_ref_table hosed: wanted txe_idx %d "
+                     "found tx %p txe_idx %d",
+                     conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+                     ev_id->txe_idx, tx, tx->tx_id.txe_idx);
+
+       GNITX_ASSERTF(tx, tx->tx_conn != NULL, "tx with NULL connection", NULL);
+
+       GNITX_ASSERTF(tx, tx->tx_conn == conn, "tx conn does not equal conn", NULL);
+
+       *txp = tx;
+       *connp = conn;
+
+       GNIDBG_TX(D_NET, tx, "validated to 0x%p", conn);
+}
+
+/* set_normalized_timespec isn't exported from the kernel, so
+ * we need to do the same thing inline */
+static inline struct timespec
+kgnilnd_ts_sub(struct timespec lhs, struct timespec rhs)
+{
+       time_t                  sec;
+       long                    nsec;
+       struct timespec         ts;
+
+       sec = lhs.tv_sec - rhs.tv_sec;
+       nsec = lhs.tv_nsec - rhs.tv_nsec;
+
+       while (nsec >= NSEC_PER_SEC) {
+               nsec -= NSEC_PER_SEC;
+               ++sec;
+       }
+       while (nsec < 0) {
+               nsec += NSEC_PER_SEC;
+               --sec;
+       }
+       ts.tv_sec = sec;
+       ts.tv_nsec = nsec;
+       return ts;
+}
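+
+/* Usage sketch (hypothetical variables): measuring an elapsed interval with
+ * the helper above. */
+#if 0
+       struct timespec start, now, delta;
+
+       getnstimeofday(&start);
+       /* ... do some work ... */
+       getnstimeofday(&now);
+       delta = kgnilnd_ts_sub(now, start);
+       CDEBUG(D_NET, "took %ld.%09ld sec\n",
+              (long)delta.tv_sec, delta.tv_nsec);
+#endif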
+
+static inline int
+kgnilnd_count_list(struct list_head *q)
+{
+       struct list_head *e;
+       int               n = 0;
+
+       list_for_each(e, q) {
+               n++;
+       }
+
+       return n;
+}
+
+/* kgnilnd_find_net adds a reference to the net it finds
+ * this is so the net will not be removed before the calling function
+ * has time to use the data returned. This reference needs to be released
+ * by the calling function once it has finished using the returned net
+ */
+
+static inline int
+kgnilnd_find_net(lnet_nid_t nid, kgn_net_t **netp)
+{
+       kgn_net_t *net;
+       int rc;
+
+       rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem);
+
+       if (!rc) {
+               return -ESHUTDOWN;
+       }
+
+       list_for_each_entry(net, kgnilnd_netnum2netlist(LNET_NETNUM(LNET_NIDNET(nid))), gnn_list) {
+               if (!net->gnn_shutdown && LNET_NIDNET(net->gnn_ni->ni_nid) == LNET_NIDNET(nid)) {
+                       kgnilnd_net_addref(net);
+                       up_read(&kgnilnd_data.kgn_net_rw_sem);
+                       *netp = net;
+                       return 0;
+               }
+       }
+
+       up_read(&kgnilnd_data.kgn_net_rw_sem);
+
+       return -ENONET;
+}
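+
+/* Sketch of the lookup/release pairing required above (hypothetical
+ * fragment; target_nid stands in for whatever NID the caller is sending
+ * to). */
+#if 0
+       kgn_net_t       *net;
+       int              rc;
+
+       rc = kgnilnd_find_net(target_nid, &net);
+       if (rc < 0)
+               return rc;
+       /* ... use net->gnn_ni, net->gnn_shutdown, etc ... */
+       kgnilnd_net_decref(net);
+#endif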
+
+#ifdef CONFIG_DEBUG_SLAB
+#define KGNILND_POISON(ptr, c, s) do {} while(0)
+#else
+#define KGNILND_POISON(ptr, c, s) memset(ptr, c, s)
+#endif
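+
+/* Hypothetical fragment: scribble over a structure before freeing it so any
+ * use-after-free is caught quickly; under CONFIG_DEBUG_SLAB the allocator
+ * presumably poisons freed objects itself, hence the no-op above. */
+#if 0
+       KGNILND_POISON(conn, 0x5a, sizeof(*conn));
+#endif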
+
+int kgnilnd_dev_init(kgn_device_t *dev);
+void kgnilnd_dev_fini(kgn_device_t *dev);
+int kgnilnd_startup(lnet_ni_t *ni);
+void kgnilnd_shutdown(lnet_ni_t *ni);
+int kgnilnd_base_startup(void);
+void kgnilnd_base_shutdown(void);
+
+int kgnilnd_allocate_phys_fmablk(kgn_device_t *device);
+int kgnilnd_map_phys_fmablk(kgn_device_t *device);
+void kgnilnd_unmap_phys_fmablk(kgn_device_t *device);
+void kgnilnd_free_phys_fmablk(kgn_device_t *device);
+
+int kgnilnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+void kgnilnd_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when);
+int kgnilnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int kgnilnd_eager_recv(lnet_ni_t *ni, void *private,
+                       lnet_msg_t *lntmsg, void **new_private);
+int kgnilnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+               int delayed, unsigned int niov,
+               struct iovec *iov, lnet_kiov_t *kiov,
+               unsigned int offset, unsigned int mlen, unsigned int rlen);
+
+__u16 kgnilnd_cksum_kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int offset, unsigned int nob, int dump_blob);
+
+/* purgatory functions */
+void kgnilnd_add_purgatory_locked(kgn_conn_t *conn, kgn_peer_t *peer);
+void kgnilnd_mark_for_detach_purgatory_all_locked(kgn_peer_t *peer);
+void kgnilnd_detach_purgatory_locked(kgn_conn_t *conn, struct list_head *conn_list);
+void kgnilnd_release_purgatory_list(struct list_head *conn_list);
+
+void kgnilnd_update_reaper_timeout(long timeout);
+void kgnilnd_unmap_buffer(kgn_tx_t *tx, int error);
+kgn_tx_t *kgnilnd_new_tx_msg(int type, lnet_nid_t source);
+void kgnilnd_tx_done(kgn_tx_t *tx, int completion);
+void kgnilnd_txlist_done(struct list_head *txlist, int error);
+void kgnilnd_unlink_peer_locked(kgn_peer_t *peer);
+void kgnilnd_schedule_conn(kgn_conn_t *conn);
+void kgnilnd_schedule_process_conn(kgn_conn_t *conn, int sched_intent);
+
+void kgnilnd_schedule_dgram(kgn_device_t *dev);
+int kgnilnd_create_peer_safe(kgn_peer_t **peerp, lnet_nid_t nid, kgn_net_t *net);
+void kgnilnd_add_peer_locked(lnet_nid_t nid, kgn_peer_t *new_stub_peer, kgn_peer_t **peerp);
+int kgnilnd_add_peer(kgn_net_t *net, lnet_nid_t nid, kgn_peer_t **peerp);
+
+kgn_peer_t *kgnilnd_find_peer_locked(lnet_nid_t nid);
+int kgnilnd_del_conn_or_peer(kgn_net_t *net, lnet_nid_t nid, int command, int error);
+void kgnilnd_peer_increase_reconnect_locked(kgn_peer_t *peer);
+void kgnilnd_queue_reply(kgn_conn_t *conn, kgn_tx_t *tx);
+void kgnilnd_queue_tx(kgn_conn_t *conn, kgn_tx_t *tx);
+void kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, lnet_process_id_t *target);
+int kgnilnd_send_mapped_tx(kgn_tx_t *tx, int try_map_if_full);
+void kgnilnd_consume_rx(kgn_rx_t *rx);
+
+void kgnilnd_schedule_device(kgn_device_t *dev);
+void kgnilnd_device_callback(__u32 devid, __u64 arg);
+void kgnilnd_schedule_device_timer(unsigned long arg);
+
+int kgnilnd_reaper(void *arg);
+int kgnilnd_scheduler(void *arg);
+int kgnilnd_dgram_mover(void *arg);
+
+int kgnilnd_create_conn(kgn_conn_t **connp, kgn_device_t *dev);
+int kgnilnd_conn_isdup_locked(kgn_peer_t *peer, kgn_conn_t *newconn);
+kgn_conn_t *kgnilnd_find_conn_locked(kgn_peer_t *peer);
+int kgnilnd_get_conn(kgn_conn_t **connp, kgn_peer_t *peer);
+kgn_conn_t *kgnilnd_find_or_create_conn_locked(kgn_peer_t *peer);
+void kgnilnd_peer_cancel_tx_queue(kgn_peer_t *peer);
+void kgnilnd_cancel_peer_connect_locked(kgn_peer_t *peer, struct list_head *zombies);
+int kgnilnd_close_stale_conns_locked(kgn_peer_t *peer, kgn_conn_t *newconn);
+void kgnilnd_peer_alive(kgn_peer_t *peer);
+void kgnilnd_peer_notify(kgn_peer_t *peer, int error);
+void kgnilnd_close_conn_locked(kgn_conn_t *conn, int error);
+void kgnilnd_close_conn(kgn_conn_t *conn, int error);
+void kgnilnd_complete_closed_conn(kgn_conn_t *conn);
+void kgnilnd_destroy_conn_ep(kgn_conn_t *conn);
+
+int kgnilnd_close_peer_conns_locked(kgn_peer_t *peer, int why);
+
+int kgnilnd_tunables_init(void);
+void kgnilnd_tunables_fini(void);
+void kgnilnd_init_msg(kgn_msg_t *msg, int type, lnet_nid_t source);
+
+void kgnilnd_bump_timeouts(__u32 nap_time, char *reason);
+void kgnilnd_pause_threads(void);
+int kgnilnd_hw_in_quiesce(void);
+int kgnilnd_check_hw_quiesce(void);
+void kgnilnd_quiesce_wait(char *reason);
+void kgnilnd_quiesce_end_callback(gni_nic_handle_t nic_handle, uint64_t msecs);
+int kgnilnd_ruhroh_thread(void *arg);
+void kgnilnd_reset_stack(void);
+void kgnilnd_critical_error(gni_err_handle_t err_handle);
+
+void kgnilnd_insert_sysctl(void);
+void kgnilnd_remove_sysctl(void);
+void kgnilnd_proc_init(void);
+void kgnilnd_proc_fini(void);
+
+/* gnilnd_conn.c */
+void kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold);
+
+int kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid);
+void kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram);
+void kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram);
+
+int kgnilnd_setup_wildcard_dgram(kgn_device_t *dev);
+int kgnilnd_cancel_net_dgrams(kgn_net_t *net);
+int kgnilnd_cancel_wc_dgrams(kgn_device_t *dev);
+void kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev);
+
+int kgnilnd_dgram_waitq(void *arg);
+
+int kgnilnd_set_conn_params(kgn_dgram_t *dgram);
+
+/* struct2str functions - we don't use a default: case, so the compiler will
+ * warn (and fail under -Werror) if a case is missing. This lets us hide these
+ * down here out of the way while still catching any updates to the enum/types
+ * above */
+
+#define DO_TYPE(x) case x: return #x;
+static inline const char *
+kgnilnd_fmablk_state2str(kgn_fmablk_state_t state)
+{
+       /* Only want single char string for this */
+       switch (state) {
+       case GNILND_FMABLK_IDLE:
+               return "I";
+       case GNILND_FMABLK_PHYS:
+               return "P";
+       case GNILND_FMABLK_VIRT:
+               return "V";
+       case GNILND_FMABLK_FREED:
+               return "F";
+       }
+       return "<unknown state>";
+}
+
+static inline const char *
+kgnilnd_msgtype2str(int type)
+{
+       switch (type) {
+               DO_TYPE(GNILND_MSG_NONE);
+               DO_TYPE(GNILND_MSG_NOOP);
+               DO_TYPE(GNILND_MSG_IMMEDIATE);
+               DO_TYPE(GNILND_MSG_PUT_REQ);
+               DO_TYPE(GNILND_MSG_PUT_NAK);
+               DO_TYPE(GNILND_MSG_PUT_ACK);
+               DO_TYPE(GNILND_MSG_PUT_DONE);
+               DO_TYPE(GNILND_MSG_GET_REQ);
+               DO_TYPE(GNILND_MSG_GET_NAK);
+               DO_TYPE(GNILND_MSG_GET_DONE);
+               DO_TYPE(GNILND_MSG_CLOSE);
+       }
+       return "<unknown msg type>";
+}
+
+static inline const char *
+kgnilnd_tx_state2str(kgn_tx_list_state_t state)
+{
+       switch (state) {
+               DO_TYPE(GNILND_TX_IDLE);
+               DO_TYPE(GNILND_TX_ALLOCD);
+               DO_TYPE(GNILND_TX_PEERQ);
+               DO_TYPE(GNILND_TX_MAPQ);
+               DO_TYPE(GNILND_TX_FMAQ);
+               DO_TYPE(GNILND_TX_LIVE_FMAQ);
+               DO_TYPE(GNILND_TX_RDMAQ);
+               DO_TYPE(GNILND_TX_LIVE_RDMAQ);
+               DO_TYPE(GNILND_TX_DYING);
+               DO_TYPE(GNILND_TX_FREED);
+       }
+       return "<unknown state>";
+}
+
+static inline const char *
+kgnilnd_conn_state2str(kgn_conn_t *conn)
+{
+       kgn_conn_state_t state = conn->gnc_state;
+       switch (state) {
+               DO_TYPE(GNILND_CONN_DUMMY);
+               DO_TYPE(GNILND_CONN_LISTEN);
+               DO_TYPE(GNILND_CONN_CONNECTING);
+               DO_TYPE(GNILND_CONN_ESTABLISHED);
+               DO_TYPE(GNILND_CONN_CLOSING);
+               DO_TYPE(GNILND_CONN_CLOSED);
+               DO_TYPE(GNILND_CONN_DONE);
+               DO_TYPE(GNILND_CONN_DESTROY_EP);
+       }
+       return "<?state?>";
+}
+
+static inline const char *
+kgnilnd_connreq_type2str(kgn_connreq_t *connreq)
+{
+       kgn_connreq_type_t type = connreq->gncr_type;
+
+       switch (type) {
+               DO_TYPE(GNILND_CONNREQ_REQ);
+               DO_TYPE(GNILND_CONNREQ_NAK);
+               DO_TYPE(GNILND_CONNREQ_CLOSE);
+       }
+       return "<?type?>";
+}
+
+static inline const char *
+kgnilnd_dgram_state2str(kgn_dgram_t *dgram)
+{
+       kgn_dgram_state_t state = dgram->gndg_state;
+
+       switch (state) {
+               DO_TYPE(GNILND_DGRAM_USED);
+               DO_TYPE(GNILND_DGRAM_POSTING);
+               DO_TYPE(GNILND_DGRAM_POSTED);
+               DO_TYPE(GNILND_DGRAM_PROCESSING);
+               DO_TYPE(GNILND_DGRAM_DONE);
+               DO_TYPE(GNILND_DGRAM_CANCELED);
+       }
+       return "<?state?>";
+}
+
+static inline const char *
+kgnilnd_dgram_type2str(kgn_dgram_t *dgram)
+{
+       kgn_dgram_type_t type = dgram->gndg_type;
+
+       switch (type) {
+               DO_TYPE(GNILND_DGRAM_REQ);
+               DO_TYPE(GNILND_DGRAM_WC_REQ);
+               DO_TYPE(GNILND_DGRAM_NAK);
+               DO_TYPE(GNILND_DGRAM_CLOSE);
+       }
+       return "<?type?>";
+}
+
+
+#undef DO_TYPE
+
+/* API wrapper functions - include late to pick up all of the other defines */
+#include "gnilnd_api_wrap.h"
+
+#endif /* _GNILND_GNILND_H_ */
diff --git a/lnet/klnds/gnilnd/gnilnd_api_wrap.h b/lnet/klnds/gnilnd/gnilnd_api_wrap.h
new file mode 100644 (file)
index 0000000..e7ba9ab
--- /dev/null
@@ -0,0 +1,1505 @@
+/*
+ * Copyright (C) 2009-2012 Cray, Inc.
+ *
+ *   Author: Nic Henke <nic@cray.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#ifndef _GNILND_API_WRAP_H
+#define _GNILND_API_WRAP_H
+
+/* LNet is allocated failure locations 0xe000 to 0xffff */
+
+/* GNILND has 0xf0XX */
+#define CFS_FAIL_GNI                   0xf000
+#define CFS_FAIL_GNI_PHYS_MAP          0xf001
+#define CFS_FAIL_GNI_VIRT_MAP          0xf002
+#define CFS_FAIL_GNI_GET_UNMAP         0xf003
+#define CFS_FAIL_GNI_PUT_UNMAP         0xf004
+#define CFS_FAIL_GNI_MAP_TX            0xf005
+#define CFS_FAIL_GNI_SMSG_SEND         0xf006
+#define CFS_FAIL_GNI_CLOSE_SEND                0xf007
+#define CFS_FAIL_GNI_CDM_CREATE                0xf008
+#define CFS_FAIL_GNI_CDM_DESTROY       0xf009
+#define CFS_FAIL_GNI_CDM_ATTACH                0xf00a
+#define CFS_FAIL_GNI_CQ_CREATE         0xf00b
+#define CFS_FAIL_GNI_CQ_DESTROY                0xf00c
+#define CFS_FAIL_GNI_EP_BIND           0xf00d
+#define CFS_FAIL_GNI_EP_UNBIND         0xf00e
+#define CFS_FAIL_GNI_EP_SET_EVDATA     0xf00f
+#define CFS_FAIL_GNI_SMSG_INIT         0xf010
+#define CFS_FAIL_GNI_SMSG_RELEASE      0xf011
+#define CFS_FAIL_GNI_POST_RDMA         0xf012
+#define CFS_FAIL_GNI_GET_COMPLETED     0xf013
+#define CFS_FAIL_GNI_EP_DESTROY                0xf015
+#define CFS_FAIL_GNI_VIRT_UNMAP                0xf016
+#define CFS_FAIL_GNI_MDD_RELEASE       0xf017
+#define CFS_FAIL_GNI_NOOP_SEND         0xf018
+#define CFS_FAIL_GNI_ERR_SUBSCRIBE     0xf01a
+#define CFS_FAIL_GNI_QUIESCE_RACE      0xf01b
+#define CFS_FAIL_GNI_DG_TERMINATE      0xf01c
+#define CFS_FAIL_GNI_REG_QUIESCE       0xf01d
+#define CFS_FAIL_GNI_IN_QUIESCE                0xf01e
+#define CFS_FAIL_GNI_DELAY_RDMA                0xf01f
+#define CFS_FAIL_GNI_SR_DOWN_RACE      0xf020
+#define CFS_FAIL_GNI_ALLOC_TX          0xf021
+#define CFS_FAIL_GNI_FMABLK_AVAIL      0xf022
+#define CFS_FAIL_GNI_EP_CREATE         0xf023
+#define CFS_FAIL_GNI_CQ_GET_EVENT      0xf024
+#define CFS_FAIL_GNI_PROBE             0xf025
+#define CFS_FAIL_GNI_EP_TEST           0xf026
+#define CFS_FAIL_GNI_CONNREQ_DROP      0xf027
+#define CFS_FAIL_GNI_CONNREQ_PROTO     0xf028
+#define CFS_FAIL_GNI_CONND_PILEUP      0xf029
+#define CFS_FAIL_GNI_PHYS_SETUP                0xf02a
+#define CFS_FAIL_GNI_FIND_TARGET       0xf02b
+#define CFS_FAIL_GNI_WC_DGRAM_FREE     0xf02c
+#define CFS_FAIL_GNI_DROP_CLOSING      0xf02d
+#define CFS_FAIL_GNI_RX_CLOSE_CLOSING  0xf02e
+#define CFS_FAIL_GNI_RX_CLOSE_CLOSED   0xf02f
+#define CFS_FAIL_GNI_EP_POST           0xf030
+#define CFS_FAIL_GNI_PACK_SRCNID       0xf031
+#define CFS_FAIL_GNI_PACK_DSTNID       0xf032
+#define CFS_FAIL_GNI_PROBE_WAIT                0xf033
+#define CFS_FAIL_GNI_SMSG_CKSUM1       0xf034
+#define CFS_FAIL_GNI_SMSG_CKSUM2       0xf035
+#define CFS_FAIL_GNI_SMSG_CKSUM3       0xf036
+#define CFS_FAIL_GNI_DROP_DESTROY_EP   0xf037
+#define CFS_FAIL_GNI_SMSG_GETNEXT      0xf038
+#define CFS_FAIL_GNI_FINISH_PURG       0xf039
+#define CFS_FAIL_GNI_PURG_REL_DELAY    0xf03a
+#define CFS_FAIL_GNI_DONT_NOTIFY       0xf03b
+#define CFS_FAIL_GNI_VIRT_SMALL_MAP    0xf03c
+#define CFS_FAIL_GNI_DELAY_RDMAQ       0xf03d
+#define CFS_FAIL_GNI_PAUSE_SHUTDOWN    0xf03e
+#define CFS_FAIL_GNI_PAUSE_DGRAM_COMP  0xf03f
+#define CFS_FAIL_GNI_NET_LOOKUP                0xf040
+#define CFS_FAIL_GNI_RECV_TIMEOUT      0xf041
+#define CFS_FAIL_GNI_SEND_TIMEOUT      0xf042
+#define CFS_FAIL_GNI_ONLY_NOOP         0xf043
+#define CFS_FAIL_GNI_FINISH_PURG2      0xf044
+#define CFS_FAIL_GNI_RACE_RESET                0xf045
+#define CFS_FAIL_GNI_GNP_CONNECTING1   0xf046
+#define CFS_FAIL_GNI_GNP_CONNECTING2   0xf047
+#define CFS_FAIL_GNI_GNP_CONNECTING3   0xf048
+#define CFS_FAIL_GNI_PUT_ACK_AGAIN     0xf050
+#define CFS_FAIL_GNI_GET_REQ_AGAIN     0xf051
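+
+/* These values are matched against cfs_fail_loc by the CFS_FAIL_CHECK()
+ * calls in the wrappers below; for example (illustrative), setting fail_loc
+ * to 0xf006 - typically via "lctl set_param fail_loc=0xf006" - makes
+ * kgnilnd_smsg_send() return an injected failure instead of calling
+ * gni_smsg_send(). */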
+
+/* helper macros */
+extern void
+_kgnilnd_api_rc_lbug(const char *rcstr, int rc, struct libcfs_debug_msg_data *data,
+                       const char *fmt, ...)
+       __attribute__ ((format (printf, 4, 5)));
+
+#define kgnilnd_api_rc_lbug(msgdata, rc, fmt, a...)                            \
+do {                                                                           \
+       CFS_CHECK_STACK(msgdata, D_ERROR, NULL);                                \
+       /* we don't mask this - it is always at D_ERROR */                      \
+       _kgnilnd_api_rc_lbug(kgnilnd_api_rc2str(rc), (rc), msgdata, fmt, ##a);  \
+} while (0)
+
+#define DO_RETCODE(x) case x: return #x;
+static inline const char *
+kgnilnd_api_rc2str(gni_return_t rrc)
+{
+       switch (rrc) {
+               DO_RETCODE(GNI_RC_SUCCESS);
+               DO_RETCODE(GNI_RC_NOT_DONE);
+               DO_RETCODE(GNI_RC_INVALID_PARAM);
+               DO_RETCODE(GNI_RC_ERROR_RESOURCE);
+               DO_RETCODE(GNI_RC_TIMEOUT);
+               DO_RETCODE(GNI_RC_PERMISSION_ERROR);
+               DO_RETCODE(GNI_RC_DESCRIPTOR_ERROR);
+               DO_RETCODE(GNI_RC_ALIGNMENT_ERROR);
+               DO_RETCODE(GNI_RC_INVALID_STATE);
+               DO_RETCODE(GNI_RC_NO_MATCH);
+               DO_RETCODE(GNI_RC_SIZE_ERROR);
+               DO_RETCODE(GNI_RC_TRANSACTION_ERROR);
+               DO_RETCODE(GNI_RC_ILLEGAL_OP);
+               DO_RETCODE(GNI_RC_ERROR_NOMEM);
+       }
+       LBUG();
+}
+#undef DO_RETCODE
+
+/* log an error and LBUG for unhandled rc from gni api function
+ * the fmt should be something like:
+ *  gni_api_call(arg1, arg2, arg3)
+ */
+
+/* apick_fn and apick_fmt should be defined for each site */
+#undef apick_fn
+#undef apick_fmt
+
+#define GNILND_API_RC_LBUG(args...)                                            \
+do {                                                                           \
+       LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL);                     \
+       kgnilnd_api_rc_lbug(&msgdata, rrc, apick_fn"("apick_fmt")", ##args);    \
+} while (0)
+
+#define GNILND_API_SWBUG(args...)                                               \
+do {                                                                            \
+       CERROR("likely SOFTWARE BUG "apick_fn"("apick_fmt") rc %s\n",           \
+                ##args, kgnilnd_api_rc2str(rrc));                              \
+} while (0)
+
+#define GNILND_API_EINVAL(args...)                                              \
+do {                                                                            \
+       CERROR("invalid parameter to "apick_fn"("apick_fmt") rc %s\n",          \
+                ##args, kgnilnd_api_rc2str(rrc));                              \
+} while (0)
+
+#define GNILND_API_RESOURCE(args...)                                            \
+do {                                                                            \
+       CERROR("no resources for "apick_fn"("apick_fmt") rc %s\n",              \
+               ##args, kgnilnd_api_rc2str(rrc));                               \
+} while (0)
+
+#define GNILND_API_BUSY(args...)                                                \
+do {                                                                            \
+       CERROR("resources busy for "apick_fn"("apick_fmt") rc %s\n",            \
+               ##args, kgnilnd_api_rc2str(rrc));                               \
+} while (0)
+
+#undef DEBUG_SMSG_CREDITS
+#ifdef DEBUG_SMSG_CREDITS
+#define CRAY_CONFIG_GHAL_GEMINI
+#include <gni_priv.h>
+#define GNIDBG_SMSG_CREDS(level, conn)                                        \
+do {                                                                          \
+       gni_ep_smsg_mbox_t *smsg = conn->gnc_ephandle->smsg;                  \
+       CDEBUG(level, "SMSGDBG: conn %p mcred %d/%d bcred %d/%d "             \
+               "s_seq %d/%d/%d r_seq %d/%d/%d retr %d\n",                    \
+               conn, smsg->mbox_credits, smsg->back_mbox_credits,            \
+               smsg->buffer_credits, smsg->back_buffer_credits,              \
+               smsg->s_seqno, smsg->s_seqno_back_mbox_credits,               \
+               smsg->s_seqno_back_buffer_credits, smsg->r_seqno,             \
+               smsg->r_seqno_back_mbox_credits,                              \
+               smsg->r_seqno_back_buffer_credits, smsg->retransmit_count);   \
+} while (0)
+#else
+#define GNIDBG_SMSG_CREDS(level, conn) do {} while(0)
+#endif
+
+/* these are all wrappers around gni_XXX functions.
+ * This allows us to handle all the return codes and api checks without
+ * dirtying up the logic code */
+
+/* TODO: RETURN wrapper that translates integer to GNI API RC string */
+
+#define apick_fn "kgnilnd_cdm_create"
+#define apick_fmt "%u, %u, %u, %u, 0x%p"
+static inline gni_return_t kgnilnd_cdm_create(
+               IN uint32_t             inst_id,
+               IN uint8_t              ptag,
+               IN uint32_t             cookie,
+               IN uint32_t             modes,
+               OUT gni_cdm_handle_t    *cdm_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CDM_CREATE)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_cdm_create(inst_id, ptag, cookie, modes, cdm_hndl);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_ERROR_RESOURCE:
+       case GNI_RC_INVALID_PARAM:
+               /* Try to bail gracefully */
+               GNILND_API_SWBUG(
+                       inst_id, ptag, cookie, modes, cdm_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       inst_id, ptag, cookie, modes, cdm_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+
+#undef apick_fn
+#undef apick_fmt
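+
+/* Caller-side sketch (hypothetical fragment; the device/tunable field names
+ * are illustrative): with the return-code policy centralized in the wrapper,
+ * the logic code only has to distinguish success from failure. */
+#if 0
+       rrc = kgnilnd_cdm_create(dev->gnd_id, ptag, cookie, modes,
+                                &dev->gnd_domain);
+       if (rrc != GNI_RC_SUCCESS) {
+               CERROR("Can't create CDM: %s\n", kgnilnd_api_rc2str(rrc));
+               return -ENODEV;
+       }
+#endif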
+
+#define apick_fn "kgnilnd_cdm_attach"
+#define apick_fmt "0x%p, %u, 0x%p, 0x%p"
+static inline gni_return_t kgnilnd_cdm_attach(
+               IN gni_cdm_handle_t     cdm_hndl,
+               IN uint32_t             device_id,
+               OUT uint32_t            *local_addr,
+               OUT gni_nic_handle_t    *nic_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CDM_ATTACH)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_cdm_attach(cdm_hndl, device_id, local_addr, nic_hndl);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_NO_MATCH:
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       cdm_hndl, device_id, local_addr, nic_hndl);
+               break;
+       case GNI_RC_ERROR_RESOURCE:
+       case GNI_RC_INVALID_STATE:
+               GNILND_API_RESOURCE(
+                       cdm_hndl, device_id, local_addr, nic_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       cdm_hndl, device_id, local_addr, nic_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fmt
+#undef apick_fn
+
+#define apick_fn "kgnilnd_cdm_destroy"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_cdm_destroy(
+               IN gni_cdm_handle_t     cdm_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CDM_DESTROY)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_cdm_destroy(
+                       cdm_hndl);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       cdm_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       cdm_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_subscribe_errors"
+#define apick_fmt "0x%p,%x,%u,0x%p,0x%p,0x%p"
+static inline gni_return_t kgnilnd_subscribe_errors(
+               IN gni_nic_handle_t  nic_handle,
+               IN gni_error_mask_t  mask,
+               IN uint32_t          EEQ_size,
+               IN void              (*EQ_new_event)(gni_err_handle_t),
+               IN void              (*app_crit_err)(gni_err_handle_t),
+               OUT gni_err_handle_t *err_handle
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_ERR_SUBSCRIBE)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_subscribe_errors(
+                       nic_handle, mask, EEQ_size, EQ_new_event, app_crit_err,
+                       err_handle);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       nic_handle, mask, EEQ_size, EQ_new_event, app_crit_err,
+                       err_handle);
+               break;
+       case GNI_RC_ERROR_RESOURCE:
+               GNILND_API_RESOURCE(
+                       nic_handle, mask, EEQ_size, EQ_new_event, app_crit_err,
+                       err_handle);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_handle, mask, EEQ_size, EQ_new_event, app_crit_err,
+                       err_handle);
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_release_errors"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_release_errors(
+               IN gni_err_handle_t err_handle
+               )
+{
+       gni_return_t rrc;
+
+       rrc = gni_release_errors(
+                       err_handle);
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_PARAM:
+       case GNI_RC_NOT_DONE:
+               GNILND_API_SWBUG(
+                       err_handle);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       err_handle);
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_set_quiesce_callback"
+#define apick_fmt "0x%p,0x%p"
+static inline gni_return_t kgnilnd_set_quiesce_callback(
+               IN gni_nic_handle_t  nic_handle,
+               IN void              (*qsce_func)(gni_nic_handle_t, uint64_t msecs)
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_REG_QUIESCE)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_set_quiesce_callback(
+                       nic_handle, qsce_func);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_STATE:
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       nic_handle, qsce_func);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_handle, qsce_func);
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_get_quiesce_status"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_get_quiesce_status(
+               IN gni_nic_handle_t  nic_handle
+               )
+{
+       uint32_t rrc;
+
+       /* this has weird RC -
+        * 0 - quiesce not in progress
+        * 1 - quiesce is turned on
+       */
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_IN_QUIESCE)) {
+               rrc = 1;
+       } else {
+               rrc = gni_get_quiesce_status(
+                       nic_handle);
+       }
+
+       switch (rrc)  {
+       case 1:
+       case 0:
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_handle);
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
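+
+/* Hypothetical fragment: because the helper above returns 0/1 rather than a
+ * normal gni_return_t, callers can use it directly as a boolean "in quiesce?"
+ * test (the nic handle name is illustrative). */
+#if 0
+       if (kgnilnd_get_quiesce_status(nic_handle))
+               kgnilnd_pause_threads();
+#endif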
+
+#define apick_fn "kgnilnd_cq_create"
+#define apick_fmt "0x%p, %u, %u, 0x%p, "LPX64", 0x%p"
+static inline gni_return_t kgnilnd_cq_create(
+               IN gni_nic_handle_t     nic_hndl,
+               IN uint32_t             entry_count,
+               IN uint32_t             delay_index,
+               IN gni_cq_event_hndlr_f *event_handler,
+               IN uint64_t             usr_event_data,
+               OUT gni_cq_handle_t     *cq_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CQ_CREATE)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_cq_create(
+                      nic_hndl, entry_count, delay_index, event_handler,
+                       usr_event_data, cq_hndl);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       nic_hndl, entry_count, delay_index, event_handler,
+                       usr_event_data, cq_hndl);
+               break;
+       case GNI_RC_ERROR_RESOURCE:
+               GNILND_API_RESOURCE(
+                       nic_hndl, entry_count, delay_index, event_handler,
+                       usr_event_data, cq_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_hndl, entry_count, delay_index, event_handler,
+                       usr_event_data, cq_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_cq_destroy"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_cq_destroy(
+               IN gni_cq_handle_t cq_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CQ_DESTROY)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+
+               rrc = gni_cq_destroy(
+                       cq_hndl);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       cq_hndl);
+               break;
+       case GNI_RC_ERROR_RESOURCE:
+               GNILND_API_BUSY(
+                       cq_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       cq_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_cq_get_event"
+#define apick_fmt "0x%p, 0x%p"
+static inline gni_return_t kgnilnd_cq_get_event(
+               IN gni_cq_handle_t cq_hndl,
+               OUT gni_cq_entry_t *event_data
+               )
+{
+       gni_return_t rrc;
+
+       /* no error injection - CQs are touchy about the data.
+        * where appropriate, we'll do this on the CQs that should be able to
+        * handle the various errors */
+       rrc = gni_cq_get_event(
+                       cq_hndl, event_data);
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+       case GNI_RC_NOT_DONE:
+       case GNI_RC_TRANSACTION_ERROR:
+               break;
+       case GNI_RC_ERROR_RESOURCE:
+               LASSERTF(GNI_CQ_OVERRUN(*event_data),
+                        "kgni returned ERROR_RESOURCE but cq_hndl 0x%p is not "
+                        "overrun\n", cq_hndl);
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       cq_hndl, event_data);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       cq_hndl, event_data);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       return rrc;
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_smsg_init"
+#define apick_fmt "0x%p, 0x%p, 0x%p"
+static inline gni_return_t kgnilnd_smsg_init(
+               IN gni_ep_handle_t      ep_hndl,
+               IN gni_smsg_attr_t      *local_smsg_attr,
+               IN gni_smsg_attr_t      *remote_smsg_attr
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_INIT)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_ERROR_RESOURCE;
+       } else {
+               rrc = gni_smsg_init(
+                       ep_hndl, local_smsg_attr, remote_smsg_attr);
+       }
+
+       switch (rrc)  {
+       /* both of these are OK, upper SW needs to handle */
+       case GNI_RC_SUCCESS:
+       case GNI_RC_NOT_DONE:
+               break;
+       case GNI_RC_INVALID_PARAM:
+       case GNI_RC_INVALID_STATE:
+               GNILND_API_SWBUG(
+                       ep_hndl, local_smsg_attr, remote_smsg_attr);
+               break;
+       case GNI_RC_ERROR_RESOURCE:
+               GNILND_API_RESOURCE(
+                       ep_hndl, local_smsg_attr, remote_smsg_attr);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl, local_smsg_attr, remote_smsg_attr);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_smsg_send"
+#define apick_fmt "0x%p, 0x%p, %u, 0x%p, %u, %u"
+static inline gni_return_t kgnilnd_smsg_send(
+               IN gni_ep_handle_t      ep_hndl,
+               IN void                 *header,
+               IN uint32_t             header_length,
+               IN void                 *data,
+               IN uint32_t             data_length,
+               IN uint32_t             msg_id
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_SEND)) {
+               if (cfs_fail_loc & CFS_FAIL_RAND) {
+                       rrc = GNI_RC_NOT_DONE;
+               } else {
+                       rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE;
+               }
+       } else {
+               rrc = gni_smsg_send(
+                       ep_hndl, header, header_length, data, data_length, msg_id);
+       }
+
+       switch (rrc)  {
+       /* both of these are OK, upper SW needs to handle */
+       case GNI_RC_SUCCESS:
+       case GNI_RC_NOT_DONE:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       ep_hndl, header, header_length, data, data_length, msg_id);
+               break;
+       case GNI_RC_ERROR_RESOURCE:
+               GNILND_API_RESOURCE(
+                       ep_hndl, header, header_length, data, data_length, msg_id);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl, header, header_length, data, data_length, msg_id);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_smsg_getnext"
+#define apick_fmt "0x%p,0x%p"
+static inline gni_return_t kgnilnd_smsg_getnext(
+               IN gni_ep_handle_t      ep_hndl,
+               OUT void                **header
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_GETNEXT)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE;
+       } else {
+               rrc = gni_smsg_getnext(
+                       ep_hndl, header);
+       }
+
+       switch (rrc)  {
+       /* all of these are OK, upper SW needs to handle */
+       case GNI_RC_SUCCESS:
+       case GNI_RC_NOT_DONE:
+       case GNI_RC_INVALID_STATE:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       ep_hndl, header);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl, header);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_smsg_release"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_smsg_release(
+               IN gni_ep_handle_t      ep_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_RELEASE)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_smsg_release(
+                       ep_hndl);
+       }
+
+       switch (rrc)  {
+       /* both of these are OK, upper SW needs to handle */
+       case GNI_RC_SUCCESS:
+       case GNI_RC_NOT_DONE:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       ep_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_create"
+#define apick_fmt "0x%p, 0x%p, 0x%p"
+static inline gni_return_t kgnilnd_ep_create(
+               IN gni_nic_handle_t     nic_hndl,
+               IN gni_cq_handle_t      src_cq_hndl,
+               OUT gni_ep_handle_t     *ep_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_CREATE)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_ERROR_NOMEM;
+       } else {
+               rrc = gni_ep_create(
+                       nic_hndl, src_cq_hndl, ep_hndl);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       nic_hndl, src_cq_hndl, ep_hndl);
+               break;
+       case GNI_RC_ERROR_NOMEM:
+               GNILND_API_RESOURCE(
+                       nic_hndl, src_cq_hndl, ep_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_hndl, src_cq_hndl, ep_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_bind"
+#define apick_fmt "0x%p, %x, %x"
+static inline gni_return_t kgnilnd_ep_bind(
+               IN gni_ep_handle_t      ep_hndl,
+               IN uint32_t             remote_addr,
+               IN uint32_t             remote_id
+               )
+{
+       gni_return_t rrc;
+
+       /* error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_BIND)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE;
+       } else {
+               rrc = gni_ep_bind(
+                       ep_hndl, remote_addr, remote_id);
+       }
+
+       switch (rrc)  {
+       /* both of these are OK, upper SW needs to handle */
+       case GNI_RC_SUCCESS:
+       case GNI_RC_NOT_DONE:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       ep_hndl, remote_addr, remote_id);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl, remote_addr, remote_id);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_set_eventdata"
+#define apick_fmt "0x%p, %x, %x"
+static inline gni_return_t kgnilnd_ep_set_eventdata(
+               IN gni_ep_handle_t      ep_hndl,
+               IN uint32_t             local_event,
+               IN uint32_t             remote_event
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_SET_EVDATA)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_ep_set_eventdata(
+                       ep_hndl, local_event, remote_event);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       ep_hndl, local_event, remote_event);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl, local_event, remote_event);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_unbind"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_ep_unbind(
+               IN gni_ep_handle_t      ep_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_UNBIND)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE;
+       } else {
+               rrc = gni_ep_unbind(
+                       ep_hndl);
+       }
+
+       switch (rrc)  {
+       /* both of these are OK, upper SW needs to handle */
+       case GNI_RC_NOT_DONE:
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       ep_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_destroy"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_ep_destroy(
+               IN gni_ep_handle_t      ep_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_DESTROY)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE;
+       } else {
+               rrc = gni_ep_destroy(
+                       ep_hndl);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       ep_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_postdata_w_id"
+#define apick_fmt "0x%p, 0x%p, %d, 0x%p, %d, "LPU64""
+static inline gni_return_t kgnilnd_ep_postdata_w_id(
+               IN gni_ep_handle_t ep_hndl,
+               IN void            *in_data,
+               IN uint16_t        data_len,
+               IN void            *out_buf,
+               IN uint16_t        buf_size,
+               IN uint64_t        datagram_id
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_POST)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_SIZE_ERROR;
+       } else {
+               rrc = gni_ep_postdata_w_id(
+                       ep_hndl, in_data, data_len, out_buf, buf_size,
+                       datagram_id);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+       case GNI_RC_ERROR_NOMEM:
+       case GNI_RC_ERROR_RESOURCE:
+               break;
+       case GNI_RC_INVALID_PARAM:
+       case GNI_RC_SIZE_ERROR:
+               GNILND_API_SWBUG(
+                       ep_hndl, in_data, data_len, out_buf, buf_size,
+                       datagram_id);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl, in_data, data_len, out_buf, buf_size,
+                       datagram_id);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_postdata_test_by_id"
+#define apick_fmt "0x%p, "LPU64", 0x%p, 0x%p, 0x%p"
+static inline gni_return_t kgnilnd_ep_postdata_test_by_id(
+               IN gni_ep_handle_t      ep_hndl,
+               IN uint64_t             datagram_id,
+               OUT gni_post_state_t    *post_state,
+               OUT uint32_t            *remote_addr,
+               OUT uint32_t            *remote_id
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_TEST)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_ERROR_NOMEM;
+       } else {
+               rrc = gni_ep_postdata_test_by_id(
+                       ep_hndl, datagram_id, post_state, remote_addr,
+                       remote_id);
+
+               /* we want to lie, but we need to do the actual work first
+                * so we don't keep getting the event saying a dgram is ready */
+               if (rrc == GNI_RC_SUCCESS && CFS_FAIL_CHECK(CFS_FAIL_GNI_DG_TERMINATE)) {
+                       /* don't use fail_val, allows us to do FAIL_SOME */
+                       *post_state = GNI_POST_TERMINATED;
+               }
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+       case GNI_RC_NO_MATCH:
+               break;
+       case GNI_RC_SIZE_ERROR:
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       ep_hndl, datagram_id, post_state, remote_addr,
+                       remote_id);
+               break;
+       case GNI_RC_ERROR_NOMEM:
+               GNILND_API_RESOURCE(
+                       ep_hndl, datagram_id, post_state, remote_addr,
+                       remote_id);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl, datagram_id, post_state, remote_addr,
+                       remote_id);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_postdata_cancel_by_id"
+#define apick_fmt "0x%p, "LPU64""
+static inline gni_return_t kgnilnd_ep_postdata_cancel_by_id(
+               IN gni_ep_handle_t      ep_hndl,
+               IN uint64_t             datagram_id
+               )
+{
+       gni_return_t rrc;
+
+       /* no error injection as the only thing we'd do is LBUG */
+
+       rrc = gni_ep_postdata_cancel_by_id(
+               ep_hndl, datagram_id);
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+       case GNI_RC_NO_MATCH:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       ep_hndl, datagram_id);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl, datagram_id);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_postdata_probe_by_id"
+#define apick_fmt "0x%p, 0x%p"
+static inline gni_return_t kgnilnd_postdata_probe_by_id(
+               IN gni_nic_handle_t    nic_hndl,
+               OUT uint64_t          *datagram_id
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PROBE)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NO_MATCH;
+       } else {
+               rrc = gni_postdata_probe_by_id(
+                       nic_hndl, datagram_id);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+       case GNI_RC_NO_MATCH:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       nic_hndl, datagram_id);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_hndl, datagram_id);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_postdata_probe_wait_by_id"
+#define apick_fmt "0x%p, %d, 0x%p"
+static inline gni_return_t kgnilnd_postdata_probe_wait_by_id(
+               IN gni_nic_handle_t nic_hndl,
+               IN uint32_t         timeout,
+               OUT uint64_t        *datagram_id
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PROBE_WAIT)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_TIMEOUT;
+       } else {
+               rrc = gni_postdata_probe_wait_by_id(
+                       nic_hndl, timeout, datagram_id);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+       case GNI_RC_TIMEOUT:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       nic_hndl, timeout, datagram_id);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_hndl, timeout, datagram_id);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_post_rdma"
+#define apick_fmt "0x%p, 0x%p"
+static inline gni_return_t kgnilnd_post_rdma(
+               IN gni_ep_handle_t               ep_hndl,
+               IN gni_post_descriptor_t        *post_descr
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_POST_RDMA)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_post_rdma(
+                       ep_hndl, post_descr);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_ALIGNMENT_ERROR:
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       ep_hndl, post_descr);
+               break;
+       case GNI_RC_ERROR_RESOURCE:
+               GNILND_API_RESOURCE(
+                       ep_hndl, post_descr);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl, post_descr);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_get_completed"
+#define apick_fmt "0x%p,"LPX64",0x%p"
+static inline gni_return_t kgnilnd_get_completed(
+               IN gni_cq_handle_t              cq_hndl,
+               IN gni_cq_entry_t               event_data,
+               OUT gni_post_descriptor_t       **post_descr
+               )
+{
+       gni_return_t rrc;
+
+       rrc = gni_get_completed(cq_hndl, event_data, post_descr);
+
+       switch (rrc)  {
+       case GNI_RC_TRANSACTION_ERROR:
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_DESCRIPTOR_ERROR:
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(cq_hndl, event_data, post_descr);
+               break;
+       default:
+               GNILND_API_RC_LBUG(cq_hndl, event_data, post_descr);
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+
+       /* Error injection - we need a valid desc, so let kgni give us one
+        * - then we lie  */
+       if (rrc == GNI_RC_SUCCESS &&
+           (CFS_FAIL_CHECK(CFS_FAIL_GNI_GET_COMPLETED))) {
+               /* We only trigger TRANSACTION_ERROR for now */
+               gni_post_descriptor_t *desc;
+               rrc = GNI_RC_TRANSACTION_ERROR;
+               desc = *post_descr;
+               desc->status = rrc;
+               /* recoverable decision made from cfs_fail_val in
+                *  kgnilnd_cq_error_str and
+                *  kgnilnd_cq_error_recoverable */
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_cq_error_str"
+#define apick_fmt LPX64",0x%p,%d"
+static inline gni_return_t kgnilnd_cq_error_str(
+               IN gni_cq_entry_t       entry,
+               IN void                *buffer,
+               IN uint32_t             len
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection - set string if we injected a
+        *  TRANSACTION_ERROR earlier */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GET_COMPLETED)) {
+               /* if we just set persistent error, we can't ever
+                * break in via ssh to clear, so use a count > 10 to indicate fatal */
+               sprintf(buffer, "INJECT:%s", cfs_fail_val > 10 ?
+                       "FATAL" : "RECOVERABLE");
+               rrc = GNI_RC_SUCCESS;
+       } else {
+               rrc = gni_cq_error_str(
+                       entry, buffer, len);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_SIZE_ERROR:
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       entry, buffer, len);
+               /* give them something to use */
+               snprintf(buffer, len, "UNDEF:UNDEF");
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       entry, buffer, len);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_cq_error_recoverable"
+#define apick_fmt LPX64",0x%p"
+static inline gni_return_t kgnilnd_cq_error_recoverable(
+               IN gni_cq_entry_t       entry,
+               IN uint32_t            *recoverable
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection - set recoverable if we injected a
+        *  TRANSACTION_ERROR earlier */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GET_COMPLETED)) {
+               *recoverable = cfs_fail_val > 10 ? 0 : 1;
+               rrc = GNI_RC_SUCCESS;
+       } else {
+               rrc = gni_cq_error_recoverable(
+                       entry, recoverable);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_STATE:
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       entry, recoverable);
+               *recoverable = 0;
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       entry, recoverable);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_mem_register_segments"
+#define apick_fmt "0x%p,0x%p,%u,0x%p,%x,0x%p"
+static inline gni_return_t
+kgnilnd_mem_register_segments(
+               IN gni_nic_handle_t     nic_hndl,
+               IN gni_mem_segment_t    *mem_segments,
+               IN uint32_t             segments_cnt,
+               IN gni_cq_handle_t      dst_cq_hndl,
+               IN uint32_t             flags,
+               OUT gni_mem_handle_t    *mem_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PHYS_MAP)) {
+               rrc = GNI_RC_ERROR_RESOURCE;
+       } else {
+               rrc = gni_mem_register_segments(
+                       nic_hndl, mem_segments, segments_cnt,
+                       dst_cq_hndl, flags, mem_hndl);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+       case GNI_RC_ERROR_RESOURCE:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       nic_hndl, mem_segments, segments_cnt,
+                       dst_cq_hndl, flags, mem_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_hndl, mem_segments, segments_cnt,
+                       dst_cq_hndl, flags, mem_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_mem_register"
+#define apick_fmt "0x%p,"LPX64","LPX64"0x%p,%u,0x%p"
+static inline gni_return_t kgnilnd_mem_register(
+               IN gni_nic_handle_t     nic_hndl,
+               IN uint64_t             address,
+               IN uint64_t             length,
+               IN gni_cq_handle_t      dst_cq_hndl,
+               IN uint32_t             flags,
+               OUT gni_mem_handle_t    *mem_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_VIRT_MAP)) {
+               rrc = GNI_RC_ERROR_RESOURCE;
+       } else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_VIRT_SMALL_MAP) &&
+                  length <= *kgnilnd_tunables.kgn_max_immediate) {
+               rrc = GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_mem_register(
+                       nic_hndl, address, length,
+                       dst_cq_hndl, flags, mem_hndl);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+       case GNI_RC_ERROR_RESOURCE:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       nic_hndl, address, length,
+                       dst_cq_hndl, flags, mem_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_hndl, address, length,
+                       dst_cq_hndl, flags, mem_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_mem_deregister"
+#define apick_fmt "0x%p,0x%p,%d"
+static inline gni_return_t kgnilnd_mem_deregister(
+               IN gni_nic_handle_t     nic_hndl,
+               IN gni_mem_handle_t     *mem_hndl,
+               IN int                  hold_timeout
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_VIRT_UNMAP)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_mem_deregister(
+                       nic_hndl, mem_hndl, hold_timeout);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       nic_hndl, mem_hndl, hold_timeout);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_hndl, mem_hndl, hold_timeout);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_mem_mdd_release"
+#define apick_fmt "0x%p,0x%p"
+static inline gni_return_t kgnilnd_mem_mdd_release(
+               IN gni_nic_handle_t     nic_hndl,
+               IN gni_mem_handle_t     *mem_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_MDD_RELEASE)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NO_MATCH;
+       } else {
+               rrc = gni_mem_mdd_release(
+                       nic_hndl, mem_hndl);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+       case GNI_RC_NO_MATCH:
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_hndl, mem_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#endif /* _GNILND_API_WRAP_H */
diff --git a/lnet/klnds/gnilnd/gnilnd_cb.c b/lnet/klnds/gnilnd/gnilnd_cb.c
new file mode 100644 (file)
index 0000000..56be88a
--- /dev/null
@@ -0,0 +1,4366 @@
+/*
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * Copyright (C) 2009-2012 Cray, Inc.
+ *
+ *   Derived from work by Eric Barton <eric@bartonsoftware.com>
+ *   Author: Nic Henke <nic@cray.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/nmi.h>
+#include "gnilnd.h"
+
+/* this is useful when debugging wire corruption. */
+static void
+kgnilnd_dump_blob(int level, char *prefix, void *buf, int len)
+{
+       __u64 *ptr;
+
+       ptr = (__u64 *) buf;
+
+       while (len > 0) {
+               if (len >= 32) {
+                       CDEBUG(level,
+                              "%s 0x%p: 0x%16.16llx 0x%16.16llx 0x%16.16llx 0x%16.16llx\n",
+                              prefix, ptr, *(ptr), *(ptr + 1), *(ptr + 2), *(ptr + 3));
+                       ptr += 4;
+                       len -= 32;
+               } else if (len >= 16) {
+                       CDEBUG(level,
+                              "%s 0x%p: 0x%16.16llx 0x%16.16llx\n",
+                              prefix, ptr, *(ptr), *(ptr + 1));
+                       ptr += 2;
+                       len -= 16;
+               } else {
+                       CDEBUG(level, "%s 0x%p: 0x%16.16llx\n",
+                              prefix, ptr, *(ptr));
+                       ptr++;
+                       len -= 8;
+               }
+       }
+}
+
+static void
+kgnilnd_dump_msg(int mask, kgn_msg_t *msg)
+{
+       CDEBUG(mask, "0x%8.8x 0x%4.4x 0x%4.4x 0x%16.16llx"
+               " 0x%16.16llx 0x%8.8x 0x%4.4x 0x%4.4x 0x%8.8x\n",
+               msg->gnm_magic, msg->gnm_version,
+               msg->gnm_type, msg->gnm_srcnid,
+               msg->gnm_connstamp, msg->gnm_seq,
+               msg->gnm_cksum, msg->gnm_payload_cksum,
+               msg->gnm_payload_len);
+}
+
+void
+kgnilnd_schedule_device(kgn_device_t *dev)
+{
+       short         already_live = 0;
+
+       /* we'll only want to wake if the scheduler thread
+        * has come around and set ready to zero */
+       already_live = cmpxchg(&dev->gnd_ready, GNILND_DEV_IDLE, GNILND_DEV_IRQ);
+
+       if (!already_live) {
+               wake_up_all(&dev->gnd_waitq);
+       }
+       return;
+}
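kgnilnd_schedule_device() only issues a wake-up on the IDLE-to-IRQ transition, so repeated callbacks while the scheduler is already marked live cost nothing. A user-space sketch of that transition test, using a GCC atomic builtin and a counter as stand-ins for the kernel's cmpxchg() and wake_up_all() (both substitutions are assumptions made for illustration):

/* Sketch of the "wake only on the IDLE->IRQ transition" idea from
 * kgnilnd_schedule_device(); not driver code. */
#include <stdio.h>

#define DEV_IDLE 0
#define DEV_IRQ  1

static int dev_ready = DEV_IDLE;
static int wakeups;

static void schedule_device(void)
{
        /* returns the previous value; only the caller that saw IDLE wakes */
        int was = __sync_val_compare_and_swap(&dev_ready, DEV_IDLE, DEV_IRQ);

        if (was == DEV_IDLE)
                wakeups++;              /* stands in for wake_up_all() */
}

int main(void)
{
        schedule_device();              /* wakes */
        schedule_device();              /* already live: no extra wake */
        dev_ready = DEV_IDLE;           /* scheduler thread went idle again */
        schedule_device();              /* wakes again */
        printf("wakeups = %d\n", wakeups);      /* prints 2 */
        return 0;
}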
+
+void kgnilnd_schedule_device_timer(unsigned long arg)
+{
+       kgn_device_t *dev = (kgn_device_t *) arg;
+
+       kgnilnd_schedule_device(dev);
+}
+
+void
+kgnilnd_device_callback(__u32 devid, __u64 arg)
+{
+       kgn_device_t *dev;
+       int           index = (int) arg;
+
+       if (index >= kgnilnd_data.kgn_ndevs) {
+               /* use _EMERG instead of an LBUG to prevent LBUG'ing in
+                * interrupt context. */
+               LCONSOLE_EMERG("callback for unknown device %d->%d\n",
+                               devid, index);
+               return;
+       }
+
+       dev = &kgnilnd_data.kgn_devices[index];
+       /* just basic sanity */
+       if (dev->gnd_id == devid) {
+               kgnilnd_schedule_device(dev);
+       } else {
+               LCONSOLE_EMERG("callback for bad device %d devid %d\n",
+                               dev->gnd_id, devid);
+       }
+}
+
+/* sched_intent values:
+ * < 0 : do not reschedule under any circumstances
+ * == 0: reschedule if someone marked it WANTS_SCHED
+ * > 0 : force a reschedule */
+
+void
+kgnilnd_schedule_process_conn(kgn_conn_t *conn, int sched_intent)
+{
+       int     conn_sched;
+
+       /* move back to IDLE but save previous state.
+        * if we see WANTS_SCHED, we'll call kgnilnd_schedule_conn and
+        * let the xchg there handle any racing callers to get it
+        * onto gnd_ready_conns */
+
+       conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_IDLE);
+       LASSERTF(conn_sched == GNILND_CONN_WANTS_SCHED ||
+                conn_sched == GNILND_CONN_PROCESS,
+                "conn %p after process in bad state: %d\n",
+                conn, conn_sched);
+
+       if (sched_intent >= 0) {
+               if (sched_intent > 0 || conn_sched == GNILND_CONN_WANTS_SCHED) {
+                       kgnilnd_schedule_conn(conn);
+               }
+       }
+}
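The sched_intent contract in the comment above reduces to a small decision function; here is a standalone sketch (a hypothetical helper written for illustration, not part of the driver):

/* Sketch of the sched_intent contract: decide whether to reschedule given
 * the intent and the state the connection was left in. */
#include <stdio.h>

#define CONN_WANTS_SCHED 1
#define CONN_PROCESS     2

static int should_resched(int sched_intent, int prev_state)
{
        if (sched_intent < 0)
                return 0;                               /* never */
        if (sched_intent > 0)
                return 1;                               /* forced */
        return prev_state == CONN_WANTS_SCHED;          /* only if asked */
}

int main(void)
{
        printf("%d %d %d\n",
               should_resched(-1, CONN_WANTS_SCHED),    /* 0 */
               should_resched(0,  CONN_PROCESS),        /* 0 */
               should_resched(0,  CONN_WANTS_SCHED));   /* 1 */
        return 0;
}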
+
+void
+kgnilnd_schedule_conn(kgn_conn_t *conn)
+{
+       kgn_device_t        *dev = conn->gnc_device;
+       int                  sched;
+
+       sched = xchg(&conn->gnc_scheduled, GNILND_CONN_WANTS_SCHED);
+
+       /* if we are IDLE, add to list - only one guy sees IDLE and "wins"
+        * the chance to put it onto gnd_ready_conns.
+        * otherwise, leave marked as WANTS_SCHED and the thread that "owns"
+        *  the conn in process_conns will take care of moving it back to
+        *  SCHED when it is done processing */
+
+       if (sched == GNILND_CONN_IDLE) {
+               /* if the conn is already scheduled, we've already requested
+                * the scheduler thread wakeup */
+               kgnilnd_conn_addref(conn);       /* +1 ref for scheduler */
+
+               LASSERTF(list_empty(&conn->gnc_schedlist), "conn %p already sched state %d\n",
+                        conn, sched);
+
+               CDEBUG(D_INFO, "scheduling conn 0x%p\n", conn);
+
+               spin_lock(&dev->gnd_lock);
+               list_add_tail(&conn->gnc_schedlist, &dev->gnd_ready_conns);
+               spin_unlock(&dev->gnd_lock);
+               set_mb(conn->gnc_last_sched_ask, jiffies);
+
+       } else {
+               CDEBUG(D_INFO, "not scheduling conn 0x%p: %d\n", conn, sched);
+       }
+
+       /* make sure thread(s) going to process conns - but let it make
+        * separate decision from conn schedule */
+       kgnilnd_schedule_device(dev);
+}
+
+void
+kgnilnd_schedule_dgram(kgn_device_t *dev)
+{
+       int                  wake;
+
+       wake = xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_SCHED);
+       if (wake != GNILND_DGRAM_SCHED)  {
+               wake_up(&dev->gnd_dgram_waitq);
+       } else {
+               CDEBUG(D_NETTRACE, "not waking: %d\n", wake);
+       }
+}
+
+void
+kgnilnd_free_tx(kgn_tx_t *tx)
+{
+       /* taken from kgnilnd_tx_add_state_locked */
+
+       LASSERTF((tx->tx_list_p == NULL &&
+                 tx->tx_list_state == GNILND_TX_ALLOCD) &&
+               list_empty(&tx->tx_list),
+               "tx %p with bad state %s (list_p %p) tx_list %s\n",
+               tx, kgnilnd_tx_state2str(tx->tx_list_state), tx->tx_list_p,
+               list_empty(&tx->tx_list) ? "empty" : "not empty");
+
+       atomic_dec(&kgnilnd_data.kgn_ntx);
+
+       /* we only allocate this if we need to */
+       if (tx->tx_phys != NULL) {
+               cfs_mem_cache_free(kgnilnd_data.kgn_tx_phys_cache, tx->tx_phys);
+               CDEBUG(D_MALLOC, "slab-freed 'tx_phys': %lu at %p.\n",
+                      LNET_MAX_IOV * sizeof(gni_mem_segment_t), tx->tx_phys);
+       }
+#if 0
+       KGNILND_POISON(tx, 0x5a, sizeof(kgn_tx_t));
+#endif
+       cfs_mem_cache_free(kgnilnd_data.kgn_tx_cache, tx);
+       CDEBUG(D_MALLOC, "slab-freed 'tx': %lu at %p.\n",
+              sizeof(*tx), tx);
+}
+
+kgn_tx_t *
+kgnilnd_alloc_tx(void)
+{
+       kgn_tx_t      *tx = NULL;
+
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_ALLOC_TX))
+               return tx;
+
+       tx = cfs_mem_cache_alloc(kgnilnd_data.kgn_tx_cache, CFS_ALLOC_ATOMIC);
+       if (tx == NULL) {
+               CERROR("failed to allocate tx\n");
+               return NULL;
+       }
+       CDEBUG(D_MALLOC, "slab-alloced 'tx': %lu at %p.\n",
+              sizeof(*tx), tx);
+
+       /* need this memset, cache alloc'd memory is not cleared */
+       memset(tx, 0, sizeof(*tx));
+
+       /* setup everything here to minimize time under the lock */
+       tx->tx_buftype = GNILND_BUF_NONE;
+       tx->tx_msg.gnm_type = GNILND_MSG_NONE;
+       INIT_LIST_HEAD(&tx->tx_list);
+       INIT_LIST_HEAD(&tx->tx_map_list);
+       tx->tx_list_state = GNILND_TX_ALLOCD;
+
+       atomic_inc(&kgnilnd_data.kgn_ntx);
+
+       return tx;
+}
+
+/* csum_fold needs to be run on the return value before shipping over the wire */
+#define _kgnilnd_cksum(seed, ptr, nob)  csum_partial(ptr, nob, seed)
+
+/* we don't use offset as everyone is passing a buffer reference that already
+ * includes the offset into the base address -
+ *  see kgnilnd_setup_virt_buffer and kgnilnd_setup_immediate_buffer */
+static inline __u16
+kgnilnd_cksum(void *ptr, size_t nob)
+{
+       __u16   sum;
+
+       sum = csum_fold(_kgnilnd_cksum(0, ptr, nob));
+
+       /* don't use magic 'no checksum' value */
+       if (sum == 0)
+               sum = 1;
+
+       CDEBUG(D_INFO, "cksum 0x%x for ptr 0x%p sz %zu\n",
+              sum, ptr, nob);
+
+       return sum;
+}
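kgnilnd_cksum() folds the partial sum down to 16 bits and then reserves 0 as the wire's 'no checksum' marker. The sketch below shows only that fold-then-avoid-zero convention with a trivial byte sum; it is not the kernel's csum_partial()/csum_fold() and does not produce compatible values:

/* Simplified stand-in for the checksum convention used by kgnilnd_cksum():
 * fold a running sum to 16 bits and reserve 0 as the "no checksum" marker. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint16_t sketch_cksum(const void *ptr, size_t nob)
{
        const uint8_t *p = ptr;
        uint32_t sum = 0;
        uint16_t folded;

        while (nob--)
                sum += *p++;

        /* fold carries back into the low 16 bits */
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        folded = (uint16_t)sum;

        /* don't use the magic 'no checksum' value */
        return folded == 0 ? 1 : folded;
}

int main(void)
{
        const char msg[] = "gnilnd payload";

        printf("cksum = 0x%x\n", sketch_cksum(msg, strlen(msg)));
        return 0;
}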
+
+inline __u16
+kgnilnd_cksum_kiov(unsigned int nkiov, lnet_kiov_t *kiov,
+                   unsigned int offset, unsigned int nob, int dump_blob)
+{
+       __wsum             cksum = 0;
+       __wsum             tmpck;
+       __u16              retsum;
+       void              *addr;
+       unsigned int       fraglen;
+       int                i, odd;
+
+       LASSERT(nkiov > 0);
+       LASSERT(nob > 0);
+
+       CDEBUG(D_BUFFS, "calc cksum for kiov 0x%p nkiov %u offset %u nob %u, dump %d\n",
+              kiov, nkiov, offset, nob, dump_blob);
+
+       /* if loops changes, please change kgnilnd_setup_phys_buffer */
+
+       while (offset >= kiov->kiov_len) {
+               offset -= kiov->kiov_len;
+               nkiov--;
+               kiov++;
+               LASSERT(nkiov > 0);
+       }
+
+       /* ignore nob here, if nob < (kiov_len - offset), kiov == 1 */
+       odd = (unsigned long) (kiov[0].kiov_len - offset) & 1;
+
+       if ((odd || *kgnilnd_tunables.kgn_vmap_cksum) && nkiov > 1) {
+               struct page **pages = kgnilnd_data.kgn_cksum_map_pages[get_cpu()];
+
+               LASSERTF(pages != NULL, "NULL pages for cpu %d map_pages 0x%p\n",
+                        get_cpu(), kgnilnd_data.kgn_cksum_map_pages);
+
+               CDEBUG(D_BUFFS, "odd %d len %u offset %u nob %u\n",
+                      odd, kiov[0].kiov_len, offset, nob);
+
+               for (i = 0; i < nkiov; i++) {
+                       pages[i] = kiov[i].kiov_page;
+               }
+
+               addr = vmap(pages, nkiov, VM_MAP, PAGE_KERNEL);
+               if (addr == NULL) {
+                       CNETERR("Couldn't vmap %d frags on %d bytes to avoid odd length fragment in cksum\n",
+                               nkiov, nob);
+                       /* return zero to avoid killing tx - we'll just get warning on console
+                        * when remote end sees zero checksum */
+                       RETURN(0);
+               }
+               atomic_inc(&kgnilnd_data.kgn_nvmap_cksum);
+
+               tmpck = _kgnilnd_cksum(0, (void *) addr + kiov[0].kiov_offset + offset, nob);
+               cksum = tmpck;
+
+               if (dump_blob) {
+                       kgnilnd_dump_blob(D_BUFFS, "flat kiov RDMA payload",
+                                         (void *)addr + kiov[0].kiov_offset + offset, nob);
+               }
+               CDEBUG(D_BUFFS, "cksum 0x%x (+0x%x) for addr 0x%p+%u len %u offset %u\n",
+                      cksum, tmpck, addr, kiov[0].kiov_offset, nob, offset);
+               vunmap(addr);
+       } else {
+               do {
+                       fraglen = min(kiov->kiov_len - offset, nob);
+
+                       /* make dang sure we don't send a bogus checksum if somehow we get
+                        * an odd length fragment on anything but the last entry in a kiov  -
+                        * we know from kgnilnd_setup_rdma_buffer that we can't have non
+                        * PAGE_SIZE pages in the middle, so if nob < PAGE_SIZE, it is the last one */
+                       LASSERTF(!(fraglen&1) || (nob < PAGE_SIZE),
+                                "odd fraglen %u on nkiov %d, nob %u kiov_len %u offset %u kiov 0x%p\n",
+                                fraglen, nkiov, nob, kiov->kiov_len, offset, kiov);
+
+                       addr = (void *)kmap(kiov->kiov_page) + kiov->kiov_offset + offset;
+                       tmpck = _kgnilnd_cksum(cksum, addr, fraglen);
+
+                       CDEBUG(D_BUFFS,
+                              "cksum 0x%x (+0x%x) for page 0x%p+%u (0x%p) len %u offset %u\n",
+                              cksum, tmpck, kiov->kiov_page, kiov->kiov_offset, addr,
+                              fraglen, offset);
+
+                       cksum = tmpck;
+
+                       if (dump_blob)
+                               kgnilnd_dump_blob(D_BUFFS, "kiov cksum", addr, fraglen);
+
+                       kunmap(kiov->kiov_page);
+
+                       kiov++;
+                       nkiov--;
+                       nob -= fraglen;
+                       offset = 0;
+
+                       /* iov must not run out before end of data */
+                       LASSERTF(nob == 0 || nkiov > 0, "nob %u nkiov %u\n", nob, nkiov);
+
+               } while (nob > 0);
+       }
+
+       retsum = csum_fold(cksum);
+
+       /* don't use magic 'no checksum' value */
+       if (retsum == 0)
+               retsum = 1;
+
+       CDEBUG(D_BUFFS, "retsum 0x%x from cksum 0x%x\n", retsum, cksum);
+
+       return retsum;
+}
+
+void
+kgnilnd_init_msg(kgn_msg_t *msg, int type, lnet_nid_t source)
+{
+       msg->gnm_magic = GNILND_MSG_MAGIC;
+       msg->gnm_version = GNILND_MSG_VERSION;
+       msg->gnm_type = type;
+       msg->gnm_payload_len = 0;
+       msg->gnm_srcnid = source;
+       /* gnm_connstamp gets set when FMA is sent */
+       /* gnm_srcnid is set on creation via function argument
+        * The right interface/net and nid is passed in when the message
+        * is created.
+        */
+}
+
+kgn_tx_t *
+kgnilnd_new_tx_msg(int type, lnet_nid_t source)
+{
+       kgn_tx_t *tx = kgnilnd_alloc_tx();
+
+       if (tx != NULL) {
+               kgnilnd_init_msg(&tx->tx_msg, type, source);
+       } else {
+               CERROR("couldn't allocate new tx type %s!\n",
+                      kgnilnd_msgtype2str(type));
+       }
+
+       return tx;
+}
+
+static void
+kgnilnd_nak_rdma(kgn_conn_t *conn, int type, int error, __u64 cookie, lnet_nid_t source)
+{
+       kgn_tx_t        *tx;
+
+       /* only allow NAK on error and truncate to zero */
+       LASSERTF(error <= 0, "error %d conn 0x%p, cookie "LPU64"\n",
+                error, conn, cookie);
+
+       tx = kgnilnd_new_tx_msg(type, source);
+       if (tx == NULL) {
+               CNETERR("can't get TX to NAK RDMA to %s\n",
+                       libcfs_nid2str(conn->gnc_peer->gnp_nid));
+               return;
+       }
+
+       tx->tx_msg.gnm_u.completion.gncm_retval = error;
+       tx->tx_msg.gnm_u.completion.gncm_cookie = cookie;
+       kgnilnd_queue_tx(conn, tx);
+}
+
+int
+kgnilnd_setup_immediate_buffer(kgn_tx_t *tx, unsigned int niov, struct iovec *iov,
+                              lnet_kiov_t *kiov, unsigned int offset, unsigned int nob)
+
+{
+       kgn_msg_t       *msg = &tx->tx_msg;
+       int              i;
+
+       /* To help save on MDDs for short messages, we'll vmap a kiov to allow
+        * gni_smsg_send to send that as the payload */
+
+       LASSERT(tx->tx_buftype == GNILND_BUF_NONE);
+       LASSERT(nob >= 0);
+
+       if (nob == 0) {
+               tx->tx_buffer = NULL;
+       } else if (kiov != NULL) {
+               LASSERTF(niov > 0 && niov < GNILND_MAX_IMMEDIATE/PAGE_SIZE,
+                        "bad niov %d\n", niov);
+
+               while (offset >= kiov->kiov_len) {
+                       offset -= kiov->kiov_len;
+                       niov--;
+                       kiov++;
+                       LASSERT(niov > 0);
+               }
+               for (i = 0; i < niov; i++) {
+                       /* We can't have a kiov_offset on anything but the first entry,
+                        * otherwise we'll have a hole at the end of the mapping as we only map
+                        * whole pages.
+                        * Also, if we have a kiov_len < PAGE_SIZE but we need to map more
+                        * than kiov_len, we will also have a hole at the end of that page
+                        * which isn't allowed */
+                       if ((kiov[i].kiov_offset != 0 && i > 0) ||
+                           (kiov[i].kiov_offset + kiov[i].kiov_len != CFS_PAGE_SIZE && i < niov - 1)) {
+                               CNETERR("Can't make payload contiguous in I/O VM:"
+                                      "page %d, offset %u, nob %u, kiov_offset %u kiov_len %u \n",
+                                      i, offset, nob, kiov->kiov_offset, kiov->kiov_len);
+                               RETURN(-EINVAL);
+                       }
+                       tx->tx_imm_pages[i] = kiov[i].kiov_page;
+               }
+
+               /* hijack tx_phys for the later unmap */
+               if (niov == 1) {
+                       /* tx->tx_phys being equal to NULL is the signal for unmap to discern between kmap and vmap */
+                       tx->tx_phys = NULL;
+                       tx->tx_buffer = (void *)kmap(tx->tx_imm_pages[0]) + kiov[0].kiov_offset + offset;
+                       atomic_inc(&kgnilnd_data.kgn_nkmap_short);
+                       GNIDBG_TX(D_NET, tx, "kmapped page for %d bytes for kiov 0x%p, buffer 0x%p",
+                               nob, kiov, tx->tx_buffer);
+               } else {
+                       tx->tx_phys = vmap(tx->tx_imm_pages, niov, VM_MAP, PAGE_KERNEL);
+                       if (tx->tx_phys == NULL) {
+                               CNETERR("Couldn't vmap %d frags on %d bytes\n", niov, nob);
+                               RETURN(-ENOMEM);
+
+                       }
+                       atomic_inc(&kgnilnd_data.kgn_nvmap_short);
+                       /* make sure we take into account the kiov offset as the start of the buffer */
+                       tx->tx_buffer = (void *)tx->tx_phys + kiov[0].kiov_offset + offset;
+                       GNIDBG_TX(D_NET, tx, "mapped %d pages for %d bytes from kiov 0x%p to 0x%p, buffer 0x%p",
+                               niov, nob, kiov, tx->tx_phys, tx->tx_buffer);
+               }
+               tx->tx_buftype = GNILND_BUF_IMMEDIATE_KIOV;
+               tx->tx_nob = nob;
+
+       } else {
+               /* For now this is almost identical to kgnilnd_setup_virt_buffer, but we
+                * could "flatten" the payload into a single contiguous buffer ready
+                * for sending directly over an FMA if we ever needed to. */
+
+               LASSERT(niov > 0);
+
+               while (offset >= iov->iov_len) {
+                       offset -= iov->iov_len;
+                       niov--;
+                       iov++;
+                       LASSERT(niov > 0);
+               }
+
+               if (nob > iov->iov_len - offset) {
+                       CERROR("Can't handle multiple vaddr fragments\n");
+                       return -EMSGSIZE;
+               }
+
+               tx->tx_buffer = (void *)(((unsigned long)iov->iov_base) + offset);
+
+               tx->tx_buftype = GNILND_BUF_IMMEDIATE;
+               tx->tx_nob = nob;
+       }
+
+       /* checksum payload early - it shouldn't be changing after lnd_send */
+       if (*kgnilnd_tunables.kgn_checksum >= 2) {
+               msg->gnm_payload_cksum = kgnilnd_cksum(tx->tx_buffer, nob);
+               if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_CKSUM2)) {
+                       msg->gnm_payload_cksum += 0xe00e;
+               }
+               if (*kgnilnd_tunables.kgn_checksum_dump > 1) {
+                       kgnilnd_dump_blob(D_BUFFS, "payload checksum",
+                                         tx->tx_buffer, nob);
+               }
+       } else {
+               msg->gnm_payload_cksum = 0;
+       }
+
+       return 0;
+}
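The contiguity rule spelled out in the comments of kgnilnd_setup_immediate_buffer above (only the first fragment may carry a non-zero offset, and every fragment except the last must end exactly at a page boundary, or the vmap of whole pages would leave holes) can be written as a small predicate. A sketch with a hypothetical frag_t standing in for lnet_kiov_t:

/* Sketch of the "payload must stay contiguous when mapping whole pages"
 * rule; frag_t is a hypothetical stand-in for lnet_kiov_t. */
#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096u

typedef struct {
        unsigned int offset;    /* like kiov_offset */
        unsigned int len;       /* like kiov_len    */
} frag_t;

static int frags_mappable(const frag_t *frag, int nfrags)
{
        int i;

        for (i = 0; i < nfrags; i++) {
                if (i > 0 && frag[i].offset != 0)
                        return 0;       /* hole before this fragment */
                if (i < nfrags - 1 &&
                    frag[i].offset + frag[i].len != SKETCH_PAGE_SIZE)
                        return 0;       /* hole after this fragment  */
        }
        return 1;
}

int main(void)
{
        frag_t good[] = { { 100, SKETCH_PAGE_SIZE - 100 }, { 0, 4096 }, { 0, 50 } };
        frag_t bad[]  = { { 0, 4096 }, { 0, 2048 }, { 0, 4096 } };

        printf("good=%d bad=%d\n",
               frags_mappable(good, 3), frags_mappable(bad, 3));  /* 1 0 */
        return 0;
}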
+
+int
+kgnilnd_setup_virt_buffer(kgn_tx_t *tx,
+                         unsigned int niov, struct iovec *iov,
+                         unsigned int offset, unsigned int nob)
+
+{
+       LASSERT(nob > 0);
+       LASSERT(niov > 0);
+       LASSERT(tx->tx_buftype == GNILND_BUF_NONE);
+
+       while (offset >= iov->iov_len) {
+               offset -= iov->iov_len;
+               niov--;
+               iov++;
+               LASSERT(niov > 0);
+       }
+
+       if (nob > iov->iov_len - offset) {
+               CERROR("Can't handle multiple vaddr fragments\n");
+               return -EMSGSIZE;
+       }
+
+       tx->tx_buftype = GNILND_BUF_VIRT_UNMAPPED;
+       tx->tx_nob = nob;
+       tx->tx_buffer = (void *)(((unsigned long)iov->iov_base) + offset);
+       return 0;
+}
+
+int
+kgnilnd_setup_phys_buffer(kgn_tx_t *tx, int nkiov, lnet_kiov_t *kiov,
+                         unsigned int offset, unsigned int nob)
+{
+       gni_mem_segment_t *phys;
+       int                rc = 0;
+       unsigned int       fraglen;
+
+       GNIDBG_TX(D_NET, tx, "niov %d kiov 0x%p offset %u nob %u", nkiov, kiov, offset, nob);
+
+       LASSERT(nob > 0);
+       LASSERT(nkiov > 0);
+       LASSERT(tx->tx_buftype == GNILND_BUF_NONE);
+
+       /* only allocate this if we are going to use it */
+       tx->tx_phys = cfs_mem_cache_alloc(kgnilnd_data.kgn_tx_phys_cache,
+                                         CFS_ALLOC_ATOMIC);
+       if (tx->tx_phys == NULL) {
+               CERROR("failed to allocate tx_phys\n");
+               rc = -ENOMEM;
+               GOTO(error, rc);
+       }
+
+       CDEBUG(D_MALLOC, "slab-alloced 'tx->tx_phys': %lu at %p.\n",
+              LNET_MAX_IOV * sizeof(gni_mem_segment_t), tx->tx_phys);
+
+       /* if loops changes, please change kgnilnd_cksum_kiov
+        *   and kgnilnd_setup_immediate_buffer */
+
+       while (offset >= kiov->kiov_len) {
+               offset -= kiov->kiov_len;
+               nkiov--;
+               kiov++;
+               LASSERT(nkiov > 0);
+       }
+
+       /* at this point, kiov points to the first page that we'll actually map
+        * now that we've advanced into the kiov for offset and dropped any
+        * leading pages that fall entirely within the offset */
+       tx->tx_buftype = GNILND_BUF_PHYS_UNMAPPED;
+       tx->tx_nob = nob;
+
+       /* kiov_offset is start of 'valid' buffer, so index offset past that */
+       tx->tx_buffer = (void *)((unsigned long)(kiov->kiov_offset + offset));
+       phys = tx->tx_phys;
+
+       CDEBUG(D_NET, "tx 0x%p buffer 0x%p map start kiov 0x%p+%u niov %d offset %u\n",
+              tx, tx->tx_buffer, kiov, kiov->kiov_offset, nkiov, offset);
+
+       do {
+               fraglen = min(kiov->kiov_len - offset, nob);
+
+               /* We can't have a kiov_offset on anything but the first entry,
+                * otherwise we'll have a hole at the end of the mapping as we only map
+                * whole pages. Only the first page is allowed to have an offset -
+                * we'll add that into tx->tx_buffer and that will get used when we
+                * map in the segments (see kgnilnd_map_buffer).
+                * Also, if we have a kiov_len < PAGE_SIZE but we need to map more
+                * than kiov_len, we will also have a hole at the end of that page
+                * which isn't allowed */
+               if ((phys != tx->tx_phys) &&
+                   ((kiov->kiov_offset != 0) ||
+                    ((kiov->kiov_len < PAGE_SIZE) && (nob > kiov->kiov_len)))) {
+                       CERROR("Can't make payload contiguous in I/O VM:"
+                              "page %d, offset %u, nob %u, kiov_offset %u kiov_len %u \n",
+                              (int)(phys - tx->tx_phys),
+                              offset, nob, kiov->kiov_offset, kiov->kiov_len);
+                       rc = -EINVAL;
+                       GOTO(error, rc);
+               }
+
+               if ((phys - tx->tx_phys) == LNET_MAX_IOV) {
+                       CERROR ("payload too big (%d)\n", (int)(phys - tx->tx_phys));
+                       rc = -EMSGSIZE;
+                       GOTO(error, rc);
+               }
+
+               if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PHYS_SETUP)) {
+                       rc = -EINVAL;
+                       GOTO(error, rc);
+               }
+
+               CDEBUG(D_BUFFS, "page 0x%p kiov_offset %u kiov_len %u nob %u "
+                              "nkiov %u offset %u\n",
+                     kiov->kiov_page, kiov->kiov_offset, kiov->kiov_len, nob, nkiov, offset);
+
+               phys->address = lnet_page2phys(kiov->kiov_page);
+               phys++;
+               kiov++;
+               nkiov--;
+               nob -= fraglen;
+               offset = 0;
+
+               /* iov must not run out before end of data */
+               LASSERTF(nob == 0 || nkiov > 0, "nob %u nkiov %u\n", nob, nkiov);
+
+       } while (nob > 0);
+
+       tx->tx_phys_npages = phys - tx->tx_phys;
+
+       return 0;
+
+error:
+       if (tx->tx_phys != NULL) {
+               cfs_mem_cache_free(kgnilnd_data.kgn_tx_phys_cache, tx->tx_phys);
+               CDEBUG(D_MALLOC, "slab-freed 'tx_phys': %lu at %p.\n",
+                      LNET_MAX_IOV * sizeof(gni_mem_segment_t), tx->tx_phys);
+               tx->tx_phys = NULL;
+       }
+       return rc;
+}
+
+static inline int
+kgnilnd_setup_rdma_buffer(kgn_tx_t *tx, unsigned int niov,
+                         struct iovec *iov, lnet_kiov_t *kiov,
+                         unsigned int offset, unsigned int nob)
+{
+       int     rc;
+
+       LASSERT((iov == NULL) != (kiov == NULL));
+
+       if (kiov != NULL) {
+               rc = kgnilnd_setup_phys_buffer(tx, niov, kiov, offset, nob);
+       } else {
+               rc = kgnilnd_setup_virt_buffer(tx, niov, iov, offset, nob);
+       }
+       return rc;
+}
+
+static void
+kgnilnd_parse_lnet_rdma(lnet_msg_t *lntmsg, unsigned int *niov, unsigned int *offset,
+                       unsigned int *nob, lnet_kiov_t **kiov)
+{
+       /* GETs are weird, see kgnilnd_send */
+       if (lntmsg->msg_type == LNET_MSG_GET) {
+               if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) {
+                       *kiov = NULL;
+               } else {
+                       *kiov = lntmsg->msg_md->md_iov.kiov;
+               }
+               *niov = lntmsg->msg_md->md_niov;
+               *nob = lntmsg->msg_md->md_length;
+               *offset = 0;
+       } else {
+               *kiov = lntmsg->msg_kiov;
+               *niov = lntmsg->msg_niov;
+               *nob = lntmsg->msg_len;
+               *offset = lntmsg->msg_offset;
+       }
+}
+
+static inline void
+kgnilnd_compute_rdma_cksum(kgn_tx_t *tx)
+{
+       unsigned int     niov, offset, nob;
+       lnet_kiov_t     *kiov;
+       lnet_msg_t      *lntmsg = tx->tx_lntmsg[0];
+       int              dump_cksum = (*kgnilnd_tunables.kgn_checksum_dump > 1);
+
+       GNITX_ASSERTF(tx, ((tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE) ||
+                          (tx->tx_msg.gnm_type == GNILND_MSG_GET_DONE)),
+                     "bad type %s", kgnilnd_msgtype2str(tx->tx_msg.gnm_type));
+
+       if (*kgnilnd_tunables.kgn_checksum < 3) {
+               tx->tx_msg.gnm_payload_cksum = 0;
+               return;
+       }
+
+       GNITX_ASSERTF(tx, lntmsg, "no LNet message!", NULL);
+
+       kgnilnd_parse_lnet_rdma(lntmsg, &niov, &offset, &nob, &kiov);
+
+       if (kiov != NULL) {
+               tx->tx_msg.gnm_payload_cksum = kgnilnd_cksum_kiov(niov, kiov, offset, nob, dump_cksum);
+       } else {
+               tx->tx_msg.gnm_payload_cksum = kgnilnd_cksum(tx->tx_buffer, nob);
+               if (dump_cksum) {
+                       kgnilnd_dump_blob(D_BUFFS, "peer RDMA payload", tx->tx_buffer, nob);
+               }
+       }
+
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_CKSUM3)) {
+               tx->tx_msg.gnm_payload_cksum += 0xd00d;
+       }
+}
+
+static inline int
+kgnilnd_verify_rdma_cksum(kgn_tx_t *tx, __u16 rx_cksum)
+{
+       int              rc = 0;
+       __u16            cksum;
+       unsigned int     niov, offset, nob;
+       lnet_kiov_t     *kiov;
+       lnet_msg_t      *lntmsg = tx->tx_lntmsg[0];
+       int dump_on_err = *kgnilnd_tunables.kgn_checksum_dump;
+
+       /* we can only match certain requests */
+       GNITX_ASSERTF(tx, ((tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) ||
+                          (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK)),
+                     "bad type %s", kgnilnd_msgtype2str(tx->tx_msg.gnm_type));
+
+       if (rx_cksum == 0)  {
+               if (*kgnilnd_tunables.kgn_checksum >= 3) {
+                       GNIDBG_MSG(D_WARNING, &tx->tx_msg,
+                                  "no RDMA payload checksum when enabled");
+               }
+               return 0;
+       }
+
+       GNITX_ASSERTF(tx, lntmsg, "no LNet message!", NULL);
+
+       kgnilnd_parse_lnet_rdma(lntmsg, &niov, &offset, &nob, &kiov);
+
+       if (kiov != NULL) {
+               cksum = kgnilnd_cksum_kiov(niov, kiov, offset, nob, 0);
+       } else {
+               cksum = kgnilnd_cksum(tx->tx_buffer, nob);
+       }
+
+       if (cksum != rx_cksum) {
+               GNIDBG_MSG(D_NETERROR, &tx->tx_msg,
+                          "Bad RDMA payload checksum (%x expected %x); "
+                          "kiov 0x%p niov %d nob %u offset %u",
+                           cksum, rx_cksum, kiov, niov, nob, offset);
+               switch (dump_on_err) {
+               case 2:
+                       if (kiov != NULL) {
+                               kgnilnd_cksum_kiov(niov, kiov, offset, nob, 1);
+                       } else {
+                               kgnilnd_dump_blob(D_BUFFS, "RDMA payload",
+                                                 tx->tx_buffer, nob);
+                       }
+                       /* fall through to dump log */
+               case 1:
+                       libcfs_debug_dumplog();
+                       break;
+               default:
+                       break;
+               }
+               rc = -ENOKEY;
+               /* kgnilnd_check_fma_rx will close conn, kill tx with error */
+       }
+       return rc;
+}
+
+void
+kgnilnd_mem_add_map_list(kgn_device_t *dev, kgn_tx_t *tx)
+{
+       int     bytes;
+
+       GNITX_ASSERTF(tx, list_empty(&tx->tx_map_list),
+               "already mapped!", NULL);
+
+       spin_lock(&dev->gnd_map_lock);
+       switch (tx->tx_buftype) {
+       default:
+               GNIDBG_TX(D_EMERG, tx,
+                       "SOFTWARE BUG: invalid mapping %d", tx->tx_buftype);
+               spin_unlock(&dev->gnd_map_lock);
+               LBUG();
+               break;
+
+       case GNILND_BUF_PHYS_MAPPED:
+               bytes = tx->tx_phys_npages * PAGE_SIZE;
+               dev->gnd_map_nphys++;
+               dev->gnd_map_physnop += tx->tx_phys_npages;
+               break;
+
+       case GNILND_BUF_VIRT_MAPPED:
+               bytes = tx->tx_nob;
+               dev->gnd_map_nvirt++;
+               dev->gnd_map_virtnob += tx->tx_nob;
+               break;
+       }
+
+       if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK ||
+           tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) {
+               atomic64_add(bytes, &dev->gnd_rdmaq_bytes_out);
+               GNIDBG_TX(D_NETTRACE, tx, "rdma ++ %d to "LPD64"",
+                         bytes, atomic64_read(&dev->gnd_rdmaq_bytes_out));
+       }
+
+       atomic_inc(&dev->gnd_n_mdd);
+       atomic64_add(bytes, &dev->gnd_nbytes_map);
+
+       /* clear retrans to prevent any SMSG goofiness as that code uses the same counter */
+       tx->tx_retrans = 0;
+
+       /* we only get here in the valid cases */
+       list_add_tail(&tx->tx_map_list, &dev->gnd_map_list);
+       dev->gnd_map_version++;
+       spin_unlock(&dev->gnd_map_lock);
+}
+
+void
+kgnilnd_mem_del_map_list(kgn_device_t *dev, kgn_tx_t *tx)
+{
+       int     bytes;
+
+       GNITX_ASSERTF(tx, !list_empty(&tx->tx_map_list),
+               "not mapped!", NULL);
+       spin_lock(&dev->gnd_map_lock);
+
+       switch (tx->tx_buftype) {
+       default:
+               GNIDBG_TX(D_EMERG, tx,
+                       "SOFTWARE BUG: invalid mapping %d", tx->tx_buftype);
+               spin_unlock(&dev->gnd_map_lock);
+               LBUG();
+               break;
+
+       case GNILND_BUF_PHYS_UNMAPPED:
+               bytes = tx->tx_phys_npages * PAGE_SIZE;
+               dev->gnd_map_nphys--;
+               dev->gnd_map_physnop -= tx->tx_phys_npages;
+               break;
+
+       case GNILND_BUF_VIRT_UNMAPPED:
+               bytes = tx->tx_nob;
+               dev->gnd_map_nvirt--;
+               dev->gnd_map_virtnob -= tx->tx_nob;
+               break;
+       }
+
+       if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK ||
+           tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) {
+               atomic64_sub(bytes, &dev->gnd_rdmaq_bytes_out);
+               LASSERTF(atomic64_read(&dev->gnd_rdmaq_bytes_out) >= 0,
+                        "bytes_out negative! %ld\n", atomic64_read(&dev->gnd_rdmaq_bytes_out));
+               GNIDBG_TX(D_NETTRACE, tx, "rdma -- %d to "LPD64"",
+                         bytes, atomic64_read(&dev->gnd_rdmaq_bytes_out));
+       }
+
+       atomic_dec(&dev->gnd_n_mdd);
+       atomic64_sub(bytes, &dev->gnd_nbytes_map);
+
+       /* we only get here in the valid cases */
+       list_del_init(&tx->tx_map_list);
+       dev->gnd_map_version++;
+       spin_unlock(&dev->gnd_map_lock);
+}
+
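+/* Register the tx buffer with the Gemini NIC if it is not already mapped:
+ * physical page lists go through kgnilnd_mem_register_segments, virtually
+ * contiguous buffers through kgnilnd_mem_register. On success the buftype
+ * is promoted to *_MAPPED and the tx joins the device map list. */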
+int
+kgnilnd_map_buffer(kgn_tx_t *tx)
+{
+       kgn_conn_t       *conn = tx->tx_conn;
+       kgn_device_t     *dev = conn->gnc_device;
+       __u32             flags = GNI_MEM_READWRITE;
+       gni_return_t      rrc;
+
+       /* The kgnilnd_mem_register(_segments) Gemini Driver functions can
+        * be called concurrently as there are internal locks that protect
+        * any data structures or HW resources. We just need to ensure
+        * that our concurrency doesn't result in the kgn_device_t
+        * getting nuked while we are in here */
+
+       LASSERTF(conn != NULL, "tx %p with NULL conn, someone forgot"
+               " to set tx_conn before calling %s\n", tx, __FUNCTION__);
+
+       if (unlikely(CFS_FAIL_CHECK(CFS_FAIL_GNI_MAP_TX)))
+               RETURN(-ENOMEM);
+
+       if (*kgnilnd_tunables.kgn_bte_relaxed_ordering) {
+               flags |= GNI_MEM_RELAXED_PI_ORDERING;
+       }
+
+       switch (tx->tx_buftype) {
+       default:
+               LBUG();
+
+       case GNILND_BUF_NONE:
+       case GNILND_BUF_IMMEDIATE:
+       case GNILND_BUF_IMMEDIATE_KIOV:
+       case GNILND_BUF_PHYS_MAPPED:
+       case GNILND_BUF_VIRT_MAPPED:
+               return 0;
+
+       case GNILND_BUF_PHYS_UNMAPPED:
+               GNITX_ASSERTF(tx, tx->tx_phys != NULL, "physical buffer not there!", NULL);
+               rrc = kgnilnd_mem_register_segments(dev->gnd_handle,
+                       tx->tx_phys, tx->tx_phys_npages, NULL,
+                       GNI_MEM_PHYS_SEGMENTS | flags,
+                       &tx->tx_map_key);
+               /* could race with other uses of the map counts, but this is ok
+                * - this needs to turn into a non-fatal error soon to allow
+                *  handling of GART resource, etc. starvation */
+               if (rrc != GNI_RC_SUCCESS) {
+                       GNIDBG_TX(D_NET, tx, "Can't map %d pages: dev %d "
+                               "phys %u pp %u, virt %u nob "LPU64"",
+                               tx->tx_phys_npages, dev->gnd_id,
+                               dev->gnd_map_nphys, dev->gnd_map_physnop,
+                               dev->gnd_map_nvirt, dev->gnd_map_virtnob);
+                       RETURN(rrc == GNI_RC_ERROR_RESOURCE ? -ENOMEM : -EINVAL);
+               }
+
+               tx->tx_buftype = GNILND_BUF_PHYS_MAPPED;
+               kgnilnd_mem_add_map_list(dev, tx);
+               return 0;
+
+       case GNILND_BUF_VIRT_UNMAPPED:
+               rrc = kgnilnd_mem_register(dev->gnd_handle,
+                       (__u64)tx->tx_buffer, tx->tx_nob,
+                       NULL, flags, &tx->tx_map_key);
+               if (rrc != GNI_RC_SUCCESS) {
+                       GNIDBG_TX(D_NET, tx, "Can't map %u bytes: dev %d "
+                               "phys %u pp %u, virt %u nob "LPU64"",
+                               tx->tx_nob, dev->gnd_id,
+                               dev->gnd_map_nphys, dev->gnd_map_physnop,
+                               dev->gnd_map_nvirt, dev->gnd_map_virtnob);
+                       RETURN(rrc == GNI_RC_ERROR_RESOURCE ? -ENOMEM : -EINVAL);
+               }
+
+               tx->tx_buftype = GNILND_BUF_VIRT_MAPPED;
+               kgnilnd_mem_add_map_list(dev, tx);
+               if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK ||
+                   tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) {
+                       atomic64_add(tx->tx_nob, &dev->gnd_rdmaq_bytes_out);
+                       GNIDBG_TX(D_NETTRACE, tx, "rdma ++ %d to %ld\n",
+                              tx->tx_nob, atomic64_read(&dev->gnd_rdmaq_bytes_out));
+               }
+
+               return 0;
+       }
+}
+
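+/* Record the tx's map key on the conn's MDD purgatory list (gnc_mdd_list);
+ * the conn must already be in purgatory or we could never recover the MDD. */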
+void
+kgnilnd_add_purgatory_tx(kgn_tx_t *tx)
+{
+       kgn_conn_t                  *conn = tx->tx_conn;
+       kgn_mdd_purgatory_t         *gmp;
+
+       LIBCFS_ALLOC(gmp, sizeof(*gmp));
+       LASSERTF(gmp != NULL, "couldn't allocate MDD purgatory member;"
+               " asserting to avoid data corruption\n");
+
+       gmp->gmp_map_key = tx->tx_map_key;
+       atomic_inc(&conn->gnc_device->gnd_n_mdd_held);
+
+       /* ensure that we don't have a blank purgatory - that would mean the
+        * conn is not on the purgatory lists and we'd never recover these
+        * MDD */
+       GNITX_ASSERTF(tx, conn->gnc_in_purgatory,
+               "conn 0x%p->%s with NULL purgatory",
+               conn, libcfs_nid2str(conn->gnc_peer->gnp_nid));
+
+       /* link 'er up! - only place we really need to lock for
+        * concurrent access */
+       spin_lock(&conn->gnc_list_lock);
+       list_add_tail(&gmp->gmp_list, &conn->gnc_mdd_list);
+       spin_unlock(&conn->gnc_list_lock);
+}
+
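+/* Undo any buffer mapping for this tx: unmap kiov immediates and deregister
+ * RDMA registrations. If the conn is being torn down without verified peer
+ * notification, the map key is first parked on the MDD purgatory list and
+ * kgni is handed a long deadman hold timeout for the deregistration. */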
+void
+kgnilnd_unmap_buffer(kgn_tx_t *tx, int error)
+{
+       kgn_device_t     *dev;
+       gni_return_t      rrc;
+       int               hold_timeout = 0;
+
+       /* code below relies on +1 relationship ... */
+       CLASSERT(GNILND_BUF_PHYS_MAPPED == (GNILND_BUF_PHYS_UNMAPPED + 1));
+       CLASSERT(GNILND_BUF_VIRT_MAPPED == (GNILND_BUF_VIRT_UNMAPPED + 1));
+
+       switch (tx->tx_buftype) {
+       default:
+               LBUG();
+
+       case GNILND_BUF_NONE:
+       case GNILND_BUF_IMMEDIATE:
+       case GNILND_BUF_PHYS_UNMAPPED:
+       case GNILND_BUF_VIRT_UNMAPPED:
+               break;
+       case GNILND_BUF_IMMEDIATE_KIOV:
+               if (tx->tx_phys != NULL) {
+                       vunmap(tx->tx_phys);
+               } else if (tx->tx_buffer != NULL) {
+                       kunmap(tx->tx_imm_pages[0]);
+               }
+               /* clear to prevent kgnilnd_free_tx from thinking
+                * this is an RDMA descriptor */
+               tx->tx_phys = NULL;
+               break;
+
+       case GNILND_BUF_PHYS_MAPPED:
+       case GNILND_BUF_VIRT_MAPPED:
+               LASSERT(tx->tx_conn != NULL);
+
+               dev = tx->tx_conn->gnc_device;
+
+               /* only want to hold if we are closing conn without
+                * verified peer notification - the theory is that
+                * a TX error can be communicated in all other cases */
+               if (tx->tx_conn->gnc_state != GNILND_CONN_ESTABLISHED &&
+                   kgnilnd_check_purgatory_conn(tx->tx_conn)) {
+                       kgnilnd_add_purgatory_tx(tx);
+
+                       /* The timeout we give to kgni is a deadman stop only.
+                        * We set it high to ensure the kgni timer doesn't
+                        * fire before ours fires _and_ is handled */
+                       hold_timeout = GNILND_TIMEOUT2DEADMAN;
+
+                       GNIDBG_TX(D_NET, tx,
+                                "dev %p delaying MDD release for %dms key "LPX64"."LPX64"",
+                                tx->tx_conn->gnc_device, hold_timeout,
+                                tx->tx_map_key.qword1, tx->tx_map_key.qword2);
+               }
+
+               rrc = kgnilnd_mem_deregister(dev->gnd_handle, &tx->tx_map_key, hold_timeout);
+
+               LASSERTF(rrc == GNI_RC_SUCCESS, "rrc %d\n", rrc);
+
+               tx->tx_buftype--;
+               kgnilnd_mem_del_map_list(dev, tx);
+               break;
+       }
+}
+
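+/* Final disposition of a tx: unmap its buffer, release its message id on the
+ * conn, free the tx and finalize the lnet message(s) with the completion
+ * status; for a GET the REQUEST is finalized with success and only the REPLY
+ * carries the error. */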
+void
+kgnilnd_tx_done(kgn_tx_t *tx, int completion)
+{
+       lnet_msg_t      *lntmsg0, *lntmsg1;
+       int             status0, status1;
+       lnet_ni_t       *ni = NULL;
+       kgn_conn_t      *conn = tx->tx_conn;
+
+       LASSERT(!in_interrupt());
+
+       lntmsg0 = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
+       lntmsg1 = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
+
+       if (completion &&
+           !(tx->tx_state & GNILND_TX_QUIET_ERROR) &&
+           !kgnilnd_conn_clean_errno(completion)) {
+               GNIDBG_TOMSG(D_NETERROR, &tx->tx_msg,
+                      "error %d on tx 0x%p->%s id %u/%d state %s age %ds",
+                      completion, tx, conn ?
+                      libcfs_nid2str(conn->gnc_peer->gnp_nid) : "<?>",
+                      tx->tx_id.txe_smsg_id, tx->tx_id.txe_idx,
+                      kgnilnd_tx_state2str(tx->tx_list_state),
+                      cfs_duration_sec((long)jiffies - tx->tx_qtime));
+       }
+
+       /* The error codes determine if we hold onto the MDD */
+       kgnilnd_unmap_buffer(tx, completion);
+
+       /* we have to deliver a reply on lntmsg[1] for the GET, so make sure
+        * we play nice with the error codes to avoid delivering a failed
+        * REQUEST and then a REPLY event as well */
+
+       /* return -EIO to lnet - it is the magic value for failed sends */
+       if (tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) {
+               status0 = 0;
+               status1 = completion;
+       } else {
+               status0 = status1 = completion;
+       }
+
+       tx->tx_buftype = GNILND_BUF_NONE;
+       tx->tx_msg.gnm_type = GNILND_MSG_NONE;
+
+       /* lnet_finalize doesn't do anything with the *ni, so it is ok for us
+        * to leave it NULL when the tx has no conn */
+       if (conn != NULL) {
+               ni = conn->gnc_peer->gnp_net->gnn_ni;
+
+               spin_lock(&conn->gnc_tx_lock);
+
+               LASSERTF(test_and_clear_bit(tx->tx_id.txe_idx,
+                       (volatile unsigned long *)&conn->gnc_tx_bits),
+                       "conn %p tx %p bit %d already cleared\n",
+                       conn, tx, tx->tx_id.txe_idx);
+
+               LASSERTF(conn->gnc_tx_ref_table[tx->tx_id.txe_idx] != NULL,
+                        "msg_id %d already NULL\n", tx->tx_id.txe_idx);
+
+               conn->gnc_tx_ref_table[tx->tx_id.txe_idx] = NULL;
+               spin_unlock(&conn->gnc_tx_lock);
+       }
+
+       kgnilnd_free_tx(tx);
+
+       /* finalize AFTER freeing lnet msgs */
+
+       /* warning - we should hold no locks here - calling lnet_finalize
+        * could free up lnet credits, resulting in a call chain back into
+        * the LND via kgnilnd_send and friends */
+       lnet_finalize(ni, lntmsg0, status0);
+
+       if (lntmsg1 != NULL) {
+               lnet_finalize(ni, lntmsg1, status1);
+       }
+}
+
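+/* Complete every tx on the list with the same error; only the first tx logs
+ * it, the rest are marked GNILND_TX_QUIET_ERROR. */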
+void
+kgnilnd_txlist_done(struct list_head *txlist, int error)
+{
+       kgn_tx_t        *tx, *txn;
+       int              err_printed = 0;
+
+       if (list_empty(txlist))
+               return;
+
+       list_for_each_entry_safe(tx, txn, txlist, tx_list) {
+               /* only print the first error */
+               if (err_printed)
+                       tx->tx_state |= GNILND_TX_QUIET_ERROR;
+               list_del_init(&tx->tx_list);
+               kgnilnd_tx_done(tx, error);
+               err_printed++;
+       }
+}
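+
+/* Allocate a free message id for the tx from the conn's gnc_tx_bits bitmap
+ * (id zero is reserved), record the tx in gnc_tx_ref_table and stamp it with
+ * a jiffies/tx_seq cookie to help detect TX reuse. */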
+int
+kgnilnd_set_tx_id(kgn_tx_t *tx, kgn_conn_t *conn)
+{
+       int     id;
+
+       spin_lock(&conn->gnc_tx_lock);
+
+       /* ID zero is NOT ALLOWED!!! */
+
+search_again:
+       id = find_next_zero_bit((unsigned long *)&conn->gnc_tx_bits,
+                                GNILND_MAX_MSG_ID, conn->gnc_next_tx);
+       if (id == GNILND_MAX_MSG_ID) {
+               if (conn->gnc_next_tx != 1) {
+                       /* we only searched from next_tx to end and didn't find
+                        * one, so search again from start */
+                       conn->gnc_next_tx = 1;
+                       goto search_again;
+               }
+               /* couldn't find one! */
+               spin_unlock(&conn->gnc_tx_lock);
+               return -E2BIG;
+       }
+
+       /* bump next_tx to prevent immediate reuse */
+       conn->gnc_next_tx = id + 1;
+
+       set_bit(id, (volatile unsigned long *)&conn->gnc_tx_bits);
+       LASSERTF(conn->gnc_tx_ref_table[id] == NULL,
+                "tx 0x%p already at id %d\n",
+                conn->gnc_tx_ref_table[id], id);
+
+       /* delay these until we have a valid ID - prevents bad clear of the bit
+        * in kgnilnd_tx_done */
+       tx->tx_conn = conn;
+       tx->tx_id.txe_cqid = conn->gnc_cqid;
+
+       tx->tx_id.txe_idx = id;
+       conn->gnc_tx_ref_table[id] = tx;
+
+       /* Using jiffies to help differentiate against TX reuse - with
+        * the usual minimum of a 250HZ clock, we wrap jiffies on the same TX
+        * if we are sending to the same node faster than 256000/sec.
+        * To help guard against this, we OR in the tx_seq - that is 32 bits */
+
+       tx->tx_id.txe_chips = (__u32)(jiffies | conn->gnc_tx_seq);
+
+       GNIDBG_TX(D_NET, tx, "set cookie/id/bits", NULL);
+
+       spin_unlock(&conn->gnc_tx_lock);
+       return 0;
+}
+
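+/* Called when a tx could not get SMSG credits: bump the retransmit counters,
+ * log at escalating verbosity, and flag a software bug once the count exceeds
+ * the kgn_max_retransmits tunable. */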
+static inline int
+kgnilnd_tx_should_retry(kgn_conn_t *conn, kgn_tx_t *tx)
+{
+       int             max_retrans = *kgnilnd_tunables.kgn_max_retransmits;
+       int             log_retrans;
+       int             log_retrans_level;
+
+       /* I need kgni credits to send this.  Replace tx at the head of the
+        * fmaq and I'll get rescheduled when credits appear */
+       tx->tx_state = 0;
+       tx->tx_retrans++;
+       conn->gnc_tx_retrans++;
+       log_retrans = ((tx->tx_retrans < 25) || ((tx->tx_retrans % 25) == 0) ||
+                       (tx->tx_retrans > (max_retrans / 2)));
+       log_retrans_level = tx->tx_retrans < (max_retrans / 2) ? D_NET : D_NETERROR;
+
+       /* Decision time - either error, warn or just retransmit */
+
+       /* we don't care about TX timeout - it could be that the network is slower
+        * or throttled. We'll keep retransmitting - so if the network is so slow
+        * that we fill up our mailbox, we'll keep trying to resend that msg
+        * until we exceed the max_retrans _or_ gnc_last_rx expires, indicating
+        * that the peer hasn't sent us any traffic in return */
+
+       if (tx->tx_retrans > max_retrans) {
+               /* this means we are not backing off the retransmits
+                * in a healthy manner and are likely chewing up the
+                * CPU cycles quite badly */
+               GNIDBG_TOMSG(D_ERROR, &tx->tx_msg,
+                       "SOFTWARE BUG: too many retransmits (%d) for tx id %x "
+                       "conn 0x%p->%s\n",
+                       tx->tx_retrans, tx->tx_id.txe_smsg_id, conn,
+                       libcfs_nid2str(conn->gnc_peer->gnp_nid));
+
+               /* yes - double errors to help debug this condition */